| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Search::OpenSearch::Federated; |
|
2
|
1
|
|
|
1
|
|
77514
|
use Moo; |
|
|
1
|
|
|
|
|
67481
|
|
|
|
1
|
|
|
|
|
14
|
|
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
our $VERSION = '0.007'; |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
has 'debug' => ( is => 'rw' ); |
|
7
|
|
|
|
|
|
|
has 'fields' => ( |
|
8
|
|
|
|
|
|
|
is => 'rw', |
|
9
|
|
|
|
|
|
|
default => sub { [qw( title id author link summary tags modified )] } |
|
10
|
|
|
|
|
|
|
); |
|
11
|
|
|
|
|
|
|
has 'urls' => ( is => 'rw' ); |
|
12
|
|
|
|
|
|
|
has 'total' => ( is => 'rw' ); |
|
13
|
|
|
|
|
|
|
has 'facets' => ( is => 'rw' ); |
|
14
|
|
|
|
|
|
|
has 'subtotals' => ( is => 'rw' ); |
|
15
|
|
|
|
|
|
|
has 'timeout' => ( is => 'rw' ); |
|
16
|
|
|
|
|
|
|
has 'normalize_scores' => ( is => 'rw' ); |
|
17
|
|
|
|
|
|
|
has 'version' => ( is => 'rw', default => sub {$VERSION} ); |
|
18
|
|
|
|
|
|
|
|
|
19
|
1
|
|
|
1
|
|
3059
|
use Carp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
267
|
|
|
20
|
1
|
|
|
1
|
|
2888
|
use Data::Dump qw( dump ); |
|
|
1
|
|
|
|
|
30279
|
|
|
|
1
|
|
|
|
|
138
|
|
|
21
|
1
|
|
|
1
|
|
2765
|
use Parallel::Iterator qw( iterate_as_array ); |
|
|
1
|
|
|
|
|
75873
|
|
|
|
1
|
|
|
|
|
102
|
|
|
22
|
1
|
|
|
1
|
|
1936
|
use JSON; |
|
|
1
|
|
|
|
|
34704
|
|
|
|
1
|
|
|
|
|
6
|
|
|
23
|
1
|
|
|
1
|
|
4425
|
use LWP::UserAgent; |
|
|
1
|
|
|
|
|
139937
|
|
|
|
1
|
|
|
|
|
138
|
|
|
24
|
1
|
|
|
1
|
|
15
|
use Scalar::Util qw( blessed ); |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
195
|
|
|
25
|
1
|
|
|
1
|
|
5485
|
use Search::Tools::XML; |
|
|
1
|
|
|
|
|
189898
|
|
|
|
1
|
|
|
|
|
79
|
|
|
26
|
1
|
|
|
1
|
|
1607
|
use Data::Transformer; |
|
|
1
|
|
|
|
|
924
|
|
|
|
1
|
|
|
|
|
39
|
|
|
27
|
1
|
|
|
1
|
|
1070
|
use Normalize; |
|
|
1
|
|
|
|
|
5404
|
|
|
|
1
|
|
|
|
|
75
|
|
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# we do not use WWW::OpenSearch because we need to pull out |
|
30
|
|
|
|
|
|
|
# some non-standard data from the XML. |
|
31
|
|
|
|
|
|
|
# we do use XML::Feed to parse XML responses. |
|
32
|
1
|
|
|
1
|
|
931
|
use XML::Simple; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
use XML::Feed; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $OS_NS = 'http://a9.com/-/spec/opensearch/1.1/'; |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $XMLer = Search::Tools::XML->new(); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
my $XML_ESCAPER = Data::Transformer->new( |
|
40
|
|
|
|
|
|
|
normal => sub { local ($_) = shift; $$_ = $XMLer->escape($$_); } ); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub search { |
|
43
|
|
|
|
|
|
|
my $self = shift; |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
my $urls = $self->{urls} or croak "no urls defined"; |
|
46
|
|
|
|
|
|
|
my $num_urls = scalar @$urls; |
|
47
|
|
|
|
|
|
|
my @done = iterate_as_array( |
|
48
|
|
|
|
|
|
|
sub { |
|
49
|
|
|
|
|
|
|
$self->_fetch( $_[1] ); |
|
50
|
|
|
|
|
|
|
}, |
|
51
|
|
|
|
|
|
|
$urls, |
|
52
|
|
|
|
|
|
|
); |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
return $self->_aggregate( \@done ); |
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub _aggregate { |
|
58
|
|
|
|
|
|
|
my $self = shift; |
|
59
|
|
|
|
|
|
|
my $responses = shift; |
|
60
|
|
|
|
|
|
|
my $results = []; |
|
61
|
|
|
|
|
|
|
my $fields = $self->fields; |
|
62
|
|
|
|
|
|
|
my $total = 0; |
|
63
|
|
|
|
|
|
|
my %subtotals = (); |
|
64
|
|
|
|
|
|
|
my %facets = (); |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
RESP: for my $resp (@$responses) { |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
my $req_uri = $resp->request->uri; |
|
69
|
|
|
|
|
|
|
my $resp_status = $resp->code; |
|
70
|
|
|
|
|
|
|
$self->debug |
|
71
|
|
|
|
|
|
|
and warn |
|
72
|
|
|
|
|
|
|
sprintf( "response for %s = %s\n", $req_uri, $resp_status ); |
|
73
|
|
|
|
|
|
|
next RESP unless $resp_status =~ m/^2/; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# temporary buffer to allow for normalizing scores |
|
76
|
|
|
|
|
|
|
my @resp_results = (); |
|
77
|
|
|
|
|
|
|
my $highest_score = 0; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
if ( $resp->content_type eq 'application/json' ) { |
|
80
|
|
|
|
|
|
|
my $r = decode_json( $resp->content ); |
|
81
|
|
|
|
|
|
|
if ( $r->{results} ) { |
|
82
|
|
|
|
|
|
|
@resp_results = @{ $r->{results} }; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# must turn facets inside out in order |
|
86
|
|
|
|
|
|
|
# to aggregate counts correctly |
|
87
|
|
|
|
|
|
|
if ( $r->{facets} ) { |
|
88
|
|
|
|
|
|
|
for my $name ( keys %{ $r->{facets} } ) { |
|
89
|
|
|
|
|
|
|
for my $facet ( @{ $r->{facets}->{$name} } ) { |
|
90
|
|
|
|
|
|
|
$facets{$name}->{ $facet->{term} } += $facet->{count}; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
$total += $r->{total} || 0; |
|
95
|
|
|
|
|
|
|
$subtotals{$req_uri} = $r->{total}; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
elsif ( $resp->content_type eq 'application/xml' ) { |
|
98
|
|
|
|
|
|
|
my $xml = $resp->content; |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
#warn $xml; |
|
101
|
|
|
|
|
|
|
my $feed = XML::Feed->parse( \$xml ); |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
if ( !$feed ) { |
|
104
|
|
|
|
|
|
|
warn XML::Feed->errstr; |
|
105
|
|
|
|
|
|
|
next RESP; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
#dump $feed; |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# |
|
111
|
|
|
|
|
|
|
# we must re-escape the XML content since the feed parser |
|
112
|
|
|
|
|
|
|
# and XML::Simple will escape values automatically |
|
113
|
|
|
|
|
|
|
# |
|
114
|
|
|
|
|
|
|
my @entries; |
|
115
|
|
|
|
|
|
|
for my $item ( $feed->entries ) { |
|
116
|
|
|
|
|
|
|
my $e = {}; |
|
117
|
|
|
|
|
|
|
for my $f (@$fields) { |
|
118
|
|
|
|
|
|
|
$e->{$f} = $item->$f; |
|
119
|
|
|
|
|
|
|
if ( blessed( $e->{$f} ) ) { |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
#dump( $e->{$f} ); |
|
122
|
|
|
|
|
|
|
if ( $e->{$f}->isa('XML::Feed::Content') ) { |
|
123
|
|
|
|
|
|
|
$e->{$f} = $XMLer->escape( $e->{$f}->body ); |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
elsif ( $e->{$f}->isa('DateTime') ) { |
|
126
|
|
|
|
|
|
|
$e->{$f} = $e->{$f}->epoch; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
else { |
|
130
|
|
|
|
|
|
|
$e->{$f} = $XMLer->escape( $e->{$f} ); |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
#dump $e; |
|
135
|
|
|
|
|
|
|
my $content = $item->content; |
|
136
|
|
|
|
|
|
|
my $fields = XMLin( $content->body, NoAttr => 1 ); |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
#dump $fields; |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
for my $f ( keys %$fields ) { |
|
141
|
|
|
|
|
|
|
$e->{$f} = $fields->{$f}; |
|
142
|
|
|
|
|
|
|
if ( ref $e->{$f} ) { |
|
143
|
|
|
|
|
|
|
$XML_ESCAPER->traverse( $e->{$f} ); |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
else { |
|
146
|
|
|
|
|
|
|
$e->{$f} = $XMLer->escape( $e->{$f} ); |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# massage some field names |
|
151
|
|
|
|
|
|
|
$e->{mtime} = delete $e->{modified}; |
|
152
|
|
|
|
|
|
|
$e->{uri} = delete $e->{id}; |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
#dump $content; |
|
155
|
|
|
|
|
|
|
#dump $e; |
|
156
|
|
|
|
|
|
|
push @entries, $e; |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# facets require digging into the raw xml |
|
161
|
|
|
|
|
|
|
my $xml_feed = XMLin( $feed->as_xml, NoAttr => 1 ); |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
#dump($xml_feed); |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# must turn facets inside out in order |
|
166
|
|
|
|
|
|
|
# to aggregate counts correctly |
|
167
|
|
|
|
|
|
|
if ( $xml_feed->{category}->{sos}->{facets} ) { |
|
168
|
|
|
|
|
|
|
my $facet_feed = $xml_feed->{category}->{sos}->{facets}; |
|
169
|
|
|
|
|
|
|
for my $name ( keys %$facet_feed ) { |
|
170
|
|
|
|
|
|
|
if ( ref $facet_feed->{$name}->{$name} eq 'ARRAY' ) { |
|
171
|
|
|
|
|
|
|
for my $facet ( @{ $facet_feed->{$name}->{$name} } ) { |
|
172
|
|
|
|
|
|
|
$facets{$name}->{ $facet->{term} } |
|
173
|
|
|
|
|
|
|
+= $facet->{count}; |
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
elsif ( ref $facet_feed->{$name}->{$name} eq 'HASH' ) { |
|
177
|
|
|
|
|
|
|
my $facet = $facet_feed->{$name}->{$name}; |
|
178
|
|
|
|
|
|
|
$facets{$name}->{ $facet->{term} } = $facet->{count}; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
my $atom = $feed->{atom}; |
|
185
|
|
|
|
|
|
|
my $this_total = $atom->get( $OS_NS, 'totalResults' ); |
|
186
|
|
|
|
|
|
|
$total += $this_total; |
|
187
|
|
|
|
|
|
|
$subtotals{$req_uri} = $this_total; |
|
188
|
|
|
|
|
|
|
push @resp_results, @entries; |
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
else { |
|
191
|
|
|
|
|
|
|
croak sprintf( "Unsupported response type '%s' for %s\n", |
|
192
|
|
|
|
|
|
|
scalar $resp->content_type, $req_uri ); |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# normalize scores |
|
196
|
|
|
|
|
|
|
if ( $self->normalize_scores ) { |
|
197
|
|
|
|
|
|
|
my $normalizer = Normalize->new( 'round_to' => 0.001 ); |
|
198
|
|
|
|
|
|
|
my %normalized = (); |
|
199
|
|
|
|
|
|
|
my $i = 0; |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# compute |
|
202
|
|
|
|
|
|
|
for my $r (@resp_results) { |
|
203
|
|
|
|
|
|
|
$normalized{ $i++ } = $r->{score}; |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
$normalizer->normalize_to_max( \%normalized ); |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# apply |
|
208
|
|
|
|
|
|
|
for my $idx ( keys %normalized ) { |
|
209
|
|
|
|
|
|
|
$resp_results[$idx]->{score} = ( $normalized{$idx} * 1000 ); |
|
210
|
|
|
|
|
|
|
} |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# aggregate |
|
214
|
|
|
|
|
|
|
push @$results, @resp_results; |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
} |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# transform facets back into arrays of count/term pairs |
|
219
|
|
|
|
|
|
|
my %facets_norm; |
|
220
|
|
|
|
|
|
|
for my $name ( keys %facets ) { |
|
221
|
|
|
|
|
|
|
my @diads = (); |
|
222
|
|
|
|
|
|
|
for my $term ( keys %{ $facets{$name} } ) { |
|
223
|
|
|
|
|
|
|
push @diads, { term => $term, count => $facets{$name}->{$term} }; |
|
224
|
|
|
|
|
|
|
} |
|
225
|
|
|
|
|
|
|
$facets_norm{$name} = [@diads]; |
|
226
|
|
|
|
|
|
|
} |
|
227
|
|
|
|
|
|
|
$self->{facets} = \%facets_norm; |
|
228
|
|
|
|
|
|
|
$self->{total} = $total; |
|
229
|
|
|
|
|
|
|
$self->{subtotals} = \%subtotals; |
|
230
|
|
|
|
|
|
|
return [ sort { $b->{score} <=> $a->{score} } @$results ]; |
|
231
|
|
|
|
|
|
|
} |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
sub _fetch { |
|
234
|
|
|
|
|
|
|
my $self = shift; |
|
235
|
|
|
|
|
|
|
my $url = shift or croak "url required"; |
|
236
|
|
|
|
|
|
|
my $ua = LWP::UserAgent->new(); |
|
237
|
|
|
|
|
|
|
$ua->agent( 'sos-fedsearch ' . $VERSION ); |
|
238
|
|
|
|
|
|
|
$ua->timeout( $self->{timeout} ) if $self->{timeout}; |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
my $response = $ua->get($url); |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
$self->debug and warn "got response for $url: " . $response->status_line; |
|
243
|
|
|
|
|
|
|
return $response; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
1; |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
__END__ |