blib/lib/Search/Lemur.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 90 | 124 | 72.5 |
branch | 23 | 32 | 71.8 |
condition | n/a | ||
subroutine | 15 | 19 | 78.9 |
pod | 6 | 6 | 100.0 |
total | 134 | 181 | 74.0 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Search::Lemur; | ||||||
2 | |||||||
3 | 5 | 5 | 159487 | use warnings; | |||
5 | 13 | ||||||
5 | 169 | ||||||
4 | 5 | 5 | 27 | use strict; | |||
5 | 10 | ||||||
5 | 166 | ||||||
5 | 5 | 5 | 27 | use Carp qw( carp ); | |||
5 | 14 | ||||||
5 | 263 | ||||||
6 | |||||||
7 | 5 | 5 | 2782 | use Search::Lemur::Result; | |||
5 | 12 | ||||||
5 | 148 | ||||||
8 | 5 | 5 | 2513 | use Search::Lemur::ResultItem; | |||
5 | 13 | ||||||
5 | 131 | ||||||
9 | 5 | 5 | 2645 | use Search::Lemur::Database; | |||
5 | 15 | ||||||
5 | 132 | ||||||
10 | |||||||
11 | 5 | 5 | 4573 | use LWP; | |||
5 | 308759 | ||||||
5 | 217 | ||||||
12 | 5 | 5 | 69 | use Data::Dumper; | |||
5 | 10 | ||||||
5 | 415 | ||||||
13 | |||||||
14 | 5 | 5 | 50 | use vars qw( $VERSION ); | |||
5 | 8 | ||||||
5 | 7378 | ||||||
15 | |||||||
16 | =head1 NAME | ||||||
17 | |||||||
18 | Lemur - class to query a Lemur server, and parse the results | ||||||
19 | |||||||
20 | =head1 VERSION | ||||||
21 | |||||||
22 | Version 1.00 | ||||||
23 | |||||||
24 | =cut | ||||||
25 | |||||||
26 | our $VERSION = '1.00'; | ||||||
27 | |||||||
28 | =head1 SYNOPSYS | ||||||
29 | |||||||
30 | use Search::Lemur; | ||||||
31 | |||||||
32 | my $lem = Search::Lemur->new("http://url/to/lemur.cgi"); | ||||||
33 | |||||||
34 | # run some queries, and get back an array of results | ||||||
35 | # a query with a single term: | ||||||
36 | my @results1 = $lem->query("encryption"); | ||||||
37 | # a query with two terms: | ||||||
38 | my @results2 = $lem->query("encryption MD5"); | ||||||
39 | |||||||
40 | # get corpus term frequency of 'MD5': | ||||||
41 | my $md5ctf = $results2[1]->ctf(); | ||||||
42 | |||||||
43 | =head1 DESCRIPTION | ||||||
44 | |||||||
45 | This module will make it easy to interact with a Lemur | ||||||
46 | Toolkit for Language Modeling and Information Retrieval | ||||||
47 | server for information retreival exercises. For more | ||||||
48 | information on Lemur, see L |
||||||
49 | |||||||
50 | This module takes care of all parsing of responses from | ||||||
51 | the server. You can just pass a query as a | ||||||
52 | space-separated list of terms, and the module will give | ||||||
53 | you back an array of C |
||||||
54 | |||||||
55 | =cut | ||||||
56 | |||||||
57 | |||||||
58 | =head2 Main Methods | ||||||
59 | |||||||
60 | =over 2 | ||||||
61 | |||||||
62 | =item new($url) | ||||||
63 | |||||||
64 | Create a new Lemur object, connecting to the given Lemur server. | ||||||
65 | The C<$url> should be a full URL, ending in something like 'lemur.cgi'. | ||||||
66 | |||||||
67 | =cut | ||||||
68 | |||||||
69 | sub new { | ||||||
70 | 4 | 4 | 1 | 949 | my $class = shift; | ||
71 | 4 | 9 | my $url; | ||||
72 | 4 | 100 | 65 | if (@_) { $url = shift; | |||
3 | 10 | ||||||
73 | 1 | 4 | } else { return undef; } | ||||
74 | 3 | 20 | my $self = { baseurl => $url, | ||||
75 | db => 0, | ||||||
76 | n => undef, | ||||||
77 | fullurl => undef }; | ||||||
78 | 3 | 12 | bless $self, $class; | ||||
79 | 3 | 14 | $self->{fullurl} = $self->_makeurl(); | ||||
80 | 3 | 10 | return $self; | ||||
81 | } | ||||||
82 | |||||||
83 | =item url() | ||||||
84 | |||||||
85 | Return the URL of the Lemur server | ||||||
86 | |||||||
87 | =cut | ||||||
88 | |||||||
89 | sub url { | ||||||
90 | 7 | 7 | 1 | 9 | my $self = shift; | ||
91 | 7 | 37 | return $self->{baseurl}; | ||||
92 | } | ||||||
93 | |||||||
94 | =item listdb() | ||||||
95 | |||||||
96 | Get some information about the databases available | ||||||
97 | |||||||
98 | Returns an array of Lemur::Database objects. | ||||||
99 | |||||||
100 | =cut | ||||||
101 | |||||||
102 | sub listdb { | ||||||
103 | 0 | 0 | 1 | 0 | my $self = shift; | ||
104 | 0 | 0 | $self->_makeurl(); | ||||
105 | 0 | 0 | my $url = $self->{fullurl} . "&d=?"; | ||||
106 | 0 | 0 | my $result = $self->_strip($url); | ||||
107 | 0 | 0 | return $self->_makedbs($result); | ||||
108 | } | ||||||
109 | |||||||
110 | =item d([num]) | ||||||
111 | |||||||
112 | Set the database number to query. This will specify the | ||||||
113 | database number instead of just using the default databse 0. | ||||||
114 | |||||||
115 | If the C |
||||||
116 | |||||||
117 | =cut | ||||||
118 | |||||||
119 | sub d { | ||||||
120 | 1 | 1 | 1 | 3 | my $self = shift; | ||
121 | 1 | 50 | 4 | if (@_) { $self->{d} = shift; $self->_makeurl(); } | |||
1 | 2 | ||||||
1 | 4 | ||||||
122 | 1 | 3 | return $self->{d}; | ||||
123 | } | ||||||
124 | |||||||
125 | |||||||
126 | |||||||
127 | =item v(string) | ||||||
128 | |||||||
129 | Make a query to the Lemur server. The query should be a space-delimited | ||||||
130 | list of query terms. If the URL is has not been specified, this will die. | ||||||
131 | |||||||
132 | Be sure there is only one space between words, or something unexpected may | ||||||
133 | happen. | ||||||
134 | |||||||
135 | Returns an array of results (See L |
||||||
136 | be a result for each query term. | ||||||
137 | |||||||
138 | =cut | ||||||
139 | |||||||
140 | # This method really just queries the server, and passes the response on to | ||||||
141 | # &_parse(string). This was done to make testing easier, without having to | ||||||
142 | # query a real server for testing. | ||||||
143 | sub v { | ||||||
144 | 0 | 0 | 1 | 0 | my $self = shift; | ||
145 | 0 | 0 | my $query = shift; | ||||
146 | 0 | 0 | $query =~ s/ +/ /g; | ||||
147 | 0 | 0 | 0 | croak("Something went wrong; I have no URL") unless $self->{baseurl}; | |||
148 | 0 | 0 | my @terms = split(/ +/, $query); | ||||
149 | 0 | 0 | my $url = $self->{fullurl}; | ||||
150 | 0 | 0 | foreach my $term (@terms) { | ||||
151 | 0 | 0 | $url = "$url&v=$term"; | ||||
152 | } | ||||||
153 | 0 | 0 | return $self->_parse([$query, $self->_strip($url)]); | ||||
154 | } | ||||||
155 | |||||||
156 | =item m(string) | ||||||
157 | |||||||
158 | Returns the lexicalized (stopped & stemmed) version of the given | ||||||
159 | word. This is affected by weather or not the current database | ||||||
160 | is stemmed and/or stopworded. Basically, this is the real word | ||||||
161 | you will end up searching for. | ||||||
162 | |||||||
163 | Returns a string. | ||||||
164 | |||||||
165 | =cut | ||||||
166 | |||||||
167 | sub m { | ||||||
168 | 0 | 0 | 1 | 0 | my $self = shift; | ||
169 | 0 | 0 | my $word = shift; | ||||
170 | 0 | 0 | my $url = $self->{fullurl} . "&m=$word"; | ||||
171 | 0 | 0 | my $return = $self->_strip($url); | ||||
172 | 0 | 0 | 0 | if ($return eq "[OOV]") { $return = ""; } | |||
0 | 0 | ||||||
173 | 0 | 0 | return $return; | ||||
174 | } | ||||||
175 | |||||||
176 | # parse information about available databases into an array of | ||||||
177 | # Search::Lemur::Database objects | ||||||
178 | # | ||||||
179 | # string -> arrayref | ||||||
180 | sub _makedbs { | ||||||
181 | 1 | 1 | 11 | my $self = shift; | |||
182 | 1 | 2 | my $input = shift; | ||||
183 | 1 | 11 | my @input = split(/\n/, $input); | ||||
184 | 1 | 2 | my @return; | ||||
185 | 1 | 1 | my ($num, $title, $stop, $stem, $numdocs, | ||||
186 | $numterms, $numuniq, $avgdoclen); | ||||||
187 | 1 | 4 | while (scalar(@input) >= 1){ | ||||
188 | 24 | 24 | my $line = shift(@input); | ||||
189 | 24 | 100 | 129 | if ($line =~ m/(\d*): ([\w|\d|\s]*) (NOSTOP|STOP) (NOSTEMM|STEMM);/){ | |||
100 | |||||||
100 | |||||||
100 | |||||||
100 | |||||||
50 | |||||||
190 | 4 | 7 | $num = $1; | ||||
191 | 4 | 7 | $title = $2; | ||||
192 | 4 | 100 | 9 | $stop = ($3 eq "STOP") ? 1 : 0; | |||
193 | 4 | 100 | 13 | $stem = ($4 eq "STEMM") ? 1 : 0; | |||
194 | } elsif ($line =~ m/ NUM_DOCS = ?(\d*);/){ | ||||||
195 | 4 | 9 | $numdocs = $1; | ||||
196 | } elsif ($line =~ m/ NUM_UNIQUE_TERMS = ?(\d*);/){ | ||||||
197 | 4 | 11 | $numuniq = $1; | ||||
198 | } elsif ($line =~ m/ NUM_TERMS = ?(\d*);/){ | ||||||
199 | 4 | 16 | $numterms = $1; | ||||
200 | } elsif ($line =~ m/ AVE_DOCLEN = ?(\d*);/){ | ||||||
201 | 4 | 12 | $avgdoclen = $1; | ||||
202 | } elsif ($line =~ m/ /){ |
||||||
203 | 4 | 16 | my $db = Search::Lemur::Database->_new($num, $title, $stop, | ||||
204 | $stem, $numdocs, $numterms, $numuniq, $avgdoclen); | ||||||
205 | 4 | 10 | push @return, $db; | ||||
206 | } | ||||||
207 | } | ||||||
208 | 1 | 4 | return \@return; | ||||
209 | } | ||||||
210 | |||||||
211 | # parse the result from the server | ||||||
212 | # | ||||||
213 | # Takes a reference to an array with two items: | ||||||
214 | # - a string containing the query terms, separated by spaces | ||||||
215 | # - a string containing the response | ||||||
216 | # | ||||||
217 | # returns array of results | ||||||
218 | sub _parse { | ||||||
219 | 2 | 2 | 459 | my $self = shift; | |||
220 | 2 | 4 | my $inputref = shift; | ||||
221 | 2 | 6 | my @input = @$inputref; | ||||
222 | 2 | 6 | my @terms = split(/ /, $input[0]); | ||||
223 | # print Dumper($input[1]); | ||||||
224 | 2 | 17 | my @response = split(/\D+/, $input[1]); | ||||
225 | 2 | 50 | 11 | shift(@response) if ($response[0] eq ""); #TODO Why am I doing this? this makes tests fail. | |||
226 | 2 | 5 | my $numterms = scalar(@terms); | ||||
227 | |||||||
228 | 2 | 3 | my @return; | ||||
229 | |||||||
230 | # build a result object for each term | ||||||
231 | 2 | 3 | foreach my $term (@terms) { | ||||
232 | # print Dumper(@response); | ||||||
233 | 2 | 6 | my $ctf = shift(@response); | ||||
234 | 2 | 3 | my $df = shift(@response); | ||||
235 | 2 | 16 | my $result = Search::Lemur::Result->_new($term, $ctf, $df); | ||||
236 | # build a resultItem object for each document | ||||||
237 | 2 | 10 | for (my $i = 0; $i < $df; $i++){ | ||||
238 | 3 | 5 | my $docid = shift(@response); | ||||
239 | 3 | 6 | my $doclen = shift(@response); | ||||
240 | 3 | 14 | my $tf = shift(@response); | ||||
241 | 3 | 16 | my $resultItem = Search::Lemur::ResultItem->_new($docid, $doclen, $tf); | ||||
242 | 3 | 11 | $result->_add($resultItem); | ||||
243 | } | ||||||
244 | 2 | 6 | push(@return, $result); | ||||
245 | } | ||||||
246 | |||||||
247 | 2 | 11 | return \@return; | ||||
248 | } | ||||||
249 | |||||||
250 | # build the full url to use for all queries | ||||||
251 | # This url consists of the base url (ending in lemur.cgi) plus | ||||||
252 | # d=n (specifies the database) and n=x (the number of results | ||||||
253 | # to return. If either of these are undef, then they are left | ||||||
254 | # off, and the server is free to use its defaults | ||||||
255 | # | ||||||
256 | # the n value seems to only affect the q= query, and not the | ||||||
257 | # inverted list v= query. | ||||||
258 | # | ||||||
259 | # returns a string, and updates the fullurl instance variable | ||||||
260 | sub _makeurl { | ||||||
261 | 7 | 7 | 21 | my $self = shift; | |||
262 | 7 | 22 | my $return = $self->url() . "?g=p"; | ||||
263 | 7 | 100 | 35 | if ($self->{d}) { $return = $return . "&d=$self->{d}"; } | |||
3 | 6 | ||||||
264 | 7 | 100 | 23 | if ($self->{n}) { $return = $return . "&n=$self->{n}"; } | |||
1 | 4 | ||||||
265 | 7 | 17 | $self->{fullurl} = $return; | ||||
266 | 7 | 24 | return $return; | ||||
267 | } | ||||||
268 | |||||||
269 | # strip_: make a request to the server, and strip out anything | ||||||
270 | # useless | ||||||
271 | # | ||||||
272 | # This will get the result from the server, and strip put any | ||||||
273 | # html, etc that is not useful to the parser. | ||||||
274 | # | ||||||
275 | # string -> string | ||||||
276 | # | ||||||
277 | # takes in a url argument to fetch, and returns the stripped | ||||||
278 | # result. | ||||||
279 | sub _strip { | ||||||
280 | 0 | 0 | my $self = shift; | ||||
281 | 0 | my $url = shift; | |||||
282 | # print "$url\n\n"; | ||||||
283 | 0 | my $ua = LWP::UserAgent->new; | |||||
284 | 0 | $ua->agent("Lemur.pm/$VERSION"); | |||||
285 | 0 | my $req = HTTP::Request->new(GET => $url); | |||||
286 | 0 | $req->content_type('application/x-www-form-urlencoded'); | |||||
287 | 0 | $req->content('query=libwww-perl&mode=dist'); | |||||
288 | # make request | ||||||
289 | 0 | my $res = $ua->request($req); | |||||
290 | |||||||
291 | 0 | 0 | if ($res->is_success) { | ||||
292 | 0 | $res->content() =~ m/.*\n\n((\s|\d|\n|\w|\[|\]|:|;|=|<|>)*?)\n /; |
|||||
293 | # print $1 . "\n\n"; | ||||||
294 | 0 | return $1; | |||||
295 | } | ||||||
296 | else { | ||||||
297 | 0 | Carp::carp($res->status_line, "\n"); | |||||
298 | 0 | return undef; | |||||
299 | } | ||||||
300 | } | ||||||
301 | |||||||
302 | |||||||
303 | |||||||
304 | |||||||
305 | =back | ||||||
306 | |||||||
307 | =head1 AUTHOR | ||||||
308 | |||||||
309 | Patrick Kaeding, C<< |
||||||
310 | |||||||
311 | =head1 BUGS | ||||||
312 | |||||||
313 | Please report any bugs or feature requests to | ||||||
314 | C |
||||||
315 | L |
||||||
316 | I will be notified, and then you'll automatically be notified of progress on | ||||||
317 | your bug as I make changes. | ||||||
318 | |||||||
319 | =head1 SUPPORT | ||||||
320 | |||||||
321 | You can find documentation for this module with the perldoc command. | ||||||
322 | |||||||
323 | perldoc Search::Lemur | ||||||
324 | |||||||
325 | You can also look for information at: | ||||||
326 | |||||||
327 | =over 4 | ||||||
328 | |||||||
329 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
330 | |||||||
331 | L |
||||||
332 | |||||||
333 | =item * CPAN Ratings | ||||||
334 | |||||||
335 | L |
||||||
336 | |||||||
337 | =item * RT: CPAN's request tracker | ||||||
338 | |||||||
339 | L |
||||||
340 | |||||||
341 | =item * Search CPAN | ||||||
342 | |||||||
343 | L |
||||||
344 | |||||||
345 | =back | ||||||
346 | |||||||
347 | =head1 ACKNOWLEDGEMENTS | ||||||
348 | |||||||
349 | =head1 COPYRIGHT & LICENSE | ||||||
350 | |||||||
351 | Copyright 2007 Patrick Kaeding, all rights reserved. | ||||||
352 | |||||||
353 | This program is free software; you can redistribute it and/or modify it | ||||||
354 | under the same terms as Perl itself. | ||||||
355 | |||||||
356 | =cut | ||||||
357 | |||||||
358 | 1; # End of Search::Lemur |