line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $File: //depot/OurNet-Query/Query.pm $ $Author: autrijus $ |
2
|
|
|
|
|
|
|
# $Revision: #4 $ $Change: 1925 $ $DateTime: 2001/09/28 15:12:40 $ |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package OurNet::Query; |
5
|
|
|
|
|
|
|
require 5.005; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
$OurNet::Query::VERSION = '1.56'; |
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
6938
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
33
|
|
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
497
|
use OurNet::Site; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
27
|
|
12
|
1
|
|
|
1
|
|
866
|
use HTTP::Request::Common; |
|
1
|
|
|
|
|
50573
|
|
|
1
|
|
|
|
|
115
|
|
13
|
1
|
|
|
1
|
|
1791
|
use LWP::Parallel::UserAgent; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 NAME |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
OurNet::Query - Scriptable queries with template extraction |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
use OurNet::Query; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Set query parameters |
24
|
|
|
|
|
|
|
my ($query, $hits) = ('autrijus', 10); |
25
|
|
|
|
|
|
|
my @sites = ('google', 'google'); # XXX: write more templates! |
26
|
|
|
|
|
|
|
my %found; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Generate a new Query object |
29
|
|
|
|
|
|
|
my $bot = OurNet::Query->new($query, $hits, @sites); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# Perform a query |
32
|
|
|
|
|
|
|
my $found = $bot->begin(\&callback, 30); # Timeout after 30 seconds |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
print '*** ' . ($found ? $found : 'No') . ' match(es) found.'; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub callback { |
37
|
|
|
|
|
|
|
my %entry = @_; |
38
|
|
|
|
|
|
|
my $entry = \%entry; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
unless ($found{$entry{url}}) { |
41
|
|
|
|
|
|
|
print "*** [$entry->{title}]" . |
42
|
|
|
|
|
|
|
" ($entry->{score})" . |
43
|
|
|
|
|
|
|
" - [$entry->{id}]\n" . |
44
|
|
|
|
|
|
|
" URL: [$entry->{url}]\n"; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
$found{$entry{url}}++; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 DESCRIPTION |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
This module provides an easy interface to perform multiple queries |
53
|
|
|
|
|
|
|
to internet services, and I them into your own format at once. |
54
|
|
|
|
|
|
|
The results are processed on-the-fly and are returned via callback |
55
|
|
|
|
|
|
|
functions. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Its interfaces resembles that of I's, but implements it |
58
|
|
|
|
|
|
|
in a different fashion. While I relies on additional |
59
|
|
|
|
|
|
|
subclasses to parse returned results, I uses I
|
60
|
|
|
|
|
|
|
descriptors> for search search engine, which makes it much easier |
61
|
|
|
|
|
|
|
to add new backends. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
Site descriptors may be written in XML, I toolkit format, |
64
|
|
|
|
|
|
|
or the I<.fmt> format from the commercial Inforia Quest product. |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 CAVEATS |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The only confirmed, working site descriptor currently is F. |
69
|
|
|
|
|
|
|
The majority of F<*.xml> descriptors are outdated, and need volunteers |
70
|
|
|
|
|
|
|
to either correct them, or convert them to C<.tt2> format. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
This package is supposedly to I turn your web pages built |
73
|
|
|
|
|
|
|
with Template Toolkit into web services overnight, using I-based |
74
|
|
|
|
|
|
|
induction heuristics; but this is not happening yet. Stay tuned. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
There should be instructions of how to write templates in various |
77
|
|
|
|
|
|
|
formats. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=head1 COMPONENTS |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
Most Query Toolkit components are independently useful; they rely on |
82
|
|
|
|
|
|
|
several front-end interfaces to glue themselves together. |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head2 Full-Text Search Engine (FuzzyIndex) |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
The indexing module I implement an indexing mechanism suitable |
87
|
|
|
|
|
|
|
to handle variable-byte encoding charsets, e.g. big-5 or utf8. Its |
88
|
|
|
|
|
|
|
index file I require original data be presented, nor |
89
|
|
|
|
|
|
|
exceed the original data size on verage. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 Interactive Queries (ChatBot) |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
The interactive query module I accept context-free queries against |
94
|
|
|
|
|
|
|
any indexed database generated by the Search Engine, and provide feedbacks |
95
|
|
|
|
|
|
|
based on the entries contained within. It I develop a heuristic to |
96
|
|
|
|
|
|
|
accumulate user input, and build connections between entries based on |
97
|
|
|
|
|
|
|
relevancy. |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head2 Template Extraction (Template::Extract) |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
This component I support the C Toolkit format, and |
102
|
|
|
|
|
|
|
I support additional template formats. It I be capable of |
103
|
|
|
|
|
|
|
taking a document and the original template used to generated it, |
104
|
|
|
|
|
|
|
and produce the original parameter list. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
All simple assignment and loop directives I be supported; it |
107
|
|
|
|
|
|
|
I also accept nested loops and structure elements. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head2 Site Descriptors (Site) |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
This includes a collection of oft-used web sites, akin to the |
112
|
|
|
|
|
|
|
C or Inforia Quest collection. It I also support |
113
|
|
|
|
|
|
|
basic validation and variable interpolation within the descriptors. |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 Template Generation (Template::Generate) |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
This module I be able to generate the original template, based |
118
|
|
|
|
|
|
|
on two or more distinct outputs. It I operate without any |
119
|
|
|
|
|
|
|
prompt of original structures, but I draw on such information to |
120
|
|
|
|
|
|
|
increase its accuracy. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 Front-End Interface (bin/*) |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
All above components I come with at least one command-line |
125
|
|
|
|
|
|
|
utility, capable of exporting most of their functions to the normal |
126
|
|
|
|
|
|
|
user. The utilities I assume a common look-and-feel. |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head2 Documentation (pod/*) |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
The Query Toolkit Manual I contain a tutorial, an overview |
131
|
|
|
|
|
|
|
of functions, and guides on how to embedd Query components into |
132
|
|
|
|
|
|
|
existing programs. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head1 MILESTONES |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head2 Milestone 0 - v1.56 - 2001/09/01 |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
This milestone represents the raw, unconnected state of all tools. |
139
|
|
|
|
|
|
|
It provides all basic functionalities except for template generation, |
140
|
|
|
|
|
|
|
yet offers only fzindex / fzquery as useful user-accessible interfaces. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
FuzzyIndex big-5 & latin-1 support |
143
|
|
|
|
|
|
|
ChatBot automatic building of default database |
144
|
|
|
|
|
|
|
T::Extract template toolkit support; nested fetch |
145
|
|
|
|
|
|
|
Site google (as proof-of-concept) |
146
|
|
|
|
|
|
|
bin/* all above interfaces |
147
|
|
|
|
|
|
|
pod/* overview of functions |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=head2 Milestone 1 - v1.6 - 2001/10/15 |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
This milestone aims to export a consistent interface to other developers, |
152
|
|
|
|
|
|
|
by populating the missing descriptor and documents. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
FuzzyIndex gb-1312 support |
155
|
|
|
|
|
|
|
Site all major search engines and news sources |
156
|
|
|
|
|
|
|
T::Generate simple diff-based heuristic framework |
157
|
|
|
|
|
|
|
bin/* a parallel, configurable sitequery coupled with fzindex |
158
|
|
|
|
|
|
|
pod/* embbed-howto, including win32 COM+ port |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=head2 Milestone 2 - v1.7 - 2002/01/01 |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
This milestone will be the first feature-complete release of Query Toolkit, |
163
|
|
|
|
|
|
|
capable of being used in a more diversed environment. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=cut |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# --------------- |
168
|
|
|
|
|
|
|
# Variable Fields |
169
|
|
|
|
|
|
|
# --------------- |
170
|
|
|
|
|
|
|
use fields qw/callback pua timeout query sites bots hits found/; |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# ----------------- |
173
|
|
|
|
|
|
|
# Package Constants |
174
|
|
|
|
|
|
|
# ----------------- |
175
|
|
|
|
|
|
|
use constant ERROR_QUERY_NEEDED => __PACKAGE__ . ' needs a query'; |
176
|
|
|
|
|
|
|
use constant ERROR_HITS_NEEDED => __PACKAGE__ . ' needs sufficient hits'; |
177
|
|
|
|
|
|
|
use constant ERROR_SITES_NEEDED => __PACKAGE__ . ' needs one or more sites'; |
178
|
|
|
|
|
|
|
use constant ERROR_CALLBACK_NEEDED => __PACKAGE__ . ' needs a callback sub'; |
179
|
|
|
|
|
|
|
use constant ERROR_PROTOCOL_UNDEF => __PACKAGE__ . ' cannot use the protocol'; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# ------------------------------------- |
182
|
|
|
|
|
|
|
# Subroutine new($query, $hits, @sites) |
183
|
|
|
|
|
|
|
# ------------------------------------- |
184
|
|
|
|
|
|
|
sub new { |
185
|
|
|
|
|
|
|
my $class = shift; |
186
|
|
|
|
|
|
|
my $self = ($] > 5.00562) ? fields::new($class) |
187
|
|
|
|
|
|
|
: do { no strict 'refs'; |
188
|
|
|
|
|
|
|
bless [\%{"$class\::FIELDS"}], $class }; |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
$self->{query} = shift or (warn(ERROR_QUERY_NEEDED), return); |
191
|
|
|
|
|
|
|
$self->{hits} = shift or (warn(ERROR_HITS_NEEDED), return); |
192
|
|
|
|
|
|
|
$self->{sites} = [ @_ ] or (warn(ERROR_SITES_NEEDED), return); |
193
|
|
|
|
|
|
|
$self->{pua} = LWP::Parallel::UserAgent->new; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
return $self; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# --------------------------------------------- |
199
|
|
|
|
|
|
|
# Subroutine begin($self, \&callback, $timeout) |
200
|
|
|
|
|
|
|
# --------------------------------------------- |
201
|
|
|
|
|
|
|
sub begin { |
202
|
|
|
|
|
|
|
my $self = shift; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
$self->{callback} = ($_[0] ? $_[0] : $self->{callback}) |
205
|
|
|
|
|
|
|
or (warn(ERROR_CALLBACK_NEEDED), return); |
206
|
|
|
|
|
|
|
$self->{timeout} = ($_[1] ? $_[1] : $self->{timeout}); |
207
|
|
|
|
|
|
|
$self->{pua}->initialize; |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
foreach my $count (0 .. $#{$self->{sites}}) { |
210
|
|
|
|
|
|
|
$self->{bots}[$count] = OurNet::Site->new( |
211
|
|
|
|
|
|
|
$self->{sites}[$count] |
212
|
|
|
|
|
|
|
); |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
my $siteurl = $self->{bots}[$count]->geturl( |
215
|
|
|
|
|
|
|
$self->{query}, $self->{hits} |
216
|
|
|
|
|
|
|
); |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
my $request = ($siteurl =~ m|^post:([^\?]+)\?(.+)|) |
219
|
|
|
|
|
|
|
? POST("http:$1", [split('[&;=]', $2)]) |
220
|
|
|
|
|
|
|
: GET($siteurl) |
221
|
|
|
|
|
|
|
or (warn(ERROR_PROTOCOL_UNDEF), return); |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# Closure is not something that most Perl programmers need |
224
|
|
|
|
|
|
|
# trouble themselves about to begin with. (perlref.pod) |
225
|
|
|
|
|
|
|
$self->{pua}->register($request, sub { |
226
|
|
|
|
|
|
|
$self->{bots}[$count]->callme($self, $count, |
227
|
|
|
|
|
|
|
$_[0], \&callmeback); |
228
|
|
|
|
|
|
|
return; |
229
|
|
|
|
|
|
|
}); |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
$self->{found} = 0; |
233
|
|
|
|
|
|
|
$self->{pua}->wait($self->{timeout}); |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
return $self->{found}; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# -------------------------------------- |
239
|
|
|
|
|
|
|
# Subroutine callmeback($self, $himself) |
240
|
|
|
|
|
|
|
# -------------------------------------- |
241
|
|
|
|
|
|
|
sub callmeback { |
242
|
|
|
|
|
|
|
my ($self, $himself) = @_; |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
foreach my $entry (@{$himself->{response}}) { |
245
|
|
|
|
|
|
|
if (exists($entry->{url})) { |
246
|
|
|
|
|
|
|
&{$self->{callback}}(%{$entry}); |
247
|
|
|
|
|
|
|
delete($entry->{url}); |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
$self->{found}++; |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
1; |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=head1 SEE ALSO |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
L |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=head1 AUTHORS |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Autrijus Tang Eautrijus@autrijus.org> |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=head1 COPYRIGHT |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
Copyright 2001 by Autrijus Tang Eautrijus@autrijus.org>. |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or |
269
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
See L |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=cut |