File Coverage

blib/lib/URI/Sequin.pm
Criterion Covered Total %
statement 20 83 24.1
branch 5 36 13.8
condition 0 5 0.0
subroutine 3 5 60.0
pod 3 3 100.0
total 31 132 23.4


line stmt bran cond sub pod time code
1             # Sequin v1.1.2
2             #
3             # by Peter Sergeant
4             #
5             # A module for extracting and parsing search engine URLs from
6             # server referrer files. Proper usage information is in the
7             # README file
8              
9              
10             # Magic Package Stuff
11             require 5.005;
12 1     1   637 use strict;
  1         2  
  1         98  
13             require Exporter;
14             package URI::Sequin;
15 1     1   5 use vars qw(@ISA $VERSION @EXPORT_OK %log_types);
  1         2  
  1         1680  
16             @ISA = qw(Exporter);
17             @EXPORT_OK = qw(se_extract log_extract %log_types key_extract);
18             $VERSION = '1.2';
19              
20              
21              
22             # &log_extract v1.0
23             # =-=-=-=-=-=- =-=-
24             #
25             #
26             # The purpose of this subroutine is to allow raw log files lines to be
27             # handled. The subroutine accepts a log line, plus some information on
28             # how it should be analysed, and returns a scalar value: the referring
29             # URL.
30             #
31             # The subroutine knows a certain number of log types, and keeps regexs
32             # with which to handle these logs in a globally accessable hash below,
33             # called '%log_types'. If your log type is not already in the array it
34             # can be added and used.
35             #
36             # Examples:
37             # ---------
38             #
39             # Adding a new regex to %log_types:
40             # => $log_types{'MyWebServer'} = '.+? Referer:(.+?) ';
41             #
42             # > It's worth pointing out that the subroutine uses $1 straight
43             # > after the match has taken place to get the referrer. Because
44             # > of this, you should make sure the part of string to be taken
45             # > is enclosed in ()'s. If you're still unsure, this is clearly
46             # > demonstrated below, where %log_types is set.
47             #
48             # Parsing a Log Entry
49             # => $referrer = &log_extract($log_line, 'NCSA');
50             #
51             # > As I hope is clear, $log_line is the log-file line that needs
52             # > to be parsed, and 'Apache' refers to the relevant regex below
53             # > in the %log_types hash.
54             #
55              
56             %log_types = (
57             # Microsoft IIS 3.0 and 2.0
58             'IIS1' => '(http:.+?),',
59              
60             # Microsoft IIS4.0 (W3SVC format)
61             'IIS2' => '(http:.+?)$',
62              
63             # NCSA (Apache, Netscape)
64             'NCSA' => '"(http:.+?)"',
65              
66             # O'Reilly WebSite format
67             'ORW' => ' (http:.+?) ',
68              
69             # General (works for most logtypes)
70             'General' => '(?:\s|"|,|^)(http:.+?)(\s|"|,|$)',
71            
72             );
73              
74              
75              
76             sub log_extract {
77              
78 0     0 1 0 my $log_file_line = $_[0];
79 0   0     0 my $log_file_type = $_[1] || 'General';
80              
81 0         0 chomp($log_file_line);
82              
83             # Check that the $log_file_type contains a valid regex by using
84             # (eval) on it to see if we crash the regex engine, and by also
85             # checking if there is a regex in $log_types{$log_file_type}
86              
87 0         0 my $re = eval { qr/$log_types{$log_file_type}/ };
  0         0  
88 0 0       0 warn "Bad re: '$log_types{$log_file_type}' ($@)\n" if $@;
89              
90 0 0       0 unless (defined $log_types{$log_file_type}) {
91 0         0 warn "Unknown Logtype - \"$log_file_type\"\n";
92             }
93            
94            
95             # Return what we found
96              
97 0 0       0 if ($log_file_line =~ m/$log_types{$log_file_type}/i) { return $1 };
  0         0  
98              
99 0         0 return;
100              
101             }
102              
103              
104              
105             # &se_extract v1.1
106             # =-=-=-=-=-= =-=-
107             #
108             # The purpose of this subroutine is to break down the referring URL in
109             # to an array, containing the $search_engine_name and the
110             # $search_engine_url.
111             #
112             # Example:
113             # => ($name, $url) = @{&se_extract($url)};
114             #
115              
116             sub se_extract {
117              
118 0     0 1 0 my $input_url = $_[0];
119 0         0 chomp($input_url);
120              
121             # Break down the $input_url into two more useful variables, so
122             # that we can check if there is information in the query
123             # string, and if there is, we just get on with life.
124              
125 0         0 my ($location, $query_string) = split(/\?/, $input_url);
126 0 0       0 return [] unless $query_string;
127              
128 0         0 my $search_engine_name;
129             my $search_engine_url;
130              
131             # This is a scary regex. It picks out with suprising accuracy
132             # the main part of a URL - the 'MSN' part of:
133             # http://biteme15.search.cgi.msn.com.uk/?asdfasdf
134              
135 0 0       0 if ($location =~ m!(http://)?(\d+\.\d+\.\d+\.\d+(\:\d+)?)/!) {
136 0         0 return ["Unknown (IP)", $2];
137             }
138              
139 0 0       0 if ($location =~ m!^(.+?\.
    0          
    0          
140             ([^\.]+)
141             \.
142             (com|net|org|int|mil|\w\w|
143             (gov|mil|com|net|org|\w\w)\.\w\w
144             )
145             (?:/|:\d+/)
146             )!x) {
147 0         0 $search_engine_url = $1;
148 0         0 $search_engine_name = "\u$2";
149              
150             } elsif ($location =~ m!^(http://)?((\w+)\.\w+(\:\d+/?)?)!) {
151 0         0 $search_engine_url = $2;
152 0         0 $search_engine_name = "\u$3";
153             } elsif ($location =~ m!^(http://)?((\w+)([^\.\w:]|(\:\d+/?)?))!) {
154 0         0 $search_engine_url = $2;
155 0         0 $search_engine_name = "\u$3";
156             }
157              
158              
159             # This has allowed us to quite accurately get the name and URL
160             # of any given search-engine. However, in the interests of
161             # total accuracy, we have a list of search-engines that we know
162             # so we can provide even more information, and make sure it's
163             # correct.
164              
165             # Define this list:
166              
167 0         0 my @search_engine_array = (
168             ['Altavista', 'http://www.av.com',
169             '(altavista|av)'],
170             ['HotBot', 'http://www.hotbot.com',
171             'hotbot\.lycos'],
172             ['Infoseek', 'http://www.infoseek.com',
173             'infoseek\.go'],
174             ['Magellan', 'http://magellan.excite.com',
175             'magellan\.excite'],
176             ['Ask Jeeves', 'http://www.aj.com',
177             '(aj|askjeeves)'],
178             ['CNET Search', 'http://www.search.com',
179             '(cnet|search\.com|savysearch)'],
180             );
181              
182             # Cycle through the list
183              
184 0         0 for (@search_engine_array) {
185              
186 0         0 my ($se_name, $se_url, $se_regex) = @{$_};
  0         0  
187              
188 0 0       0 if ($location =~ m/$se_regex/) {
189 0         0 $search_engine_url = $se_url;
190 0         0 $search_engine_name = $se_name;
191             }
192              
193             }
194              
195             # Return what we know.
196             # jm: allow HTTPS search engines too ;)
197              
198 0 0 0     0 if (defined $search_engine_url && $search_engine_url !~ m!^https?://!) {
199 0         0 $search_engine_url =~ s!^!http://!;
200             }
201              
202 0         0 return [$search_engine_name, $search_engine_url];
203              
204             }
205              
206              
207             # &key_extract v1.1
208             # =-=-=-=-=-= =-=-
209             #
210             # The purpose of this subroutine is to break down the referring URL in
211             # to a string containing the search terms.
212             #
213             # Example:
214             # => $terms = &key_extract($url);
215             #
216              
217             sub key_extract {
218              
219 1     1 1 47 my $input_url = $_[0];
220              
221 1         5 chomp($input_url);
222              
223             # Break down the $input_url in to two more useful variables
224              
225 1         5 my ($location, $query_string) = split(/\?/, $input_url);
226 1 50       4 return unless $query_string;
227              
228             # Google Caching ... What a bitch... This will deal with it,
229             # how Google currently works...
230            
231 1 50       6 if ($query_string =~ m!q=cache\:.+/(.+?)&!i) {
232 0         0 $_ = $1;
233 0         0 tr/+/ /;
234 0         0 s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
  0         0  
235 0         0 s/\s+/ /gs;
236 0         0 s/^\s+//g;
237 0         0 s/\s+$//g;
238 0         0 return $_;
239             }
240            
241             # There are a number of ways in which we now try and determine
242             # what the search terms are. The first is quite clever, IMHO.
243             # We search for spaces in any of the submitted fields that
244             # isn't called 'next' or 'submit' or 'col' or 'btnG' (blame
245             # google).
246              
247            
248              
249 1 50       12 if ($query_string =~ m/(?
250             (?
251             ([^&]*(?:\+|%2b)[^&]*)/xi) {
252              
253            
254 1         5 my $key_string = $1;
255 1         3 my $false = 0;
256              
257             # Some search engines are determined to try and fool us
258             # :). Therefore, we kill some pseudo-matches containing
259             # %07C ( a pipe: | ) and %02C, by setting the $false
260             # scalar to a positive value, that overides a little
261             # later on.
262              
263 1 50       5 $false++ if $key_string =~ m/(%02|%7C%7C)/;
264              
265             # Clean our information from those nasty escape
266             # sequences.
267              
268 1         3 for ($key_string) {
269 1         9 tr/+/ /;
270 1         3 s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
  0         0  
271 1         8 s/\s+/ /gs;
272             }
273              
274             # Unless we decided to abort earlier, return the
275             # field that we found.
276              
277 1 50       7 return $key_string unless ($false);
278              
279             }
280              
281             # Okay. If that failed, then we need to take a closer look.
282             # In the array below are many many possible prefixes for a term
283             # that might contain our data. They're in a particular order
284             # because some search engines use two of the variables.
285              
286             # NB: This isn't quite finished. If you're finding that the
287             # wrong prefix is being used, please email me and tell me
288             # at pete_sergeant@hotmail.com
289              
290             # If you're wondering why they're ordered in this slightly
291             # bizarre and seemingly random order, it's because some
292             # search engines have decided to use more than one of these
293             # variables, and the order these are in hopefully pick the
294             # right one first.
295              
296 0           my @prefix_array = (
297              
298            
299             '\w*query\w*', # CNET Search, Netscape
300             '\w*search(?!Type)\w*',
301             '\w*term\w*',
302             'ask', # Ask Jeeves
303             '.\w?key.\w?',
304             'palabras',
305             'DTqb1',
306             'request',
307             'ShowMatch', # syndic8
308             'keywords?', # Snap, overture.com
309             'general', # MetaCrawler, Go2Net
310             'key', # Looksmart
311             'MetaTopic', # AJ
312             'query0', # elf8888.at, thx to http://www.tnl.net/
313             'queryString', # blogdigger.com
314             'serachfor', # mysearch.com dyslexia ;)
315             'terms', # abcsearch.com
316             'word', # baidu.com
317             'rn',
318             'mt', # MSN, HotBot
319             'qt', # Go, Infoseek, search.com
320             'oq',
321             'dom', # Domainsurfer
322             's', # Excite, blogsphere.us
323             'q', # Altavista, Google, Dogpile, Evreka, Metafind
324             'p', # Yahoo
325             't',
326             'qry',
327             'qkw', # dpxml, msxml
328             'qr', # northernlight.com
329             'qu',
330             'kw', # Sapo
331             'general',
332             'B1',
333             'sc', # Gohip
334             'szukaj',
335             'PA',
336             'MT', # goo.ne.jp
337             'req', # dir.com
338             'k', # galaxy.com
339             'cat', # Dmoz
340             'u', # Google translation
341             'va', # search.yahoo.com
342             'K', # srd.yahoo.com
343             'as_epq' # Google, sometimes. Advanced query maybe?
344              
345             );
346              
347             # Cycle through each prefix and see if it's contained in the
348             # query_string. If it is, we extract the field, clean it, and
349             # return it. Simple.
350              
351              
352 0           for (@prefix_array) {
353 0 0         if ($query_string =~ m/(^|\&)$_=(.+?)(\&|$)/i) {
354              
355 0           my $key_string = $2;
356              
357 0           for ($key_string) {
358 0           tr/+/ /;
359 0           s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
  0            
360 0           s/\s+/ /gs;
361             }
362              
363 0 0         if ($key_string =~ /\w/) {
364 0           return $key_string;
365             }
366              
367             }
368              
369             }
370              
371             # Failing all that, some Search-Engines don't overload the
372             # query_string with values, and just make the query_string
373             # the search terms. The next part looks for that, and returns
374             # the whole query_string (cleaned) if this appears to be the
375             # case.
376              
377              
378 0 0         if ($query_string !~ /\=/) {
379              
380 0           for ($query_string) {
381 0           tr/+/ /;
382 0           s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
  0            
383 0           s/\s+/ /gs;
384             }
385              
386 0           return $query_string;
387             }
388              
389              
390 0           return;
391             }
392              
393             1;
394              
395             __END__