File Coverage

blib/lib/URI/Sequin.pm

Criterion	Covered	Total	%
statement	20	83	24.1
branch	5	36	13.8
condition	0	5	0.0
subroutine	3	5	60.0
pod	3	3	100.0
total	31	132	23.4

line	stmt	bran	cond	sub	pod	time	code
1							# Sequin v1.1.2
2							#
3							# by Peter Sergeant
4							#
5							# A module for extracting and parsing search engine URLs from
6							# server referrer files. Proper usage information is in the
7							# README file
8
9
10							# Magic Package Stuff
11							require 5.005;
12	1			1		637	use strict;
	1					2
	1					98
13							require Exporter;
14							package URI::Sequin;
15	1			1		5	use vars qw(@ISA $VERSION @EXPORT_OK %log_types);
	1					2
	1					1680
16							@ISA = qw(Exporter);
17							@EXPORT_OK = qw(se_extract log_extract %log_types key_extract);
18							$VERSION = '1.2';
19
20
21
22							# &log_extract v1.0
23							# =-=-=-=-=-=- =-=-
24							#
25							#
26							# The purpose of this subroutine is to allow raw log files lines to be
27							# handled. The subroutine accepts a log line, plus some information on
28							# how it should be analysed, and returns a scalar value: the referring
29							# URL.
30							#
31							# The subroutine knows a certain number of log types, and keeps regexs
32							# with which to handle these logs in a globally accessable hash below,
33							# called '%log_types'. If your log type is not already in the array it
34							# can be added and used.
35							#
36							# Examples:
37							# ---------
38							#
39							# Adding a new regex to %log_types:
40							# => $log_types{'MyWebServer'} = '.+? Referer:(.+?) ';
41							#
42							# > It's worth pointing out that the subroutine uses $1 straight
43							# > after the match has taken place to get the referrer. Because
44							# > of this, you should make sure the part of string to be taken
45							# > is enclosed in ()'s. If you're still unsure, this is clearly
46							# > demonstrated below, where %log_types is set.
47							#
48							# Parsing a Log Entry
49							# => $referrer = &log_extract($log_line, 'NCSA');
50							#
51							# > As I hope is clear, $log_line is the log-file line that needs
52							# > to be parsed, and 'Apache' refers to the relevant regex below
53							# > in the %log_types hash.
54							#
55
56							%log_types = (
57							# Microsoft IIS 3.0 and 2.0
58							'IIS1' => '(http:.+?),',
59
60							# Microsoft IIS4.0 (W3SVC format)
61							'IIS2' => '(http:.+?)$',
62
63							# NCSA (Apache, Netscape)
64							'NCSA' => '"(http:.+?)"',
65
66							# O'Reilly WebSite format
67							'ORW' => ' (http:.+?) ',
68
69							# General (works for most logtypes)
70							'General' => '(?:\s\|"\|,\|^)(http:.+?)(\s\|"\|,\|$)',
71
72							);
73
74
75
76							sub log_extract {
77
78	0			0	1	0	my $log_file_line = $_[0];
79	0		0			0	my $log_file_type = $_[1] \|\| 'General';
80
81	0					0	chomp($log_file_line);
82
83							# Check that the $log_file_type contains a valid regex by using
84							# (eval) on it to see if we crash the regex engine, and by also
85							# checking if there is a regex in $log_types{$log_file_type}
86
87	0					0	my $re = eval { qr/$log_types{$log_file_type}/ };
	0					0
88	0	0				0	warn "Bad re: '$log_types{$log_file_type}' ($@)\n" if $@;
89
90	0	0				0	unless (defined $log_types{$log_file_type}) {
91	0					0	warn "Unknown Logtype - \"$log_file_type\"\n";
92							}
93
94
95							# Return what we found
96
97	0	0				0	if ($log_file_line =~ m/$log_types{$log_file_type}/i) { return $1 };
	0					0
98
99	0					0	return;
100
101							}
102
103
104
105							# &se_extract v1.1
106							# =-=-=-=-=-= =-=-
107							#
108							# The purpose of this subroutine is to break down the referring URL in
109							# to an array, containing the $search_engine_name and the
110							# $search_engine_url.
111							#
112							# Example:
113							# => ($name, $url) = @{&se_extract($url)};
114							#
115
116							sub se_extract {
117
118	0			0	1	0	my $input_url = $_[0];
119	0					0	chomp($input_url);
120
121							# Break down the $input_url into two more useful variables, so
122							# that we can check if there is information in the query
123							# string, and if there is, we just get on with life.
124
125	0					0	my ($location, $query_string) = split(/\?/, $input_url);
126	0	0				0	return [] unless $query_string;
127
128	0					0	my $search_engine_name;
129							my $search_engine_url;
130
131							# This is a scary regex. It picks out with suprising accuracy
132							# the main part of a URL - the 'MSN' part of:
133							# http://biteme15.search.cgi.msn.com.uk/?asdfasdf
134
135	0	0				0	if ($location =~ m!(http://)?(\d+\.\d+\.\d+\.\d+(\:\d+)?)/!) {
136	0					0	return ["Unknown (IP)", $2];
137							}
138
139	0	0				0	if ($location =~ m!^(.+?\.
		0
		0
140							([^\.]+)
141							\.
142							(com\|net\|org\|int\|mil\|\w\w\|
143							(gov\|mil\|com\|net\|org\|\w\w)\.\w\w
144							)
145							(?:/\|:\d+/)
146							)!x) {
147	0					0	$search_engine_url = $1;
148	0					0	$search_engine_name = "\u$2";
149
150							} elsif ($location =~ m!^(http://)?((\w+)\.\w+(\:\d+/?)?)!) {
151	0					0	$search_engine_url = $2;
152	0					0	$search_engine_name = "\u$3";
153							} elsif ($location =~ m!^(http://)?((\w+)([^\.\w:]\|(\:\d+/?)?))!) {
154	0					0	$search_engine_url = $2;
155	0					0	$search_engine_name = "\u$3";
156							}
157
158
159							# This has allowed us to quite accurately get the name and URL
160							# of any given search-engine. However, in the interests of
161							# total accuracy, we have a list of search-engines that we know
162							# so we can provide even more information, and make sure it's
163							# correct.
164
165							# Define this list:
166
167	0					0	my @search_engine_array = (
168							['Altavista', 'http://www.av.com',
169							'(altavista\|av)'],
170							['HotBot', 'http://www.hotbot.com',
171							'hotbot\.lycos'],
172							['Infoseek', 'http://www.infoseek.com',
173							'infoseek\.go'],
174							['Magellan', 'http://magellan.excite.com',
175							'magellan\.excite'],
176							['Ask Jeeves', 'http://www.aj.com',
177							'(aj\|askjeeves)'],
178							['CNET Search', 'http://www.search.com',
179							'(cnet\|search\.com\|savysearch)'],
180							);
181
182							# Cycle through the list
183
184	0					0	for (@search_engine_array) {
185
186	0					0	my ($se_name, $se_url, $se_regex) = @{$_};
	0					0
187
188	0	0				0	if ($location =~ m/$se_regex/) {
189	0					0	$search_engine_url = $se_url;
190	0					0	$search_engine_name = $se_name;
191							}
192
193							}
194
195							# Return what we know.
196							# jm: allow HTTPS search engines too ;)
197
198	0	0	0			0	if (defined $search_engine_url && $search_engine_url !~ m!^https?://!) {
199	0					0	$search_engine_url =~ s!^!http://!;
200							}
201
202	0					0	return [$search_engine_name, $search_engine_url];
203
204							}
205
206
207							# &key_extract v1.1
208							# =-=-=-=-=-= =-=-
209							#
210							# The purpose of this subroutine is to break down the referring URL in
211							# to a string containing the search terms.
212							#
213							# Example:
214							# => $terms = &key_extract($url);
215							#
216
217							sub key_extract {
218
219	1			1	1	47	my $input_url = $_[0];
220
221	1					5	chomp($input_url);
222
223							# Break down the $input_url in to two more useful variables
224
225	1					5	my ($location, $query_string) = split(/\?/, $input_url);
226	1	50				4	return unless $query_string;
227
228							# Google Caching ... What a bitch... This will deal with it,
229							# how Google currently works...
230
231	1	50				6	if ($query_string =~ m!q=cache\:.+/(.+?)&!i) {
232	0					0	$_ = $1;
233	0					0	tr/+/ /;
234	0					0	s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	0					0
235	0					0	s/\s+/ /gs;
236	0					0	s/^\s+//g;
237	0					0	s/\s+$//g;
238	0					0	return $_;
239							}
240
241							# There are a number of ways in which we now try and determine
242							# what the search terms are. The first is quite clever, IMHO.
243							# We search for spaces in any of the submitted fields that
244							# isn't called 'next' or 'submit' or 'col' or 'btnG' (blame
245							# google).
246
247
248
249	1	50				12	if ($query_string =~ m/(?
250							(?
251							([^&](?:\+\|%2b)[^&])/xi) {
252
253
254	1					5	my $key_string = $1;
255	1					3	my $false = 0;
256
257							# Some search engines are determined to try and fool us
258							# :). Therefore, we kill some pseudo-matches containing
259							# %07C ( a pipe: \| ) and %02C, by setting the $false
260							# scalar to a positive value, that overides a little
261							# later on.
262
263	1	50				5	$false++ if $key_string =~ m/(%02\|%7C%7C)/;
264
265							# Clean our information from those nasty escape
266							# sequences.
267
268	1					3	for ($key_string) {
269	1					9	tr/+/ /;
270	1					3	s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	0					0
271	1					8	s/\s+/ /gs;
272							}
273
274							# Unless we decided to abort earlier, return the
275							# field that we found.
276
277	1	50				7	return $key_string unless ($false);
278
279							}
280
281							# Okay. If that failed, then we need to take a closer look.
282							# In the array below are many many possible prefixes for a term
283							# that might contain our data. They're in a particular order
284							# because some search engines use two of the variables.
285
286							# NB: This isn't quite finished. If you're finding that the
287							# wrong prefix is being used, please email me and tell me
288							# at pete_sergeant@hotmail.com
289
290							# If you're wondering why they're ordered in this slightly
291							# bizarre and seemingly random order, it's because some
292							# search engines have decided to use more than one of these
293							# variables, and the order these are in hopefully pick the
294							# right one first.
295
296	0						my @prefix_array = (
297
298
299							'\wquery\w', # CNET Search, Netscape
300							'\wsearch(?!Type)\w',
301							'\wterm\w',
302							'ask', # Ask Jeeves
303							'.\w?key.\w?',
304							'palabras',
305							'DTqb1',
306							'request',
307							'ShowMatch', # syndic8
308							'keywords?', # Snap, overture.com
309							'general', # MetaCrawler, Go2Net
310							'key', # Looksmart
311							'MetaTopic', # AJ
312							'query0', # elf8888.at, thx to http://www.tnl.net/
313							'queryString', # blogdigger.com
314							'serachfor', # mysearch.com dyslexia ;)
315							'terms', # abcsearch.com
316							'word', # baidu.com
317							'rn',
318							'mt', # MSN, HotBot
319							'qt', # Go, Infoseek, search.com
320							'oq',
321							'dom', # Domainsurfer
322							's', # Excite, blogsphere.us
323							'q', # Altavista, Google, Dogpile, Evreka, Metafind
324							'p', # Yahoo
325							't',
326							'qry',
327							'qkw', # dpxml, msxml
328							'qr', # northernlight.com
329							'qu',
330							'kw', # Sapo
331							'general',
332							'B1',
333							'sc', # Gohip
334							'szukaj',
335							'PA',
336							'MT', # goo.ne.jp
337							'req', # dir.com
338							'k', # galaxy.com
339							'cat', # Dmoz
340							'u', # Google translation
341							'va', # search.yahoo.com
342							'K', # srd.yahoo.com
343							'as_epq' # Google, sometimes. Advanced query maybe?
344
345							);
346
347							# Cycle through each prefix and see if it's contained in the
348							# query_string. If it is, we extract the field, clean it, and
349							# return it. Simple.
350
351
352	0						for (@prefix_array) {
353	0	0					if ($query_string =~ m/(^\|\&)$_=(.+?)(\&\|$)/i) {
354
355	0						my $key_string = $2;
356
357	0						for ($key_string) {
358	0						tr/+/ /;
359	0						s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	0
360	0						s/\s+/ /gs;
361							}
362
363	0	0					if ($key_string =~ /\w/) {
364	0						return $key_string;
365							}
366
367							}
368
369							}
370
371							# Failing all that, some Search-Engines don't overload the
372							# query_string with values, and just make the query_string
373							# the search terms. The next part looks for that, and returns
374							# the whole query_string (cleaned) if this appears to be the
375							# case.
376
377
378	0	0					if ($query_string !~ /\=/) {
379
380	0						for ($query_string) {
381	0						tr/+/ /;
382	0						s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	0
383	0						s/\s+/ /gs;
384							}
385
386	0						return $query_string;
387							}
388
389
390	0						return;
391							}
392
393							1;
394
395							__END__