line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Sequin v1.1.2 |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# by Peter Sergeant |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# A module for extracting and parsing search engine URLs from |
6
|
|
|
|
|
|
|
# server referrer files. Proper usage information is in the |
7
|
|
|
|
|
|
|
# README file |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Magic Package Stuff |
11
|
|
|
|
|
|
|
require 5.005; |
12
|
1
|
|
|
1
|
|
637
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
98
|
|
13
|
|
|
|
|
|
|
require Exporter; |
14
|
|
|
|
|
|
|
package URI::Sequin; |
15
|
1
|
|
|
1
|
|
5
|
use vars qw(@ISA $VERSION @EXPORT_OK %log_types); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1680
|
|
16
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
17
|
|
|
|
|
|
|
@EXPORT_OK = qw(se_extract log_extract %log_types key_extract); |
18
|
|
|
|
|
|
|
$VERSION = '1.2'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# &log_extract v1.0 |
23
|
|
|
|
|
|
|
# =-=-=-=-=-=- =-=- |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# |
26
|
|
|
|
|
|
|
# The purpose of this subroutine is to allow raw log files lines to be |
27
|
|
|
|
|
|
|
# handled. The subroutine accepts a log line, plus some information on |
28
|
|
|
|
|
|
|
# how it should be analysed, and returns a scalar value: the referring |
29
|
|
|
|
|
|
|
# URL. |
30
|
|
|
|
|
|
|
# |
31
|
|
|
|
|
|
|
# The subroutine knows a certain number of log types, and keeps regexs |
32
|
|
|
|
|
|
|
# with which to handle these logs in a globally accessable hash below, |
33
|
|
|
|
|
|
|
# called '%log_types'. If your log type is not already in the array it |
34
|
|
|
|
|
|
|
# can be added and used. |
35
|
|
|
|
|
|
|
# |
36
|
|
|
|
|
|
|
# Examples: |
37
|
|
|
|
|
|
|
# --------- |
38
|
|
|
|
|
|
|
# |
39
|
|
|
|
|
|
|
# Adding a new regex to %log_types: |
40
|
|
|
|
|
|
|
# => $log_types{'MyWebServer'} = '.+? Referer:(.+?) '; |
41
|
|
|
|
|
|
|
# |
42
|
|
|
|
|
|
|
# > It's worth pointing out that the subroutine uses $1 straight |
43
|
|
|
|
|
|
|
# > after the match has taken place to get the referrer. Because |
44
|
|
|
|
|
|
|
# > of this, you should make sure the part of string to be taken |
45
|
|
|
|
|
|
|
# > is enclosed in ()'s. If you're still unsure, this is clearly |
46
|
|
|
|
|
|
|
# > demonstrated below, where %log_types is set. |
47
|
|
|
|
|
|
|
# |
48
|
|
|
|
|
|
|
# Parsing a Log Entry |
49
|
|
|
|
|
|
|
# => $referrer = &log_extract($log_line, 'NCSA'); |
50
|
|
|
|
|
|
|
# |
51
|
|
|
|
|
|
|
# > As I hope is clear, $log_line is the log-file line that needs |
52
|
|
|
|
|
|
|
# > to be parsed, and 'Apache' refers to the relevant regex below |
53
|
|
|
|
|
|
|
# > in the %log_types hash. |
54
|
|
|
|
|
|
|
# |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
%log_types = ( |
57
|
|
|
|
|
|
|
# Microsoft IIS 3.0 and 2.0 |
58
|
|
|
|
|
|
|
'IIS1' => '(http:.+?),', |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Microsoft IIS4.0 (W3SVC format) |
61
|
|
|
|
|
|
|
'IIS2' => '(http:.+?)$', |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# NCSA (Apache, Netscape) |
64
|
|
|
|
|
|
|
'NCSA' => '"(http:.+?)"', |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# O'Reilly WebSite format |
67
|
|
|
|
|
|
|
'ORW' => ' (http:.+?) ', |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# General (works for most logtypes) |
70
|
|
|
|
|
|
|
'General' => '(?:\s|"|,|^)(http:.+?)(\s|"|,|$)', |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
); |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub log_extract { |
77
|
|
|
|
|
|
|
|
78
|
0
|
|
|
0
|
1
|
0
|
my $log_file_line = $_[0]; |
79
|
0
|
|
0
|
|
|
0
|
my $log_file_type = $_[1] || 'General'; |
80
|
|
|
|
|
|
|
|
81
|
0
|
|
|
|
|
0
|
chomp($log_file_line); |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Check that the $log_file_type contains a valid regex by using |
84
|
|
|
|
|
|
|
# (eval) on it to see if we crash the regex engine, and by also |
85
|
|
|
|
|
|
|
# checking if there is a regex in $log_types{$log_file_type} |
86
|
|
|
|
|
|
|
|
87
|
0
|
|
|
|
|
0
|
my $re = eval { qr/$log_types{$log_file_type}/ }; |
|
0
|
|
|
|
|
0
|
|
88
|
0
|
0
|
|
|
|
0
|
warn "Bad re: '$log_types{$log_file_type}' ($@)\n" if $@; |
89
|
|
|
|
|
|
|
|
90
|
0
|
0
|
|
|
|
0
|
unless (defined $log_types{$log_file_type}) { |
91
|
0
|
|
|
|
|
0
|
warn "Unknown Logtype - \"$log_file_type\"\n"; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# Return what we found |
96
|
|
|
|
|
|
|
|
97
|
0
|
0
|
|
|
|
0
|
if ($log_file_line =~ m/$log_types{$log_file_type}/i) { return $1 }; |
|
0
|
|
|
|
|
0
|
|
98
|
|
|
|
|
|
|
|
99
|
0
|
|
|
|
|
0
|
return; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# &se_extract v1.1 |
106
|
|
|
|
|
|
|
# =-=-=-=-=-= =-=- |
107
|
|
|
|
|
|
|
# |
108
|
|
|
|
|
|
|
# The purpose of this subroutine is to break down the referring URL in |
109
|
|
|
|
|
|
|
# to an array, containing the $search_engine_name and the |
110
|
|
|
|
|
|
|
# $search_engine_url. |
111
|
|
|
|
|
|
|
# |
112
|
|
|
|
|
|
|
# Example: |
113
|
|
|
|
|
|
|
# => ($name, $url) = @{&se_extract($url)}; |
114
|
|
|
|
|
|
|
# |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
sub se_extract { |
117
|
|
|
|
|
|
|
|
118
|
0
|
|
|
0
|
1
|
0
|
my $input_url = $_[0]; |
119
|
0
|
|
|
|
|
0
|
chomp($input_url); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Break down the $input_url into two more useful variables, so |
122
|
|
|
|
|
|
|
# that we can check if there is information in the query |
123
|
|
|
|
|
|
|
# string, and if there is, we just get on with life. |
124
|
|
|
|
|
|
|
|
125
|
0
|
|
|
|
|
0
|
my ($location, $query_string) = split(/\?/, $input_url); |
126
|
0
|
0
|
|
|
|
0
|
return [] unless $query_string; |
127
|
|
|
|
|
|
|
|
128
|
0
|
|
|
|
|
0
|
my $search_engine_name; |
129
|
|
|
|
|
|
|
my $search_engine_url; |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# This is a scary regex. It picks out with suprising accuracy |
132
|
|
|
|
|
|
|
# the main part of a URL - the 'MSN' part of: |
133
|
|
|
|
|
|
|
# http://biteme15.search.cgi.msn.com.uk/?asdfasdf |
134
|
|
|
|
|
|
|
|
135
|
0
|
0
|
|
|
|
0
|
if ($location =~ m!(http://)?(\d+\.\d+\.\d+\.\d+(\:\d+)?)/!) { |
136
|
0
|
|
|
|
|
0
|
return ["Unknown (IP)", $2]; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
0
|
0
|
|
|
|
0
|
if ($location =~ m!^(.+?\. |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
140
|
|
|
|
|
|
|
([^\.]+) |
141
|
|
|
|
|
|
|
\. |
142
|
|
|
|
|
|
|
(com|net|org|int|mil|\w\w| |
143
|
|
|
|
|
|
|
(gov|mil|com|net|org|\w\w)\.\w\w |
144
|
|
|
|
|
|
|
) |
145
|
|
|
|
|
|
|
(?:/|:\d+/) |
146
|
|
|
|
|
|
|
)!x) { |
147
|
0
|
|
|
|
|
0
|
$search_engine_url = $1; |
148
|
0
|
|
|
|
|
0
|
$search_engine_name = "\u$2"; |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
} elsif ($location =~ m!^(http://)?((\w+)\.\w+(\:\d+/?)?)!) { |
151
|
0
|
|
|
|
|
0
|
$search_engine_url = $2; |
152
|
0
|
|
|
|
|
0
|
$search_engine_name = "\u$3"; |
153
|
|
|
|
|
|
|
} elsif ($location =~ m!^(http://)?((\w+)([^\.\w:]|(\:\d+/?)?))!) { |
154
|
0
|
|
|
|
|
0
|
$search_engine_url = $2; |
155
|
0
|
|
|
|
|
0
|
$search_engine_name = "\u$3"; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# This has allowed us to quite accurately get the name and URL |
160
|
|
|
|
|
|
|
# of any given search-engine. However, in the interests of |
161
|
|
|
|
|
|
|
# total accuracy, we have a list of search-engines that we know |
162
|
|
|
|
|
|
|
# so we can provide even more information, and make sure it's |
163
|
|
|
|
|
|
|
# correct. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Define this list: |
166
|
|
|
|
|
|
|
|
167
|
0
|
|
|
|
|
0
|
my @search_engine_array = ( |
168
|
|
|
|
|
|
|
['Altavista', 'http://www.av.com', |
169
|
|
|
|
|
|
|
'(altavista|av)'], |
170
|
|
|
|
|
|
|
['HotBot', 'http://www.hotbot.com', |
171
|
|
|
|
|
|
|
'hotbot\.lycos'], |
172
|
|
|
|
|
|
|
['Infoseek', 'http://www.infoseek.com', |
173
|
|
|
|
|
|
|
'infoseek\.go'], |
174
|
|
|
|
|
|
|
['Magellan', 'http://magellan.excite.com', |
175
|
|
|
|
|
|
|
'magellan\.excite'], |
176
|
|
|
|
|
|
|
['Ask Jeeves', 'http://www.aj.com', |
177
|
|
|
|
|
|
|
'(aj|askjeeves)'], |
178
|
|
|
|
|
|
|
['CNET Search', 'http://www.search.com', |
179
|
|
|
|
|
|
|
'(cnet|search\.com|savysearch)'], |
180
|
|
|
|
|
|
|
); |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Cycle through the list |
183
|
|
|
|
|
|
|
|
184
|
0
|
|
|
|
|
0
|
for (@search_engine_array) { |
185
|
|
|
|
|
|
|
|
186
|
0
|
|
|
|
|
0
|
my ($se_name, $se_url, $se_regex) = @{$_}; |
|
0
|
|
|
|
|
0
|
|
187
|
|
|
|
|
|
|
|
188
|
0
|
0
|
|
|
|
0
|
if ($location =~ m/$se_regex/) { |
189
|
0
|
|
|
|
|
0
|
$search_engine_url = $se_url; |
190
|
0
|
|
|
|
|
0
|
$search_engine_name = $se_name; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Return what we know. |
196
|
|
|
|
|
|
|
# jm: allow HTTPS search engines too ;) |
197
|
|
|
|
|
|
|
|
198
|
0
|
0
|
0
|
|
|
0
|
if (defined $search_engine_url && $search_engine_url !~ m!^https?://!) { |
199
|
0
|
|
|
|
|
0
|
$search_engine_url =~ s!^!http://!; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
0
|
return [$search_engine_name, $search_engine_url]; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# &key_extract v1.1 |
208
|
|
|
|
|
|
|
# =-=-=-=-=-= =-=- |
209
|
|
|
|
|
|
|
# |
210
|
|
|
|
|
|
|
# The purpose of this subroutine is to break down the referring URL in |
211
|
|
|
|
|
|
|
# to a string containing the search terms. |
212
|
|
|
|
|
|
|
# |
213
|
|
|
|
|
|
|
# Example: |
214
|
|
|
|
|
|
|
# => $terms = &key_extract($url); |
215
|
|
|
|
|
|
|
# |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub key_extract { |
218
|
|
|
|
|
|
|
|
219
|
1
|
|
|
1
|
1
|
47
|
my $input_url = $_[0]; |
220
|
|
|
|
|
|
|
|
221
|
1
|
|
|
|
|
5
|
chomp($input_url); |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# Break down the $input_url in to two more useful variables |
224
|
|
|
|
|
|
|
|
225
|
1
|
|
|
|
|
5
|
my ($location, $query_string) = split(/\?/, $input_url); |
226
|
1
|
50
|
|
|
|
4
|
return unless $query_string; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# Google Caching ... What a bitch... This will deal with it, |
229
|
|
|
|
|
|
|
# how Google currently works... |
230
|
|
|
|
|
|
|
|
231
|
1
|
50
|
|
|
|
6
|
if ($query_string =~ m!q=cache\:.+/(.+?)&!i) { |
232
|
0
|
|
|
|
|
0
|
$_ = $1; |
233
|
0
|
|
|
|
|
0
|
tr/+/ /; |
234
|
0
|
|
|
|
|
0
|
s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
0
|
|
|
|
|
0
|
|
235
|
0
|
|
|
|
|
0
|
s/\s+/ /gs; |
236
|
0
|
|
|
|
|
0
|
s/^\s+//g; |
237
|
0
|
|
|
|
|
0
|
s/\s+$//g; |
238
|
0
|
|
|
|
|
0
|
return $_; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# There are a number of ways in which we now try and determine |
242
|
|
|
|
|
|
|
# what the search terms are. The first is quite clever, IMHO. |
243
|
|
|
|
|
|
|
# We search for spaces in any of the submitted fields that |
244
|
|
|
|
|
|
|
# isn't called 'next' or 'submit' or 'col' or 'btnG' (blame |
245
|
|
|
|
|
|
|
# google). |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
1
|
50
|
|
|
|
12
|
if ($query_string =~ m/(?
|
250
|
|
|
|
|
|
|
(?
|
251
|
|
|
|
|
|
|
([^&]*(?:\+|%2b)[^&]*)/xi) { |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
254
|
1
|
|
|
|
|
5
|
my $key_string = $1; |
255
|
1
|
|
|
|
|
3
|
my $false = 0; |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# Some search engines are determined to try and fool us |
258
|
|
|
|
|
|
|
# :). Therefore, we kill some pseudo-matches containing |
259
|
|
|
|
|
|
|
# %07C ( a pipe: | ) and %02C, by setting the $false |
260
|
|
|
|
|
|
|
# scalar to a positive value, that overides a little |
261
|
|
|
|
|
|
|
# later on. |
262
|
|
|
|
|
|
|
|
263
|
1
|
50
|
|
|
|
5
|
$false++ if $key_string =~ m/(%02|%7C%7C)/; |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Clean our information from those nasty escape |
266
|
|
|
|
|
|
|
# sequences. |
267
|
|
|
|
|
|
|
|
268
|
1
|
|
|
|
|
3
|
for ($key_string) { |
269
|
1
|
|
|
|
|
9
|
tr/+/ /; |
270
|
1
|
|
|
|
|
3
|
s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
0
|
|
|
|
|
0
|
|
271
|
1
|
|
|
|
|
8
|
s/\s+/ /gs; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Unless we decided to abort earlier, return the |
275
|
|
|
|
|
|
|
# field that we found. |
276
|
|
|
|
|
|
|
|
277
|
1
|
50
|
|
|
|
7
|
return $key_string unless ($false); |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Okay. If that failed, then we need to take a closer look. |
282
|
|
|
|
|
|
|
# In the array below are many many possible prefixes for a term |
283
|
|
|
|
|
|
|
# that might contain our data. They're in a particular order |
284
|
|
|
|
|
|
|
# because some search engines use two of the variables. |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# NB: This isn't quite finished. If you're finding that the |
287
|
|
|
|
|
|
|
# wrong prefix is being used, please email me and tell me |
288
|
|
|
|
|
|
|
# at pete_sergeant@hotmail.com |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# If you're wondering why they're ordered in this slightly |
291
|
|
|
|
|
|
|
# bizarre and seemingly random order, it's because some |
292
|
|
|
|
|
|
|
# search engines have decided to use more than one of these |
293
|
|
|
|
|
|
|
# variables, and the order these are in hopefully pick the |
294
|
|
|
|
|
|
|
# right one first. |
295
|
|
|
|
|
|
|
|
296
|
0
|
|
|
|
|
|
my @prefix_array = ( |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
'\w*query\w*', # CNET Search, Netscape |
300
|
|
|
|
|
|
|
'\w*search(?!Type)\w*', |
301
|
|
|
|
|
|
|
'\w*term\w*', |
302
|
|
|
|
|
|
|
'ask', # Ask Jeeves |
303
|
|
|
|
|
|
|
'.\w?key.\w?', |
304
|
|
|
|
|
|
|
'palabras', |
305
|
|
|
|
|
|
|
'DTqb1', |
306
|
|
|
|
|
|
|
'request', |
307
|
|
|
|
|
|
|
'ShowMatch', # syndic8 |
308
|
|
|
|
|
|
|
'keywords?', # Snap, overture.com |
309
|
|
|
|
|
|
|
'general', # MetaCrawler, Go2Net |
310
|
|
|
|
|
|
|
'key', # Looksmart |
311
|
|
|
|
|
|
|
'MetaTopic', # AJ |
312
|
|
|
|
|
|
|
'query0', # elf8888.at, thx to http://www.tnl.net/ |
313
|
|
|
|
|
|
|
'queryString', # blogdigger.com |
314
|
|
|
|
|
|
|
'serachfor', # mysearch.com dyslexia ;) |
315
|
|
|
|
|
|
|
'terms', # abcsearch.com |
316
|
|
|
|
|
|
|
'word', # baidu.com |
317
|
|
|
|
|
|
|
'rn', |
318
|
|
|
|
|
|
|
'mt', # MSN, HotBot |
319
|
|
|
|
|
|
|
'qt', # Go, Infoseek, search.com |
320
|
|
|
|
|
|
|
'oq', |
321
|
|
|
|
|
|
|
'dom', # Domainsurfer |
322
|
|
|
|
|
|
|
's', # Excite, blogsphere.us |
323
|
|
|
|
|
|
|
'q', # Altavista, Google, Dogpile, Evreka, Metafind |
324
|
|
|
|
|
|
|
'p', # Yahoo |
325
|
|
|
|
|
|
|
't', |
326
|
|
|
|
|
|
|
'qry', |
327
|
|
|
|
|
|
|
'qkw', # dpxml, msxml |
328
|
|
|
|
|
|
|
'qr', # northernlight.com |
329
|
|
|
|
|
|
|
'qu', |
330
|
|
|
|
|
|
|
'kw', # Sapo |
331
|
|
|
|
|
|
|
'general', |
332
|
|
|
|
|
|
|
'B1', |
333
|
|
|
|
|
|
|
'sc', # Gohip |
334
|
|
|
|
|
|
|
'szukaj', |
335
|
|
|
|
|
|
|
'PA', |
336
|
|
|
|
|
|
|
'MT', # goo.ne.jp |
337
|
|
|
|
|
|
|
'req', # dir.com |
338
|
|
|
|
|
|
|
'k', # galaxy.com |
339
|
|
|
|
|
|
|
'cat', # Dmoz |
340
|
|
|
|
|
|
|
'u', # Google translation |
341
|
|
|
|
|
|
|
'va', # search.yahoo.com |
342
|
|
|
|
|
|
|
'K', # srd.yahoo.com |
343
|
|
|
|
|
|
|
'as_epq' # Google, sometimes. Advanced query maybe? |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
); |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
# Cycle through each prefix and see if it's contained in the |
348
|
|
|
|
|
|
|
# query_string. If it is, we extract the field, clean it, and |
349
|
|
|
|
|
|
|
# return it. Simple. |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
|
352
|
0
|
|
|
|
|
|
for (@prefix_array) { |
353
|
0
|
0
|
|
|
|
|
if ($query_string =~ m/(^|\&)$_=(.+?)(\&|$)/i) { |
354
|
|
|
|
|
|
|
|
355
|
0
|
|
|
|
|
|
my $key_string = $2; |
356
|
|
|
|
|
|
|
|
357
|
0
|
|
|
|
|
|
for ($key_string) { |
358
|
0
|
|
|
|
|
|
tr/+/ /; |
359
|
0
|
|
|
|
|
|
s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
0
|
|
|
|
|
|
|
360
|
0
|
|
|
|
|
|
s/\s+/ /gs; |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
|
363
|
0
|
0
|
|
|
|
|
if ($key_string =~ /\w/) { |
364
|
0
|
|
|
|
|
|
return $key_string; |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# Failing all that, some Search-Engines don't overload the |
372
|
|
|
|
|
|
|
# query_string with values, and just make the query_string |
373
|
|
|
|
|
|
|
# the search terms. The next part looks for that, and returns |
374
|
|
|
|
|
|
|
# the whole query_string (cleaned) if this appears to be the |
375
|
|
|
|
|
|
|
# case. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
|
378
|
0
|
0
|
|
|
|
|
if ($query_string !~ /\=/) { |
379
|
|
|
|
|
|
|
|
380
|
0
|
|
|
|
|
|
for ($query_string) { |
381
|
0
|
|
|
|
|
|
tr/+/ /; |
382
|
0
|
|
|
|
|
|
s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
0
|
|
|
|
|
|
|
383
|
0
|
|
|
|
|
|
s/\s+/ /gs; |
384
|
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
|
|
386
|
0
|
|
|
|
|
|
return $query_string; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
0
|
|
|
|
|
|
return; |
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
1; |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
__END__ |