| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # Sequin v1.1.2 | 
| 2 |  |  |  |  |  |  | # | 
| 3 |  |  |  |  |  |  | # by Peter Sergeant | 
| 4 |  |  |  |  |  |  | # | 
| 5 |  |  |  |  |  |  | # A module for extracting and parsing search engine URLs from | 
| 6 |  |  |  |  |  |  | # server referrer files. Proper usage information is in the | 
| 7 |  |  |  |  |  |  | # README file | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | # Magic Package Stuff | 
| 11 |  |  |  |  |  |  | require 5.005; | 
| 12 | 1 |  |  | 1 |  | 637 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 98 |  | 
| 13 |  |  |  |  |  |  | require Exporter; | 
| 14 |  |  |  |  |  |  | package URI::Sequin; | 
| 15 | 1 |  |  | 1 |  | 5 | use vars qw(@ISA $VERSION @EXPORT_OK %log_types); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 1680 |  | 
| 16 |  |  |  |  |  |  | @ISA = qw(Exporter); | 
| 17 |  |  |  |  |  |  | @EXPORT_OK = qw(se_extract log_extract %log_types key_extract); | 
| 18 |  |  |  |  |  |  | $VERSION = '1.2'; | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | # &log_extract v1.0 | 
| 23 |  |  |  |  |  |  | # =-=-=-=-=-=- =-=- | 
| 24 |  |  |  |  |  |  | # | 
| 25 |  |  |  |  |  |  | # | 
| 26 |  |  |  |  |  |  | # The purpose of this subroutine is to allow raw log files lines to be | 
| 27 |  |  |  |  |  |  | # handled. The subroutine accepts a log line, plus some information on | 
| 28 |  |  |  |  |  |  | # how it should be analysed, and returns a scalar value: the referring | 
| 29 |  |  |  |  |  |  | # URL. | 
| 30 |  |  |  |  |  |  | # | 
| 31 |  |  |  |  |  |  | # The subroutine knows a certain number of log types, and keeps regexs | 
| 32 |  |  |  |  |  |  | # with which to handle these logs in a globally accessable hash below, | 
| 33 |  |  |  |  |  |  | # called '%log_types'. If your log type is not already in the array it | 
| 34 |  |  |  |  |  |  | # can be added and used. | 
| 35 |  |  |  |  |  |  | # | 
| 36 |  |  |  |  |  |  | # Examples: | 
| 37 |  |  |  |  |  |  | # --------- | 
| 38 |  |  |  |  |  |  | # | 
| 39 |  |  |  |  |  |  | # Adding a new regex to %log_types: | 
| 40 |  |  |  |  |  |  | # => $log_types{'MyWebServer'} = '.+? Referer:(.+?) '; | 
| 41 |  |  |  |  |  |  | # | 
| 42 |  |  |  |  |  |  | #	> It's worth pointing out that the subroutine uses $1 straight | 
| 43 |  |  |  |  |  |  | #	> after the match has taken place to get the referrer. Because | 
| 44 |  |  |  |  |  |  | #	> of this, you should make sure the part of string to be taken | 
| 45 |  |  |  |  |  |  | #	> is enclosed in ()'s. If you're still unsure, this is clearly | 
| 46 |  |  |  |  |  |  | #	> demonstrated below, where %log_types is set. | 
| 47 |  |  |  |  |  |  | # | 
| 48 |  |  |  |  |  |  | # Parsing a Log Entry | 
| 49 |  |  |  |  |  |  | # => $referrer = &log_extract($log_line, 'NCSA'); | 
| 50 |  |  |  |  |  |  | # | 
| 51 |  |  |  |  |  |  | #	> As I hope is clear, $log_line is the log-file line that needs | 
| 52 |  |  |  |  |  |  | #	> to be parsed, and 'Apache' refers to the relevant regex below | 
| 53 |  |  |  |  |  |  | #	> in the %log_types hash. | 
| 54 |  |  |  |  |  |  | # | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | %log_types = ( | 
| 57 |  |  |  |  |  |  | # Microsoft IIS 3.0 and 2.0 | 
| 58 |  |  |  |  |  |  | 'IIS1'		=>	'(http:.+?),', | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | # Microsoft IIS4.0 (W3SVC format) | 
| 61 |  |  |  |  |  |  | 'IIS2'		=>	'(http:.+?)$', | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | # NCSA (Apache, Netscape) | 
| 64 |  |  |  |  |  |  | 'NCSA'		=>	'"(http:.+?)"', | 
| 65 |  |  |  |  |  |  |  | 
| 66 |  |  |  |  |  |  | # O'Reilly WebSite format | 
| 67 |  |  |  |  |  |  | 'ORW'		=>	' (http:.+?) ', | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | # General (works for most logtypes) | 
| 70 |  |  |  |  |  |  | 'General'	=>	'(?:\s|"|,|^)(http:.+?)(\s|"|,|$)', | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | ); | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | sub log_extract { | 
| 77 |  |  |  |  |  |  |  | 
| 78 | 0 |  |  | 0 | 1 | 0 | my $log_file_line	=	$_[0]; | 
| 79 | 0 |  | 0 |  |  | 0 | my $log_file_type	=	$_[1] || 'General'; | 
| 80 |  |  |  |  |  |  |  | 
| 81 | 0 |  |  |  |  | 0 | chomp($log_file_line); | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | # Check that the $log_file_type contains a valid regex by using | 
| 84 |  |  |  |  |  |  | # (eval) on it to see if we crash the regex engine, and by also | 
| 85 |  |  |  |  |  |  | # checking if there is a regex in $log_types{$log_file_type} | 
| 86 |  |  |  |  |  |  |  | 
| 87 | 0 |  |  |  |  | 0 | my $re = eval { qr/$log_types{$log_file_type}/ }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 88 | 0 | 0 |  |  |  | 0 | warn "Bad re: '$log_types{$log_file_type}' ($@)\n" if $@; | 
| 89 |  |  |  |  |  |  |  | 
| 90 | 0 | 0 |  |  |  | 0 | unless (defined $log_types{$log_file_type}) { | 
| 91 | 0 |  |  |  |  | 0 | warn "Unknown Logtype - \"$log_file_type\"\n"; | 
| 92 |  |  |  |  |  |  | } | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | # Return what we found | 
| 96 |  |  |  |  |  |  |  | 
| 97 | 0 | 0 |  |  |  | 0 | if ($log_file_line =~ m/$log_types{$log_file_type}/i) { return $1 }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 98 |  |  |  |  |  |  |  | 
| 99 | 0 |  |  |  |  | 0 | return; | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | } | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | # &se_extract v1.1 | 
| 106 |  |  |  |  |  |  | # =-=-=-=-=-= =-=- | 
| 107 |  |  |  |  |  |  | # | 
| 108 |  |  |  |  |  |  | # The purpose of this subroutine is to break down the referring URL in | 
| 109 |  |  |  |  |  |  | # to an array, containing the $search_engine_name and the | 
| 110 |  |  |  |  |  |  | # $search_engine_url. | 
| 111 |  |  |  |  |  |  | # | 
| 112 |  |  |  |  |  |  | # Example: | 
| 113 |  |  |  |  |  |  | # => ($name, $url) = @{&se_extract($url)}; | 
| 114 |  |  |  |  |  |  | # | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | sub se_extract { | 
| 117 |  |  |  |  |  |  |  | 
| 118 | 0 |  |  | 0 | 1 | 0 | my $input_url = $_[0]; | 
| 119 | 0 |  |  |  |  | 0 | chomp($input_url); | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | # Break down the $input_url into two more useful variables, so | 
| 122 |  |  |  |  |  |  | # that we can check if there is information in the query | 
| 123 |  |  |  |  |  |  | # string, and if there is, we just get on with life. | 
| 124 |  |  |  |  |  |  |  | 
| 125 | 0 |  |  |  |  | 0 | my ($location, $query_string) = split(/\?/, $input_url); | 
| 126 | 0 | 0 |  |  |  | 0 | return [] unless $query_string; | 
| 127 |  |  |  |  |  |  |  | 
| 128 | 0 |  |  |  |  | 0 | my $search_engine_name; | 
| 129 |  |  |  |  |  |  | my $search_engine_url; | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | # This is a scary regex. It picks out with suprising accuracy | 
| 132 |  |  |  |  |  |  | # the main part of a URL - the 'MSN' part of: | 
| 133 |  |  |  |  |  |  | # http://biteme15.search.cgi.msn.com.uk/?asdfasdf | 
| 134 |  |  |  |  |  |  |  | 
| 135 | 0 | 0 |  |  |  | 0 | if ($location =~ m!(http://)?(\d+\.\d+\.\d+\.\d+(\:\d+)?)/!) { | 
| 136 | 0 |  |  |  |  | 0 | return ["Unknown (IP)", $2]; | 
| 137 |  |  |  |  |  |  | } | 
| 138 |  |  |  |  |  |  |  | 
| 139 | 0 | 0 |  |  |  | 0 | if ($location =~ m!^(.+?\. | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | ([^\.]+) | 
| 141 |  |  |  |  |  |  | \. | 
| 142 |  |  |  |  |  |  | (com|net|org|int|mil|\w\w| | 
| 143 |  |  |  |  |  |  | (gov|mil|com|net|org|\w\w)\.\w\w | 
| 144 |  |  |  |  |  |  | ) | 
| 145 |  |  |  |  |  |  | (?:/|:\d+/) | 
| 146 |  |  |  |  |  |  | )!x) { | 
| 147 | 0 |  |  |  |  | 0 | $search_engine_url	= $1; | 
| 148 | 0 |  |  |  |  | 0 | $search_engine_name	= "\u$2"; | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | } elsif ($location =~ m!^(http://)?((\w+)\.\w+(\:\d+/?)?)!) { | 
| 151 | 0 |  |  |  |  | 0 | $search_engine_url	= $2; | 
| 152 | 0 |  |  |  |  | 0 | $search_engine_name	= "\u$3"; | 
| 153 |  |  |  |  |  |  | } elsif ($location =~ m!^(http://)?((\w+)([^\.\w:]|(\:\d+/?)?))!) { | 
| 154 | 0 |  |  |  |  | 0 | $search_engine_url 	= $2; | 
| 155 | 0 |  |  |  |  | 0 | $search_engine_name	= "\u$3"; | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | # This has allowed us to quite accurately get the name and URL | 
| 160 |  |  |  |  |  |  | # of any given search-engine. However, in the interests of | 
| 161 |  |  |  |  |  |  | # total accuracy, we have a list of search-engines that we know | 
| 162 |  |  |  |  |  |  | # so we can provide even more information, and make sure it's | 
| 163 |  |  |  |  |  |  | # correct. | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | # Define this list: | 
| 166 |  |  |  |  |  |  |  | 
| 167 | 0 |  |  |  |  | 0 | my @search_engine_array = ( | 
| 168 |  |  |  |  |  |  | ['Altavista',	'http://www.av.com', | 
| 169 |  |  |  |  |  |  | '(altavista|av)'], | 
| 170 |  |  |  |  |  |  | ['HotBot',	'http://www.hotbot.com', | 
| 171 |  |  |  |  |  |  | 'hotbot\.lycos'], | 
| 172 |  |  |  |  |  |  | ['Infoseek',	'http://www.infoseek.com', | 
| 173 |  |  |  |  |  |  | 'infoseek\.go'], | 
| 174 |  |  |  |  |  |  | ['Magellan',	'http://magellan.excite.com', | 
| 175 |  |  |  |  |  |  | 'magellan\.excite'], | 
| 176 |  |  |  |  |  |  | ['Ask Jeeves',	'http://www.aj.com', | 
| 177 |  |  |  |  |  |  | '(aj|askjeeves)'], | 
| 178 |  |  |  |  |  |  | ['CNET Search',	'http://www.search.com', | 
| 179 |  |  |  |  |  |  | '(cnet|search\.com|savysearch)'], | 
| 180 |  |  |  |  |  |  | ); | 
| 181 |  |  |  |  |  |  |  | 
| 182 |  |  |  |  |  |  | # Cycle through the list | 
| 183 |  |  |  |  |  |  |  | 
| 184 | 0 |  |  |  |  | 0 | for (@search_engine_array) { | 
| 185 |  |  |  |  |  |  |  | 
| 186 | 0 |  |  |  |  | 0 | my ($se_name, $se_url, $se_regex) = @{$_}; | 
|  | 0 |  |  |  |  | 0 |  | 
| 187 |  |  |  |  |  |  |  | 
| 188 | 0 | 0 |  |  |  | 0 | if ($location =~ m/$se_regex/) { | 
| 189 | 0 |  |  |  |  | 0 | $search_engine_url = $se_url; | 
| 190 | 0 |  |  |  |  | 0 | $search_engine_name = $se_name; | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | # Return what we know. | 
| 196 |  |  |  |  |  |  | # jm: allow HTTPS search engines too ;) | 
| 197 |  |  |  |  |  |  |  | 
| 198 | 0 | 0 | 0 |  |  | 0 | if (defined $search_engine_url && $search_engine_url !~ m!^https?://!) { | 
| 199 | 0 |  |  |  |  | 0 | $search_engine_url =~ s!^!http://!; | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 0 |  |  |  |  | 0 | return [$search_engine_name, $search_engine_url]; | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | # &key_extract v1.1 | 
| 208 |  |  |  |  |  |  | # =-=-=-=-=-= =-=- | 
| 209 |  |  |  |  |  |  | # | 
| 210 |  |  |  |  |  |  | # The purpose of this subroutine is to break down the referring URL in | 
| 211 |  |  |  |  |  |  | # to a string containing the search terms. | 
| 212 |  |  |  |  |  |  | # | 
| 213 |  |  |  |  |  |  | # Example: | 
| 214 |  |  |  |  |  |  | # => $terms = &key_extract($url); | 
| 215 |  |  |  |  |  |  | # | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | sub key_extract { | 
| 218 |  |  |  |  |  |  |  | 
| 219 | 1 |  |  | 1 | 1 | 47 | my $input_url = $_[0]; | 
| 220 |  |  |  |  |  |  |  | 
| 221 | 1 |  |  |  |  | 5 | chomp($input_url); | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | # Break down the $input_url in to two more useful variables | 
| 224 |  |  |  |  |  |  |  | 
| 225 | 1 |  |  |  |  | 5 | my ($location, $query_string) = split(/\?/, $input_url); | 
| 226 | 1 | 50 |  |  |  | 4 | return unless $query_string; | 
| 227 |  |  |  |  |  |  |  | 
| 228 |  |  |  |  |  |  | # Google Caching ... What a bitch... This will deal with it, | 
| 229 |  |  |  |  |  |  | # how Google currently works... | 
| 230 |  |  |  |  |  |  |  | 
| 231 | 1 | 50 |  |  |  | 6 | if ($query_string =~ m!q=cache\:.+/(.+?)&!i) { | 
| 232 | 0 |  |  |  |  | 0 | $_ = $1; | 
| 233 | 0 |  |  |  |  | 0 | tr/+/ /; | 
| 234 | 0 |  |  |  |  | 0 | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; | 
|  | 0 |  |  |  |  | 0 |  | 
| 235 | 0 |  |  |  |  | 0 | s/\s+/ /gs; | 
| 236 | 0 |  |  |  |  | 0 | s/^\s+//g; | 
| 237 | 0 |  |  |  |  | 0 | s/\s+$//g; | 
| 238 | 0 |  |  |  |  | 0 | return $_; | 
| 239 |  |  |  |  |  |  | } | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | # There are a number of ways in which we now try and determine | 
| 242 |  |  |  |  |  |  | # what the search terms are. The first is quite clever, IMHO. | 
| 243 |  |  |  |  |  |  | # We search for spaces in any of the submitted fields that | 
| 244 |  |  |  |  |  |  | # isn't called 'next' or 'submit' or 'col' or 'btnG' (blame | 
| 245 |  |  |  |  |  |  | # google). | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  |  | 
| 249 | 1 | 50 |  |  |  | 12 | if ($query_string =~ m/(? | 
| 250 |  |  |  |  |  |  | (? | 
| 251 |  |  |  |  |  |  | ([^&]*(?:\+|%2b)[^&]*)/xi) { | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  |  | 
| 254 | 1 |  |  |  |  | 5 | my $key_string = $1; | 
| 255 | 1 |  |  |  |  | 3 | my $false = 0; | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  | # Some search engines are determined to try and fool us | 
| 258 |  |  |  |  |  |  | # :). Therefore, we kill some pseudo-matches containing | 
| 259 |  |  |  |  |  |  | # %07C ( a pipe: | ) and %02C, by setting the $false | 
| 260 |  |  |  |  |  |  | # scalar to a positive value, that overides a little | 
| 261 |  |  |  |  |  |  | # later on. | 
| 262 |  |  |  |  |  |  |  | 
| 263 | 1 | 50 |  |  |  | 5 | $false++ if $key_string =~ m/(%02|%7C%7C)/; | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | # Clean our information from those nasty escape | 
| 266 |  |  |  |  |  |  | # sequences. | 
| 267 |  |  |  |  |  |  |  | 
| 268 | 1 |  |  |  |  | 3 | for ($key_string) { | 
| 269 | 1 |  |  |  |  | 9 | tr/+/ /; | 
| 270 | 1 |  |  |  |  | 3 | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; | 
|  | 0 |  |  |  |  | 0 |  | 
| 271 | 1 |  |  |  |  | 8 | s/\s+/ /gs; | 
| 272 |  |  |  |  |  |  | } | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | # Unless we decided to abort earlier, return the | 
| 275 |  |  |  |  |  |  | # field that we found. | 
| 276 |  |  |  |  |  |  |  | 
| 277 | 1 | 50 |  |  |  | 7 | return $key_string unless ($false); | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | } | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | # Okay. If that failed, then we need to take a closer look. | 
| 282 |  |  |  |  |  |  | # In the array below are many many possible prefixes for a term | 
| 283 |  |  |  |  |  |  | # that might contain our data. They're in a particular order | 
| 284 |  |  |  |  |  |  | # because some search engines use two of the variables. | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  | # NB: This isn't quite finished. If you're finding that the | 
| 287 |  |  |  |  |  |  | #     wrong prefix is being used, please email me and tell me | 
| 288 |  |  |  |  |  |  | #     at pete_sergeant@hotmail.com | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  | # If you're wondering why they're ordered in this slightly | 
| 291 |  |  |  |  |  |  | # bizarre and seemingly random order, it's because some | 
| 292 |  |  |  |  |  |  | # search engines have decided to use more than one of these | 
| 293 |  |  |  |  |  |  | # variables, and the order these are in hopefully pick the | 
| 294 |  |  |  |  |  |  | # right one first. | 
| 295 |  |  |  |  |  |  |  | 
| 296 | 0 |  |  |  |  |  | my @prefix_array = ( | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  | '\w*query\w*',   # CNET Search, Netscape | 
| 300 |  |  |  |  |  |  | '\w*search(?!Type)\w*', | 
| 301 |  |  |  |  |  |  | '\w*term\w*', | 
| 302 |  |  |  |  |  |  | 'ask',             # Ask Jeeves | 
| 303 |  |  |  |  |  |  | '.\w?key.\w?', | 
| 304 |  |  |  |  |  |  | 'palabras', | 
| 305 |  |  |  |  |  |  | 'DTqb1', | 
| 306 |  |  |  |  |  |  | 'request', | 
| 307 |  |  |  |  |  |  | 'ShowMatch',    # syndic8 | 
| 308 |  |  |  |  |  |  | 'keywords?',  # Snap, overture.com | 
| 309 |  |  |  |  |  |  | 'general',  # MetaCrawler, Go2Net | 
| 310 |  |  |  |  |  |  | 'key',      # Looksmart | 
| 311 |  |  |  |  |  |  | 'MetaTopic', # AJ | 
| 312 |  |  |  |  |  |  | 'query0', # elf8888.at, thx to http://www.tnl.net/ | 
| 313 |  |  |  |  |  |  | 'queryString', # blogdigger.com | 
| 314 |  |  |  |  |  |  | 'serachfor', # mysearch.com dyslexia ;) | 
| 315 |  |  |  |  |  |  | 'terms', # abcsearch.com | 
| 316 |  |  |  |  |  |  | 'word', # baidu.com | 
| 317 |  |  |  |  |  |  | 'rn', | 
| 318 |  |  |  |  |  |  | 'mt',  # MSN, HotBot | 
| 319 |  |  |  |  |  |  | 'qt',  # Go, Infoseek, search.com | 
| 320 |  |  |  |  |  |  | 'oq', | 
| 321 |  |  |  |  |  |  | 'dom', # Domainsurfer | 
| 322 |  |  |  |  |  |  | 's',   # Excite, blogsphere.us | 
| 323 |  |  |  |  |  |  | 'q',   # Altavista, Google, Dogpile, Evreka, Metafind | 
| 324 |  |  |  |  |  |  | 'p',   # Yahoo | 
| 325 |  |  |  |  |  |  | 't', | 
| 326 |  |  |  |  |  |  | 'qry', | 
| 327 |  |  |  |  |  |  | 'qkw', # dpxml, msxml | 
| 328 |  |  |  |  |  |  | 'qr',  # northernlight.com | 
| 329 |  |  |  |  |  |  | 'qu', | 
| 330 |  |  |  |  |  |  | 'kw',  # Sapo | 
| 331 |  |  |  |  |  |  | 'general', | 
| 332 |  |  |  |  |  |  | 'B1', | 
| 333 |  |  |  |  |  |  | 'sc',  # Gohip | 
| 334 |  |  |  |  |  |  | 'szukaj', | 
| 335 |  |  |  |  |  |  | 'PA', | 
| 336 |  |  |  |  |  |  | 'MT',  # goo.ne.jp | 
| 337 |  |  |  |  |  |  | 'req', # dir.com | 
| 338 |  |  |  |  |  |  | 'k',   # galaxy.com | 
| 339 |  |  |  |  |  |  | 'cat', # Dmoz | 
| 340 |  |  |  |  |  |  | 'u',   # Google translation | 
| 341 |  |  |  |  |  |  | 'va',  # search.yahoo.com | 
| 342 |  |  |  |  |  |  | 'K',   # srd.yahoo.com | 
| 343 |  |  |  |  |  |  | 'as_epq' # Google, sometimes. Advanced query maybe? | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | ); | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | # Cycle through each prefix and see if it's contained in the | 
| 348 |  |  |  |  |  |  | # query_string. If it is, we extract the field, clean it, and | 
| 349 |  |  |  |  |  |  | # return it. Simple. | 
| 350 |  |  |  |  |  |  |  | 
| 351 |  |  |  |  |  |  |  | 
| 352 | 0 |  |  |  |  |  | for (@prefix_array) { | 
| 353 | 0 | 0 |  |  |  |  | if ($query_string =~ m/(^|\&)$_=(.+?)(\&|$)/i) { | 
| 354 |  |  |  |  |  |  |  | 
| 355 | 0 |  |  |  |  |  | my $key_string = $2; | 
| 356 |  |  |  |  |  |  |  | 
| 357 | 0 |  |  |  |  |  | for ($key_string) { | 
| 358 | 0 |  |  |  |  |  | tr/+/ /; | 
| 359 | 0 |  |  |  |  |  | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; | 
|  | 0 |  |  |  |  |  |  | 
| 360 | 0 |  |  |  |  |  | s/\s+/ /gs; | 
| 361 |  |  |  |  |  |  | } | 
| 362 |  |  |  |  |  |  |  | 
| 363 | 0 | 0 |  |  |  |  | if ($key_string =~ /\w/) { | 
| 364 | 0 |  |  |  |  |  | return $key_string; | 
| 365 |  |  |  |  |  |  | } | 
| 366 |  |  |  |  |  |  |  | 
| 367 |  |  |  |  |  |  | } | 
| 368 |  |  |  |  |  |  |  | 
| 369 |  |  |  |  |  |  | } | 
| 370 |  |  |  |  |  |  |  | 
| 371 |  |  |  |  |  |  | # Failing all that, some Search-Engines don't overload the | 
| 372 |  |  |  |  |  |  | # query_string with values, and just make the query_string | 
| 373 |  |  |  |  |  |  | # the search terms. The next part looks for that, and returns | 
| 374 |  |  |  |  |  |  | # the whole query_string (cleaned) if this appears to be the | 
| 375 |  |  |  |  |  |  | # case. | 
| 376 |  |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  |  | 
| 378 | 0 | 0 |  |  |  |  | if ($query_string !~ /\=/) { | 
| 379 |  |  |  |  |  |  |  | 
| 380 | 0 |  |  |  |  |  | for ($query_string) { | 
| 381 | 0 |  |  |  |  |  | tr/+/ /; | 
| 382 | 0 |  |  |  |  |  | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; | 
|  | 0 |  |  |  |  |  |  | 
| 383 | 0 |  |  |  |  |  | s/\s+/ /gs; | 
| 384 |  |  |  |  |  |  | } | 
| 385 |  |  |  |  |  |  |  | 
| 386 | 0 |  |  |  |  |  | return $query_string; | 
| 387 |  |  |  |  |  |  | } | 
| 388 |  |  |  |  |  |  |  | 
| 389 |  |  |  |  |  |  |  | 
| 390 | 0 |  |  |  |  |  | return; | 
| 391 |  |  |  |  |  |  | } | 
| 392 |  |  |  |  |  |  |  | 
| 393 |  |  |  |  |  |  | 1; | 
| 394 |  |  |  |  |  |  |  | 
| 395 |  |  |  |  |  |  | __END__ |