line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package URI::ParseSearchString; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require Exporter; |
4
|
|
|
|
|
|
|
@ISA = (Exporter); |
5
|
|
|
|
|
|
|
@EXPORT = ( qw (parse_search_string findEngine se_host se_name se_term) ); |
6
|
|
|
|
|
|
|
|
7
|
3
|
|
|
3
|
|
30280
|
use warnings; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
116
|
|
8
|
3
|
|
|
3
|
|
19
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
100
|
|
9
|
3
|
|
|
3
|
|
2832
|
use URI; |
|
3
|
|
|
|
|
18044
|
|
|
3
|
|
|
|
|
109
|
|
10
|
3
|
|
|
3
|
|
34686
|
use Data::Dumper; |
|
3
|
|
|
|
|
23182
|
|
|
3
|
|
|
|
|
12954
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=encoding utf8 |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
URI::ParseSearchString - parse search engine referrer URLs and extract keywords used |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 VERSION |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Version 3.51 (Diablo 3 edition) |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=cut |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our $VERSION = '3.51'; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use URI::ParseSearchString ; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my $uparse = URI::ParseSearchString->new(); |
31
|
|
|
|
|
|
|
my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search'; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $query_terms = $uparse->se_term( $ref ); |
34
|
|
|
|
|
|
|
my $canonical = $uparse->se_name( $ref ); |
35
|
|
|
|
|
|
|
my $hostname = $uparse->se_host( $ref ); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 FUNCTIONS |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head2 new |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Creates a new instance object of the module. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
my $uparse = URI::ParseSearchString->new() ; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=cut |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
my $RH_LOOKUPS = { |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
'answers.yahoo.com' => { name => 'Yahoo Answers', q=>'p' }, |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
'sapo.pt' => { name => 'Pesquisa SAPO', q => 'q'}, |
52
|
|
|
|
|
|
|
'iol.pt' => { name => 'Pesquisa Iol', q => 'q'}, |
53
|
|
|
|
|
|
|
'pesquisa.clix.pt' => { name => 'Pesquisa Clix', q => 'question'}, |
54
|
|
|
|
|
|
|
'aeiou.pt' => { name => 'Aeiou', q => 'q'}, |
55
|
|
|
|
|
|
|
'cuil.pt' => { name => 'Cuil PT', q => 'q' }, |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
'fotos.sapo.pt' => { name => 'SAPO fotos', q => 'word'}, |
59
|
|
|
|
|
|
|
'videos.sapo.pt' => { name => 'SAPO videos', q => 'word'}, |
60
|
|
|
|
|
|
|
'sabores.sapo.pt' => { name => 'SAPO sabores', q => 'cxSearch'}, |
61
|
|
|
|
|
|
|
'jn.sapo.pt' => { name => 'Jornal Noticias', q => 'Pesquisa'}, |
62
|
|
|
|
|
|
|
'dn.sapo.pt' => { name => 'Diario Noticias', q => 'Pesquisa'}, |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
'rtp.pt' => { name => 'Rtp', q => 'search'}, |
66
|
|
|
|
|
|
|
'record.pt' => { name => 'Jornal Record', q => 'q'}, |
67
|
|
|
|
|
|
|
'correiodamanha.pt' => { name => 'Correio da Manha', q => 'pesquisa'}, |
68
|
|
|
|
|
|
|
'correiomanha.pt' => { name => 'Correio Manha', q => 'pesquisa'}, |
69
|
|
|
|
|
|
|
'publico.clix.pt' => { name => 'Publico', q => 'q'}, |
70
|
|
|
|
|
|
|
'xl.pt' => { name => 'XL', q => 'pesquisa'}, |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
'abacho.com' => { name => 'Abacho', q => 'q'}, |
73
|
|
|
|
|
|
|
'alice.it' => { name => 'Alice.it', q => 'qs' }, |
74
|
|
|
|
|
|
|
'altavista.com' => { name => 'Altavista', q => 'q' }, |
75
|
|
|
|
|
|
|
'aolsearch.aol.com' => { name => 'AOL Search', q => 'query' }, |
76
|
|
|
|
|
|
|
'as.starware.com' => { name => 'Starware', q => 'qry' }, |
77
|
|
|
|
|
|
|
'blogs.icerocket.com' => { name => 'IceRocket', q => 'q' }, |
78
|
|
|
|
|
|
|
'blogsearch.google.com' => { name => 'Google Blogsearch', q => 'q' }, |
79
|
|
|
|
|
|
|
'busca.orange.es' => { name => 'Orange ES', q => 'buscar' }, |
80
|
|
|
|
|
|
|
'buscador.lycos.es' => { name => 'Lycos ES', q => 'query' }, |
81
|
|
|
|
|
|
|
'buscador.terra.es' => { name => 'Terra ES', q => 'query' }, |
82
|
|
|
|
|
|
|
'buscar.ozu.es' => { name => 'Ozu ES', q => 'q' }, |
83
|
|
|
|
|
|
|
'categorico.it' => { name => 'Categorico IT', q => 'q' }, |
84
|
|
|
|
|
|
|
'cuil.com' => { name => 'Cuil', q => 'q' }, |
85
|
|
|
|
|
|
|
'clusty.com' => { name => 'Clusty', q => 'query' }, |
86
|
|
|
|
|
|
|
'excite.com' => { name => 'Excite', q => 'q' }, |
87
|
|
|
|
|
|
|
'excite.it' => { name => 'Excite IT', q => 'q' }, |
88
|
|
|
|
|
|
|
'fastweb.it' => { name => 'Fastweb IT', q => 'q' }, |
89
|
|
|
|
|
|
|
'fastbrowsersearch.com' => { name => 'Fastbrowsersearch', q=> 'q' }, |
90
|
|
|
|
|
|
|
'godado.com' => { name => 'Godado', q => 'key' }, |
91
|
|
|
|
|
|
|
'godado.it' => { name => 'Godado (IT)', q => 'key' }, |
92
|
|
|
|
|
|
|
'gps.virgin.net' => { name => 'Virgin Search', q => 'q' }, |
93
|
|
|
|
|
|
|
'ilmotore.com' => { name => 'ilMotore', q => 'query' }, |
94
|
|
|
|
|
|
|
'ithaki.net' => { name => 'Ithaki', q => 'query' }, |
95
|
|
|
|
|
|
|
'kataweb.it' => { name => 'Kataweb IT', q => 'q' }, |
96
|
|
|
|
|
|
|
'libero.it' => { name => 'Libero IT', q => 'query' }, |
97
|
|
|
|
|
|
|
'lycos.it' => { name => 'Lycos IT', q => 'query' }, |
98
|
|
|
|
|
|
|
'search.aol.co.uk' => { name => 'AOL UK', q => 'query' }, |
99
|
|
|
|
|
|
|
'search.arabia.msn.com' => { name => 'MSN Arabia', q => 'q' }, |
100
|
|
|
|
|
|
|
'search.bbc.co.uk' => { name => 'BBC Search', q => 'q' }, |
101
|
|
|
|
|
|
|
'search.conduit.com' => { name => 'Conduit', q => 'q' }, |
102
|
|
|
|
|
|
|
'search.icq.com' => { name => 'ICQ dot com', q => 'q' }, |
103
|
|
|
|
|
|
|
'search.live.com' => { name => 'Live.com', q => 'q' }, |
104
|
|
|
|
|
|
|
'search.lycos.co.uk' => { name => 'Lycos UK', q => 'query' }, |
105
|
|
|
|
|
|
|
'search.lycos.com' => { name => 'Lycos', q => 'query' }, |
106
|
|
|
|
|
|
|
'search.msn.co.uk' => { name => 'MSN UK', q => 'q' }, |
107
|
|
|
|
|
|
|
'search.msn.com' => { name => 'MSN', q => 'q' }, |
108
|
|
|
|
|
|
|
'search.myway.com' => { name => 'MyWay', q => 'searchfor' }, |
109
|
|
|
|
|
|
|
'search.mywebsearch.com' => { name => 'My Web Search', q => 'searchfor' }, |
110
|
|
|
|
|
|
|
'search.ntlworld.com' => { name => 'NTLWorld', q => 'q' }, |
111
|
|
|
|
|
|
|
'search.orange.co.uk' => { name => 'Orange Search', q => 'q' }, |
112
|
|
|
|
|
|
|
'search.prodigy.msn.com' => { name => 'MSN Prodigy', q => 'q' }, |
113
|
|
|
|
|
|
|
'search.sweetim.com' => { name => 'Sweetim', q => 'q' }, |
114
|
|
|
|
|
|
|
'search.virginmedia.com' => { name => 'VirginMedia', q => 'q' }, |
115
|
|
|
|
|
|
|
'search.yahoo.co.jp' => { name => 'Yahoo Japan', q => 'p' }, |
116
|
|
|
|
|
|
|
'search.yahoo.com' => { name => 'Yahoo!', q => 'p' }, |
117
|
|
|
|
|
|
|
'search.yahoo.jp' => { name => 'Yahoo! Japan', q => 'p' }, |
118
|
|
|
|
|
|
|
'simpatico.ws' => { name => 'Simpatico IT', q => 'query' }, |
119
|
|
|
|
|
|
|
'soso.com' => { name => 'Soso', q => 'w' }, |
120
|
|
|
|
|
|
|
'suche.fireball.de' => { name => 'Fireball DE', q => 'query' }, |
121
|
|
|
|
|
|
|
'suche.web.de' => { name => 'Suche DE', q => 'su' }, |
122
|
|
|
|
|
|
|
'suche.t-online.de' => { name => 'T-Online', q => 'q' }, |
123
|
|
|
|
|
|
|
'thespider.it' => { name => 'TheSpider IT', q => 'q' }, |
124
|
|
|
|
|
|
|
'uk.altavista.com' => { name => 'Altavista UK', q => 'q' }, |
125
|
|
|
|
|
|
|
'uk.ask.com' => { name => 'Ask UK', q => 'q' }, |
126
|
|
|
|
|
|
|
'uk.search.yahoo.com' => { name => 'Yahoo! UK', q => 'p' }, |
127
|
|
|
|
|
|
|
'alltheweb.com' => { name => 'AllTheWeb', q => 'q' }, |
128
|
|
|
|
|
|
|
'ask.com' => { name => 'Ask dot com', q => 'q' }, |
129
|
|
|
|
|
|
|
'blueyonder.co.uk' => { name => 'Blueyonder', q => 'q' }, |
130
|
|
|
|
|
|
|
'feedster.com' => { name => 'Feedster', q => 'q' }, |
131
|
|
|
|
|
|
|
'google.ad' => { name => 'Google Andorra',q => 'q' }, |
132
|
|
|
|
|
|
|
'google.ae' => { name => 'Google United Arab Emirates', q => 'q' }, |
133
|
|
|
|
|
|
|
'google.af' => { name => 'Google Afghanistan', q => 'q' }, |
134
|
|
|
|
|
|
|
'google.ag' => { name => 'Google Antiqua and Barbuda', q => 'q' }, |
135
|
|
|
|
|
|
|
'google.am' => { name => 'Google Armenia', q => 'q' }, |
136
|
|
|
|
|
|
|
'google.as' => { name => 'Google American Samoa', q => 'q' }, |
137
|
|
|
|
|
|
|
'google.at' => { name => 'Google Austria', q => 'q' }, |
138
|
|
|
|
|
|
|
'google.az' => { name => 'Google Azerbaijan', q => 'q' }, |
139
|
|
|
|
|
|
|
'google.ba' => { name => 'Google Bosnia and Herzegovina', q => 'q' }, |
140
|
|
|
|
|
|
|
'google.be' => { name => 'Google Belgium', q => 'q' }, |
141
|
|
|
|
|
|
|
'google.bg' => { name => 'Google Bulgaria',q => 'q' }, |
142
|
|
|
|
|
|
|
'google.bi' => { name => 'Google Burundi', q => 'q' }, |
143
|
|
|
|
|
|
|
'google.biz' => { name => 'Google dot biz', q => 'q' }, |
144
|
|
|
|
|
|
|
'google.bo' => { name => 'Google Bolivia', q => 'q' }, |
145
|
|
|
|
|
|
|
'google.bs' => { name => 'Google Bahamas', q => 'q' }, |
146
|
|
|
|
|
|
|
'google.bz' => { name => 'Google Belize', q => 'q' }, |
147
|
|
|
|
|
|
|
'google.ca' => { name => 'Google Canada', q => 'q' }, |
148
|
|
|
|
|
|
|
'google.cc' => { name => 'Google Cocos Islands', q => 'q' }, |
149
|
|
|
|
|
|
|
'google.cd' => { name => 'Google Dem Rep of Congo', q => 'q' }, |
150
|
|
|
|
|
|
|
'google.cg' => { name => 'Google Rep of Congo', q => 'q' }, |
151
|
|
|
|
|
|
|
'google.ch' => { name => 'Google Switzerland', q => 'q' }, |
152
|
|
|
|
|
|
|
'google.ci' => { name => 'Google Cote dIvoire', q => 'q' }, |
153
|
|
|
|
|
|
|
'google.cl' => { name => 'Google Chile', q => 'q' }, |
154
|
|
|
|
|
|
|
'google.cn' => { name => 'Google China', q => 'q' }, |
155
|
|
|
|
|
|
|
'google.co.at' => { name => 'Google Austria', q => 'q' }, |
156
|
|
|
|
|
|
|
'google.co.bi' => { name => 'Google Burundi', q => 'q' }, |
157
|
|
|
|
|
|
|
'google.co.bw' => { name => 'Google Botswana', q => 'q' }, |
158
|
|
|
|
|
|
|
'google.co.ci' => { name => 'Google Ivory Coast', q => 'q' }, |
159
|
|
|
|
|
|
|
'google.co.ck' => { name => 'Google Cook Islands', q => 'q' }, |
160
|
|
|
|
|
|
|
'google.co.cr' => { name => 'Google Costa Rica', q => 'q' }, |
161
|
|
|
|
|
|
|
'google.co.gg' => { name => 'Google Guernsey', q => 'q' }, |
162
|
|
|
|
|
|
|
'google.co.gl' => { name => 'Google Greenland', q => 'q' }, |
163
|
|
|
|
|
|
|
'google.co.gy' => { name => 'Google Guyana', q => 'q' }, |
164
|
|
|
|
|
|
|
'google.co.hu' => { name => 'Google Hungary', q => 'q' }, |
165
|
|
|
|
|
|
|
'google.co.id' => { name => 'Google Indonesia', q => 'q' }, |
166
|
|
|
|
|
|
|
'google.co.il' => { name => 'Google Israel', q => 'q' }, |
167
|
|
|
|
|
|
|
'google.co.im' => { name => 'Google Isle of Man', q => 'q' }, |
168
|
|
|
|
|
|
|
'google.co.in' => { name => 'Google India', q => 'q' }, |
169
|
|
|
|
|
|
|
'google.co.it' => { name => 'Google Italy', q => 'q' }, |
170
|
|
|
|
|
|
|
'google.co.je' => { name => 'Google Jersey', q => 'q' }, |
171
|
|
|
|
|
|
|
'google.co.jp' => { name => 'Google Japan', q => 'q' }, |
172
|
|
|
|
|
|
|
'google.co.ke' => { name => 'Google Kenya', q => 'q' }, |
173
|
|
|
|
|
|
|
'google.co.kr' => { name => 'Google South Korea', q => 'q' }, |
174
|
|
|
|
|
|
|
'google.co.ls' => { name => 'Google Lesotho', q => 'q' }, |
175
|
|
|
|
|
|
|
'google.co.ma' => { name => 'Google Morocco', q => 'q' }, |
176
|
|
|
|
|
|
|
'google.co.mu' => { name => 'Google Mauritius', q => 'q' }, |
177
|
|
|
|
|
|
|
'google.co.mw' => { name => 'Google Malawi', q => 'q' }, |
178
|
|
|
|
|
|
|
'google.co.nz' => { name => 'Google New Zeland', q => 'q' }, |
179
|
|
|
|
|
|
|
'google.co.pn' => { name => 'Google Pitcairn Islands', q => 'q' }, |
180
|
|
|
|
|
|
|
'google.co.th' => { name => 'Google Thailand', q => 'q' }, |
181
|
|
|
|
|
|
|
'google.co.tt' => { name => 'Google Trinidad and Tobago', q => 'q' }, |
182
|
|
|
|
|
|
|
'google.co.ug' => { name => 'Google Uganda', q => 'q' }, |
183
|
|
|
|
|
|
|
'google.co.uk' => { name => 'Google UK', q => 'q' }, |
184
|
|
|
|
|
|
|
'google.co.uz' => { name => 'Google Uzbekistan', q => 'q' }, |
185
|
|
|
|
|
|
|
'google.co.ve' => { name => 'Google Venezuela', q => 'q' }, |
186
|
|
|
|
|
|
|
'google.co.vi' => { name => 'Google US Virgin Islands', q => 'q' }, |
187
|
|
|
|
|
|
|
'google.co.za' => { name => 'Google South Africa',q => 'q' }, |
188
|
|
|
|
|
|
|
'google.co.zm' => { name => 'Google Zambia', q => 'q' }, |
189
|
|
|
|
|
|
|
'google.co.zw' => { name => 'Google Zimbabwe', q => 'q' }, |
190
|
|
|
|
|
|
|
'google.com' => { name => 'Google', q => 'q' }, |
191
|
|
|
|
|
|
|
'google.com.af' => { name => 'Google Afghanistan', q => 'q' }, |
192
|
|
|
|
|
|
|
'google.com.ag' => { name => 'Google Antiqua and Barbuda', q => 'q' }, |
193
|
|
|
|
|
|
|
'google.com.ai' => { name => 'Google Anguilla', q => 'q' }, |
194
|
|
|
|
|
|
|
'google.com.ar' => { name => 'Google Argentina', q => 'q' }, |
195
|
|
|
|
|
|
|
'google.com.au' => { name => 'Google Australia', q => 'q' }, |
196
|
|
|
|
|
|
|
'google.com.az' => { name => 'Google Azerbaijan', q => 'q' }, |
197
|
|
|
|
|
|
|
'google.com.bd' => { name => 'Google Bangladesh', q => 'q' }, |
198
|
|
|
|
|
|
|
'google.com.bh' => { name => 'Google Bahrain', q => 'q' }, |
199
|
|
|
|
|
|
|
'google.com.bi' => { name => 'Google Burundi', q => 'q' }, |
200
|
|
|
|
|
|
|
'google.com.bn' => { name => 'Google Brunei Darussalam', q => 'q' }, |
201
|
|
|
|
|
|
|
'google.com.bo' => { name => 'Google Bolivia', q => 'q' }, |
202
|
|
|
|
|
|
|
'google.com.br' => { name => 'Google Brazil', q => 'q' }, |
203
|
|
|
|
|
|
|
'google.com.bs' => { name => 'Google Bahamas', q => 'q' }, |
204
|
|
|
|
|
|
|
'google.com.bz' => { name => 'Google Belize', q => 'q' }, |
205
|
|
|
|
|
|
|
'google.com.cn' => { name => 'Google China', q => 'q' }, |
206
|
|
|
|
|
|
|
'google.com.co' => { name => 'Google', q => 'q' }, |
207
|
|
|
|
|
|
|
'google.com.cu' => { name => 'Google Cuba', q => 'q' }, |
208
|
|
|
|
|
|
|
'google.com.do' => { name => 'Google Dominican Rep', q => 'q' }, |
209
|
|
|
|
|
|
|
'google.com.ec' => { name => 'Google Ecuador', q => 'q' }, |
210
|
|
|
|
|
|
|
'google.com.eg' => { name => 'Google Egypt', q => 'q' }, |
211
|
|
|
|
|
|
|
'google.com.et' => { name => 'Google Ethiopia', q => 'q' }, |
212
|
|
|
|
|
|
|
'google.com.fj' => { name => 'Google Fiji', q => 'q' }, |
213
|
|
|
|
|
|
|
'google.com.ge' => { name => 'Google Georgia', q => 'q' }, |
214
|
|
|
|
|
|
|
'google.com.gh' => { name => 'Google Ghana', q => 'q' }, |
215
|
|
|
|
|
|
|
'google.com.gi' => { name => 'Google Gibraltar', q => 'q' }, |
216
|
|
|
|
|
|
|
'google.com.gl' => { name => 'Google Greenland', q => 'q' }, |
217
|
|
|
|
|
|
|
'google.com.gp' => { name => 'Google Guadeloupe', q => 'q' }, |
218
|
|
|
|
|
|
|
'google.com.gr' => { name => 'Google Greece', q => 'q' }, |
219
|
|
|
|
|
|
|
'google.com.gt' => { name => 'Google Guatemala', q => 'q' }, |
220
|
|
|
|
|
|
|
'google.com.gy' => { name => 'Google Guyana', q => 'q' }, |
221
|
|
|
|
|
|
|
'google.com.hk' => { name => 'Google Hong Kong', q => 'q' }, |
222
|
|
|
|
|
|
|
'google.com.hn' => { name => 'Google Honduras', q => 'q' }, |
223
|
|
|
|
|
|
|
'google.com.hr' => { name => 'Google Croatia', q => 'q' }, |
224
|
|
|
|
|
|
|
'google.com.jm' => { name => 'Google Jamaica', q => 'q' }, |
225
|
|
|
|
|
|
|
'google.com.jo' => { name => 'Google Jordan', q => 'q' }, |
226
|
|
|
|
|
|
|
'google.com.kg' => { name => 'Google Kyrgyzstan', q => 'q' }, |
227
|
|
|
|
|
|
|
'google.com.kh' => { name => 'Google Cambodia', q => 'q' }, |
228
|
|
|
|
|
|
|
'google.com.ki' => { name => 'Google Kiribati', q => 'q' }, |
229
|
|
|
|
|
|
|
'google.com.kz' => { name => 'Google Kazakhstan', q => 'q' }, |
230
|
|
|
|
|
|
|
'google.com.lk' => { name => 'Google Sri Lanka', q => 'q' }, |
231
|
|
|
|
|
|
|
'google.com.lv' => { name => 'Google Latvia', q => 'q' }, |
232
|
|
|
|
|
|
|
'google.com.ly' => { name => 'Google Libya', q => 'q' }, |
233
|
|
|
|
|
|
|
'google.com.mt' => { name => 'Google Malta', q => 'q' }, |
234
|
|
|
|
|
|
|
'google.com.mu' => { name => 'Google Mauritius', q => 'q' }, |
235
|
|
|
|
|
|
|
'google.com.mw' => { name => 'Google Malawi', q => 'q' }, |
236
|
|
|
|
|
|
|
'google.com.mx' => { name => 'Google Mexico', q => 'q' }, |
237
|
|
|
|
|
|
|
'google.com.my' => { name => 'Google Malaysia', q => 'q' }, |
238
|
|
|
|
|
|
|
'google.com.na' => { name => 'Google Namibia', q => 'q' }, |
239
|
|
|
|
|
|
|
'google.com.nf' => { name => 'Google Norfolk Island', q => 'q' }, |
240
|
|
|
|
|
|
|
'google.com.ng' => { name => 'Google Nigeria', q => 'q' }, |
241
|
|
|
|
|
|
|
'google.com.ni' => { name => 'Google Nicaragua', q => 'q' }, |
242
|
|
|
|
|
|
|
'google.com.np' => { name => 'Google Nepal', q => 'q' }, |
243
|
|
|
|
|
|
|
'google.com.nr' => { name => 'Google Nauru', q => 'q' }, |
244
|
|
|
|
|
|
|
'google.com.om' => { name => 'Google Oman', q => 'q' }, |
245
|
|
|
|
|
|
|
'google.com.pa' => { name => 'Google Panama', q => 'q' }, |
246
|
|
|
|
|
|
|
'google.com.pe' => { name => 'Google Peru', q => 'q' }, |
247
|
|
|
|
|
|
|
'google.com.ph' => { name => 'Google Philipines', q => 'q' }, |
248
|
|
|
|
|
|
|
'google.com.pk' => { name => 'Google Pakistan', q => 'q' }, |
249
|
|
|
|
|
|
|
'google.com.pl' => { name => 'Google Poland', q => 'q' }, |
250
|
|
|
|
|
|
|
'google.com.pr' => { name => 'Google Puerto Rico', q => 'q' }, |
251
|
|
|
|
|
|
|
'google.com.pt' => { name => 'Google Portugal', q => 'q' }, |
252
|
|
|
|
|
|
|
'google.com.py' => { name => 'Google Paraguay', q => 'q' }, |
253
|
|
|
|
|
|
|
'google.com.qa' => { name => 'Google', q => 'q' }, |
254
|
|
|
|
|
|
|
'google.com.ru' => { name => 'Google Russia', q => 'q' }, |
255
|
|
|
|
|
|
|
'google.com.sa' => { name => 'Google Saudi Arabia', q => 'q' }, |
256
|
|
|
|
|
|
|
'google.com.sb' => { name => 'Google Solomon Islands', q => 'q' }, |
257
|
|
|
|
|
|
|
'google.com.sc' => { name => 'Google Seychelles', q => 'q' }, |
258
|
|
|
|
|
|
|
'google.com.sg' => { name => 'Google Singapore', q => 'q' }, |
259
|
|
|
|
|
|
|
'google.com.sv' => { name => 'Google El Savador', q => 'q' }, |
260
|
|
|
|
|
|
|
'google.com.tj' => { name => 'Google Tajikistan', q => 'q' }, |
261
|
|
|
|
|
|
|
'google.com.tr' => { name => 'Google Turkey', q => 'q' }, |
262
|
|
|
|
|
|
|
'google.com.tt' => { name => 'Google Trinidad and Tobago', q => 'q' }, |
263
|
|
|
|
|
|
|
'google.com.tw' => { name => 'Google Taiwan', q => 'q' }, |
264
|
|
|
|
|
|
|
'google.com.ua' => { name => 'Google Ukraine', q => 'q' }, |
265
|
|
|
|
|
|
|
'google.com.uy' => { name => 'Google Uruguay', q => 'q' }, |
266
|
|
|
|
|
|
|
'google.com.uz' => { name => 'Google Uzbekistan', q => 'q' }, |
267
|
|
|
|
|
|
|
'google.com.ve' => { name => 'Google Venezuela', q => 'q' }, |
268
|
|
|
|
|
|
|
'google.com.vi' => { name => 'Google US Virgin Islands', q => 'q' }, |
269
|
|
|
|
|
|
|
'google.com.vn' => { name => 'Google Vietnam', q => 'q' }, |
270
|
|
|
|
|
|
|
'google.com.ws' => { name => 'Google Samoa', q => 'q' }, |
271
|
|
|
|
|
|
|
'google.cz' => { name => 'Google Czech Rep', q => 'q' }, |
272
|
|
|
|
|
|
|
'google.de' => { name => 'Google Germany', q => 'q' }, |
273
|
|
|
|
|
|
|
'google.dj' => { name => 'Google Djubouti', q => 'q' }, |
274
|
|
|
|
|
|
|
'google.dk' => { name => 'Google Denmark', q => 'q' }, |
275
|
|
|
|
|
|
|
'google.dm' => { name => 'Google Dominica', q => 'q' }, |
276
|
|
|
|
|
|
|
'google.ec' => { name => 'Google Ecuador', q => 'q' }, |
277
|
|
|
|
|
|
|
'google.ee' => { name => 'Google Estonia', q => 'q' }, |
278
|
|
|
|
|
|
|
'google.es' => { name => 'Google Spain', q => 'q' }, |
279
|
|
|
|
|
|
|
'google.fi' => { name => 'Google Finland', q => 'q' }, |
280
|
|
|
|
|
|
|
'google.fm' => { name => 'Google Micronesia', q => 'q' }, |
281
|
|
|
|
|
|
|
'google.fr' => { name => 'Google France', q => 'q' }, |
282
|
|
|
|
|
|
|
'google.gd' => { name => 'Google Grenada', q => 'q' }, |
283
|
|
|
|
|
|
|
'google.ge' => { name => 'Google Georgia', q => 'q' }, |
284
|
|
|
|
|
|
|
'google.gf' => { name => 'Google French Guiana', q => 'q' }, |
285
|
|
|
|
|
|
|
'google.gg' => { name => 'Google Guernsey', q => 'q' }, |
286
|
|
|
|
|
|
|
'google.gl' => { name => 'Google Greenland', q => 'q' }, |
287
|
|
|
|
|
|
|
'google.gm' => { name => 'Google Gambia', q => 'q' }, |
288
|
|
|
|
|
|
|
'google.gp' => { name => 'Google Guadeloupe', q => 'q' }, |
289
|
|
|
|
|
|
|
'google.gr' => { name => 'Google Greece', q => 'q' }, |
290
|
|
|
|
|
|
|
'google.gy' => { name => 'Google Guyana', q => 'q' }, |
291
|
|
|
|
|
|
|
'google.hk' => { name => 'Google Hong Kong', q => 'q' }, |
292
|
|
|
|
|
|
|
'google.hn' => { name => 'Google Honduras', q => 'q' }, |
293
|
|
|
|
|
|
|
'google.hr' => { name => 'Google Croatia', q => 'q' }, |
294
|
|
|
|
|
|
|
'google.ht' => { name => 'Google Haiti', q => 'q' }, |
295
|
|
|
|
|
|
|
'google.hu' => { name => 'Google Hungary', q => 'q' }, |
296
|
|
|
|
|
|
|
'google.ie' => { name => 'Google Ireland', q => 'q' }, |
297
|
|
|
|
|
|
|
'google.im' => { name => 'Google Isle of Man', q => 'q' }, |
298
|
|
|
|
|
|
|
'google.in' => { name => 'Google India', q => 'q' }, |
299
|
|
|
|
|
|
|
'google.info' => { name => 'Google dot info', q => 'q' }, |
300
|
|
|
|
|
|
|
'google.is' => { name => 'Google Iceland', q => 'q' }, |
301
|
|
|
|
|
|
|
'google.it' => { name => 'Google Italy', q => 'q' }, |
302
|
|
|
|
|
|
|
'google.je' => { name => 'Google Jersey', q => 'q' }, |
303
|
|
|
|
|
|
|
'google.jo' => { name => 'Google Jordan', q => 'q' }, |
304
|
|
|
|
|
|
|
'google.jobs' => { name => 'Google dot jobs', q => 'q' }, |
305
|
|
|
|
|
|
|
'google.jp' => { name => 'Google Japan', q => 'q' }, |
306
|
|
|
|
|
|
|
'google.kg' => { name => 'Google Kyrgyzstan', q => 'q' }, |
307
|
|
|
|
|
|
|
'google.ki' => { name => 'Google Kiribati', q => 'q' }, |
308
|
|
|
|
|
|
|
'google.kz' => { name => 'Google Kazakhstan', q => 'q' }, |
309
|
|
|
|
|
|
|
'google.la' => { name => 'Google Laos', q => 'q' }, |
310
|
|
|
|
|
|
|
'google.li' => { name => 'Google Liechtenstein', q => 'q' }, |
311
|
|
|
|
|
|
|
'google.lk' => { name => 'Google Sri Lanka', q => 'q' }, |
312
|
|
|
|
|
|
|
'google.lt' => { name => 'Google Lithuania', q => 'q' }, |
313
|
|
|
|
|
|
|
'google.lu' => { name => 'Google Luxembourg', q => 'q' }, |
314
|
|
|
|
|
|
|
'google.lv' => { name => 'Google Latvia', q => 'q' }, |
315
|
|
|
|
|
|
|
'google.ma' => { name => 'Google Morocco', q => 'q' }, |
316
|
|
|
|
|
|
|
'google.md' => { name => 'Google Moldova', q => 'q' }, |
317
|
|
|
|
|
|
|
'google.mn' => { name => 'Google Mongolia', q => 'q' }, |
318
|
|
|
|
|
|
|
'google.mobi' => { name => 'Google dot mobi', q => 'q' }, |
319
|
|
|
|
|
|
|
'google.ms' => { name => 'Google Montserrat', q => 'q' }, |
320
|
|
|
|
|
|
|
'google.mu' => { name => 'Google Mauritius', q => 'q' }, |
321
|
|
|
|
|
|
|
'google.mv' => { name => 'Google Maldives', q => 'q' }, |
322
|
|
|
|
|
|
|
'google.mw' => { name => 'Google Malawi', q => 'q' }, |
323
|
|
|
|
|
|
|
'google.net' => { name => 'Google dot net', q => 'q' }, |
324
|
|
|
|
|
|
|
'google.nf' => { name => 'Google Norfolk Island', q => 'q' }, |
325
|
|
|
|
|
|
|
'google.nl' => { name => 'Google Netherlands', q => 'q' }, |
326
|
|
|
|
|
|
|
'google.no' => { name => 'Google Norway', q => 'q' }, |
327
|
|
|
|
|
|
|
'google.nr' => { name => 'Google Nauru', q => 'q' }, |
328
|
|
|
|
|
|
|
'google.nu' => { name => 'Google Niue', q => 'q' }, |
329
|
|
|
|
|
|
|
'google.off.ai' => { name => 'Google Anguilla', q => 'q' }, |
330
|
|
|
|
|
|
|
'google.ph' => { name => 'Google Philipines', q => 'q' }, |
331
|
|
|
|
|
|
|
'google.pk' => { name => 'Google Pakistan', q => 'q' }, |
332
|
|
|
|
|
|
|
'google.pl' => { name => 'Google Poland', q => 'q' }, |
333
|
|
|
|
|
|
|
'google.pn' => { name => 'Google Pitcairn Islands', q => 'q' }, |
334
|
|
|
|
|
|
|
'google.pr' => { name => 'Google Puerto Rico', q => 'q' }, |
335
|
|
|
|
|
|
|
'google.pt' => { name => 'Google Portugal', q => 'q' }, |
336
|
|
|
|
|
|
|
'google.ro' => { name => 'Google Romania', q => 'q' }, |
337
|
|
|
|
|
|
|
'google.ru' => { name => 'Google Russia', q => 'q' }, |
338
|
|
|
|
|
|
|
'google.rw' => { name => 'Google Rwanda', q => 'q' }, |
339
|
|
|
|
|
|
|
'google.sc' => { name => 'Google Seychelles', q => 'q' }, |
340
|
|
|
|
|
|
|
'google.se' => { name => 'Google Sweden', q => 'q' }, |
341
|
|
|
|
|
|
|
'google.sg' => { name => 'Google Singapore', q => 'q' }, |
342
|
|
|
|
|
|
|
'google.sh' => { name => 'Google Saint Helena', q => 'q' }, |
343
|
|
|
|
|
|
|
'google.si' => { name => 'Google Slovenia', q => 'q' }, |
344
|
|
|
|
|
|
|
'google.sk' => { name => 'Google Slovakia', q => 'q' }, |
345
|
|
|
|
|
|
|
'google.sm' => { name => 'Google San Marino', q => 'q' }, |
346
|
|
|
|
|
|
|
'google.sn' => { name => 'Google Senegal', q => 'q' }, |
347
|
|
|
|
|
|
|
'google.sr' => { name => 'Google Suriname', q => 'q' }, |
348
|
|
|
|
|
|
|
'google.st' => { name => 'Google Sao Tome', q => 'q' }, |
349
|
|
|
|
|
|
|
'google.tk' => { name => 'Google Tokelau', q => 'q' }, |
350
|
|
|
|
|
|
|
'google.tm' => { name => 'Google Turkmenistan', q => 'q' }, |
351
|
|
|
|
|
|
|
'google.to' => { name => 'Google Tonga', q => 'q' }, |
352
|
|
|
|
|
|
|
'google.tp' => { name => 'Google East Timor', q => 'q' }, |
353
|
|
|
|
|
|
|
'google.tt' => { name => 'Google Trinidad and Tobago', q => 'q' }, |
354
|
|
|
|
|
|
|
'google.tv' => { name => 'Google Tuvalu', q => 'q' }, |
355
|
|
|
|
|
|
|
'google.tw' => { name => 'Google Taiwan', q => 'q' }, |
356
|
|
|
|
|
|
|
'google.ug' => { name => 'Google Uganda', q => 'q' }, |
357
|
|
|
|
|
|
|
'google.us' => { name => 'Google US', q => 'q' }, |
358
|
|
|
|
|
|
|
'google.uz' => { name => 'Google Uzbekistan', q => 'q' }, |
359
|
|
|
|
|
|
|
'google.vg' => { name => 'Google British Virgin Islands', q => 'q' }, |
360
|
|
|
|
|
|
|
'google.vn' => { name => 'Google Vietnam', q => 'q' }, |
361
|
|
|
|
|
|
|
'google.vu' => { name => 'Google Vanuatu', q => 'q' }, |
362
|
|
|
|
|
|
|
'google.ws' => { name => 'Google Samoa', q => 'q' }, |
363
|
|
|
|
|
|
|
'hotbot.com' => { name => 'HotBot', q => 'query' }, |
364
|
|
|
|
|
|
|
'in.gr' => { name => 'In GR', q => 'q' }, |
365
|
|
|
|
|
|
|
'mamma.com' => { name => 'Mamma', q => 'query' }, |
366
|
|
|
|
|
|
|
'mahalo.com' => { name => 'Mahalo', q => 'search' }, |
367
|
|
|
|
|
|
|
'megasearching.net' => { name => 'Megasearching', q => 's' }, |
368
|
|
|
|
|
|
|
'mirago.co.uk' => { name => 'Mirago UK', q => 'qry' }, |
369
|
|
|
|
|
|
|
'netscape.com' => { name => 'Netscape', q => 's' }, |
370
|
|
|
|
|
|
|
'community.paglo.com' => { name => 'Paglo', q => 'q' }, |
371
|
|
|
|
|
|
|
'pathfinder.gr' => { name => 'Pathfinder GR', q => 'q' }, |
372
|
|
|
|
|
|
|
'phantis.com' => { name => 'Phantis GR' , q => 'q'}, |
373
|
|
|
|
|
|
|
'robby.gr' => { name => 'Robby GR' , q => 'searchstr' }, |
374
|
|
|
|
|
|
|
'sproose.com' => { name => 'Sproose', q => 'query' }, |
375
|
|
|
|
|
|
|
'technorati.com' => { name => 'Technorati', q => 'q' }, |
376
|
|
|
|
|
|
|
'tesco.net' => { name => 'Tesco Search', q => 'q' }, |
377
|
|
|
|
|
|
|
'tiscali.co.uk' => { name => 'Tiscali UK', q => 'query' }, |
378
|
|
|
|
|
|
|
'bing.com' => { name => 'Bing', q => 'q' }, |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
'acbusca.com' => { name => 'ACBusca', q => 'query' }, |
381
|
|
|
|
|
|
|
'atalhocerto.com.br' => { name => 'Atalho Certo', q => 'keyword' }, |
382
|
|
|
|
|
|
|
'bastaclicar.com.br' => { name => 'Basta Clicar', q => 'search' }, |
383
|
|
|
|
|
|
|
'bemrapido.com.br' => { name => 'Bem Rapido', q => 'chave' }, |
384
|
|
|
|
|
|
|
'br.altavista.com' => { name => 'AltaVista Brasil', q => 'q' }, |
385
|
|
|
|
|
|
|
'br.search.yahoo.com' => { name => 'Yahoo Brazil', q => 'p' }, |
386
|
|
|
|
|
|
|
'busca.uol.com.br' => { name => 'Radar UOL', q => 'q' }, |
387
|
|
|
|
|
|
|
'buscaaqui.com.br' => { name => 'Busca Aqui', q => 'q' }, |
388
|
|
|
|
|
|
|
'buscador.terra.com.br' => { name => 'Terra Busca', q => 'query' }, |
389
|
|
|
|
|
|
|
'cade.search.yahoo.com' => { name => 'Cadê', q => 'p' }, |
390
|
|
|
|
|
|
|
'clickgratis.com.br' => { name => 'Click Gratis', q => 'query' }, |
391
|
|
|
|
|
|
|
'entrada.com.br' => { name => 'Entrada', q => 'q' }, |
392
|
|
|
|
|
|
|
'gigabusca.com.br' => { name => 'Giga Busca', q => 'what' }, |
393
|
|
|
|
|
|
|
'internetica.com.br' => { name => 'Internetica', q => 'busca' }, |
394
|
|
|
|
|
|
|
'katatudo.com.br' => { name => 'KataTudo', q => 'q' }, |
395
|
|
|
|
|
|
|
'minasplanet.com.br' => { name => 'Minas Planet', q => 'term' }, |
396
|
|
|
|
|
|
|
'speedybusca.com.br' => { name => 'SpeedyBusca', q => 'q' }, |
397
|
|
|
|
|
|
|
'vaibuscar.com.br' => { name => 'Vai Busca', q => 'q' }, |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
'search.conduit.com' => { name => 'Conduit', q=>'q' }, |
400
|
|
|
|
|
|
|
'in.search.yahoo.com' => { name => 'Yahoo India', q => 'p' }, |
401
|
|
|
|
|
|
|
'rediff.com' => { name => 'Rediff', q => 'MT' }, |
402
|
|
|
|
|
|
|
'guruji.com' => { name => 'Guruji', q => 'q' }, |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
'isohunt.com' => { name => 'Isohunt', q => 'ihq' }, |
405
|
|
|
|
|
|
|
'btjunkie.org' => { name => 'BT Junkie', q => 'q' }, |
406
|
|
|
|
|
|
|
'torrentz.eu' => { name => 'Torrentz', q => 'f' } |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
}; |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
## Constructor.
##
## Takes the class name (supplied by the method call) and returns a hash-based
## object blessed into that class. The object's 'engines' slot holds the
## file-level $RH_LOOKUPS reference itself; the table is not copied, so all
## instances refer to the same lookup hash.
sub new {
    my ($class) = @_;

    my %object = ( engines => $RH_LOOKUPS );

    return bless \%object, $class;
}
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
=head2 parse_search_string |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
This module provides a simple function to parse and extract search engine query strings. It was designed and tested with |
420
|
|
|
|
|
|
|
Apache referrer logs in mind. It can be used for a wide number of purposes, including tracking down what keywords people use |
421
|
|
|
|
|
|
|
on popular search engines before they land on a site. Although a number of existing modules and scripts exist for this purpose, |
422
|
|
|
|
|
|
|
the majority of them are outdated, relying on obsolete search strings for each engine. |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
The default function exported is "parse_search_string" which accepts an unquoted referrer string as input and returns the |
425
|
|
|
|
|
|
|
search engine query contained within. It currently works with both escaped and un-escaped queries and will translate the search |
426
|
|
|
|
|
|
|
terms before returning them in the latter case. The function returns undef in all other cases and errors. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
for example: |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search'; |
431
|
|
|
|
|
|
|
my $terms = |
432
|
|
|
|
|
|
|
$uparse->parse_search_string( $ref ); |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
would return I<'a simple test'> |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
whereas |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
my $ref = 'http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+more%21+complex_+search%24&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0'; |
439
|
|
|
|
|
|
|
my $terms = |
440
|
|
|
|
|
|
|
$uparse->parse_search_string( $ref ); |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
would return I<'a more! complex_ search$'> |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=cut |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 se_term |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Same as parse_search_string(). |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=cut |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
## Alias for parse_search_string().
##
## Arguments: the referrer string to analyse.
## Returns:   whatever parse_search_string() returns for that string, or
##            nothing (bare return) when no string was supplied.
sub se_term {
    my ( $self, $string ) = @_;

    ## nothing to look at: hand back undef / the empty list
    defined $string or return;

    return $self->parse_search_string($string);
}
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
## internal method for creating a URI object |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
## Internal method: turns a referrer string into a URI object and works out
## which entry of the engine table its host corresponds to.
##
## Argument: the referrer string.
## Returns the list ( $uri, $host ). $uri is the URI::http / URI::https
## object built from the string. $host is the URI's host with leading labels
## removed, one at a time, until it is a defined key of $self->{'engines'};
## if no key matches, $host is whatever is left once no dot remains.
## Returns nothing when the string is undefined, when it is not an http or
## https URL, or when the URL has no host.
sub _uri {
    my $self   = shift;
    my $string = shift;

    return unless defined($string);

    ## create a new URI object
    ## and return unless it is http or https
    my $uri = URI->new($string);
    return
        unless ( defined($uri)
        && ( ref($uri) eq 'URI::http' || ref($uri) eq 'URI::https' ) );

    my $host = $uri->host;
    return unless defined($host) && $host;

    ## feedster and technorati do not follow the usual search patterns,
    ## thus we extract the query terms by taking the last element from the
    ## path segments and storing it as the "q" query parameter.
    ## The pattern is anchored on a label boundary so that look-alike hosts
    ## (e.g. "nottechnorati.com") are not rewritten.
    if ( $host =~ m/(?:\A|\.)(?:feedster|technorati)\.com\z/ ) {
        $uri->query_form( q => ( $uri->path_segments )[-1] );
    }

    ## clean up the host until it matches
    ## something we already know about
    while ( !defined $self->{'engines'}{$host} ) {
        my $c = index( $host, '.' );
        last if $c < 0;    # no label left to strip
        $host = substr( $host, $c + 1 );
    }

    return ( $uri, $host );
}
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
## Extracts the search terms from a search engine referrer URL.
##
## Argument: the referrer string.
## Returns the value of the engine's query parameter (the 'q' entry of the
## matching $self->{'engines'} record) as found in the URL's query string.
## Returns nothing when the string is undefined, when _uri() rejects it, or
## when the host has no engine record / no 'q' entry. The result is undef
## when the engine is known but the URL lacks that query parameter.
sub parse_search_string {
    my $self   = shift;
    my $string = shift;
    return unless defined($string);

    ## _uri() already reduces the host to the engine-table key (dropping
    ## "www." and any other leading labels), so no further clean-up is needed.
    my ( $uri, $host ) = $self->_uri($string);
    return unless defined($uri);

    ## look the engine up first so that an unknown host does not
    ## autovivify an empty record in the engine table
    my $engine = $self->{'engines'}{$host};
    return unless defined $engine;

    ## find the query parameter the engine uses
    my $q = $engine->{'q'};
    return unless defined $q;

    ## return the string passed to the query parameter
    my %h_query = $uri->query_form;

    return $h_query{$q};
}
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=head2 findEngine |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
Returns a list with the hostname of the search engine as the first element and |
525
|
|
|
|
|
|
|
the canonical name as the second element. |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search'; |
528
|
|
|
|
|
|
|
my ($hostname, $canonical) = $uparse->findEngine( $ref ) ; |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
This will return 'google.com' as the search engine hostname and 'Google' as the name. |
531
|
|
|
|
|
|
|
This function will return I<undef> on error. |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=cut |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
## Identifies the search engine a referrer URL belongs to.
##
## Argument: the referrer string.
## Returns the list ( $hostname, $canonical ): the host as normalised by
## _uri() and the 'name' entry of its $self->{'engines'} record ($canonical
## is undef when there is no such record or it has no name).
## Returns nothing when the string is undefined or _uri() yields no usable
## URI object or host.
sub findEngine {
    my $self   = shift;
    my $string = shift;

    return unless defined($string);

    ## create a URI object
    my ( $uri, $hostname ) = $self->_uri($string);
    return unless defined($uri)      && $uri;
    return unless defined($hostname) && $hostname;

    ## fetch the record before dereferencing it, so that an unknown host
    ## does not autovivify an empty entry in the engine table
    my $engine    = $self->{'engines'}{$hostname};
    my $canonical = defined $engine ? $engine->{'name'} : undef;

    return ( $hostname, $canonical );
}
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=head2 se_host |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
Wrapper around findEngine - returns just the hostname. |
555
|
|
|
|
|
|
|
This function will return I<undef> on error. |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=cut |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
## Convenience wrapper around findEngine(): keeps only the first element of
## its result (the engine hostname) and returns it.
## Returns nothing when no referrer string is supplied.
sub se_host {
    my ( $self, $referrer ) = @_;

    return if !defined $referrer;

    my ($engine_host) = $self->findEngine($referrer);

    return $engine_host;
}
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=head2 se_name |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
Wrapper around findEngine - returns just the canonical name. |
570
|
|
|
|
|
|
|
This function will return I<undef> on error. |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=cut |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
## Convenience wrapper around findEngine(): keeps only the second element of
## its result (the engine's canonical name) and returns it.
## Returns nothing when no referrer string is supplied.
sub se_name {
    my ( $self, $referrer ) = @_;

    return if !defined $referrer;

    my ( undef, $engine_name ) = $self->findEngine($referrer);

    return $engine_name;
}
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=head1 SUPPORTED ENGINES |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
Currently supported search engines include: Sproose, Google Namibia, Google Ivory Coast, Google Oman, Technorati, Google Ecuador, |
585
|
|
|
|
|
|
|
Google Norfolk Island, Mahalo, Google UK, Yahoo! UK, Google Micronesia, Google Bahrain, Basta Clicar, |
586
|
|
|
|
|
|
|
Giga Busca, Google Greece, Google Belgium, Google Egypt, Google Chile, Godado (IT), Google Australia, |
587
|
|
|
|
|
|
|
Google Uruguay, Google India, Google Taiwan, Google Ukraine, Google US, Terra ES, |
588
|
|
|
|
|
|
|
Tesco Search, Megasearching, SAPO videos, Google Nepal, Google Israel, Google US Virgin Islands, Google Hungary, |
589
|
|
|
|
|
|
|
Google San Marino, Google Croatia, Google dot jobs, Google Panama, Google Malaysia, Internetica, Google Brunei Darussalam, |
590
|
|
|
|
|
|
|
Google Denmark, Google Pakistan, Google Solomon Islands, Google dot biz, Google Lesotho, IceRocket, Google Greenland, Fireball DE, |
591
|
|
|
|
|
|
|
Rtp, Google Portugal, Google Samoa, Google Kazakhstan, Google Blogsearch, Google Thailand, Google, Google Antiqua and Barbuda, |
592
|
|
|
|
|
|
|
Google Germany, Google Moldova, Google Zambia, Google Greece, Google Sri Lanka, Google Ireland, Google Austria, |
593
|
|
|
|
|
|
|
Google Peru, Google Guatemala, ICQ dot com, AOL UK, Google Guyana, In GR, Google dot info, MyWay, Pathfinder GR, Google Costa Rica, |
594
|
|
|
|
|
|
|
KataTudo, Google Jamaica, Google Vietnam, Google Morocco, Google Gambia, Google Singapore, Google Mauritius, Altavista, Google Afghanistan, |
595
|
|
|
|
|
|
|
Google Cote dIvoire, Google Kazakhstan, Google Czech Rep, Phantis GR, Google Bahamas, Google United Arab Emirates, Google East Timor, Ozu ES, |
596
|
|
|
|
|
|
|
Google Venezuela, Google Puerto Rico, Google Armenia, Google Croatia, Google Botswana, Google Tuvalu, Ask UK, Google Singapore, Mirago UK, |
597
|
|
|
|
|
|
|
Google Greenland, MSN Arabia, Google Nauru, Publico, Robby GR, Minas Planet, Pesquisa Iol, Google Romania, Google South Korea, Google Jersey, |
598
|
|
|
|
|
|
|
Netscape, Busca Aqui, Google Bulgaria, Google Uzbekistan, Tiscali UK, Ithaki, Cadê, Lycos IT, Google Suriname, Excite IT, Google Hong Kong, |
599
|
|
|
|
|
|
|
Kataweb IT, Google Burundi, Click Gratis, Google Vietnam, MSN, Alice.it, Google Honduras, Google Trinidad and Tobago, Google Uganda, XL, |
600
|
|
|
|
|
|
|
Jornal Noticias, Google Cook Islands, Google Japan, Google Ecuador, Google Ghana, Google Guadeloupe, Google Libya, Google Kenya, Fastbrowsersearch, |
601
|
|
|
|
|
|
|
Aeiou, Google Niue, Jornal Record, HotBot, Google Honduras, Google Georgia, Google Fiji, Google Philipines, BBC Search, Google, Google Laos, |
602
|
|
|
|
|
|
|
Soso, AltaVista Brasil, Lycos UK, SAPO fotos, Ask dot com, Google Netherlands, Google Philipines, Google Trinidad and Tobago, Google Turkey, |
603
|
|
|
|
|
|
|
AllTheWeb, Google Japan, Google Argentina, Google Vanuatu, Blueyonder, Google Greenland, Google Samoa, Google Georgia, Google Slovakia, |
604
|
|
|
|
|
|
|
Google Sri Lanka, Pesquisa SAPO, Google Latvia, Google Latvia, Correio Manha, Terra Busca, Google El Savador, Google Cambodia, |
605
|
|
|
|
|
|
|
Google Mauritius, Google China, AOL Search, Google Tokelau, Google Tonga, Correio da Manha, Radar UOL, Google Jordan, Godado, Google Jordan, |
606
|
|
|
|
|
|
|
Google Pitcairn Islands, Categorico IT, Google Morocco, Google Dominican Rep, Google France, Abacho, Google Azerbaijan, Google Andorra, Google Belize, |
607
|
|
|
|
|
|
|
Google Paraguay, Simpatico IT, Google Ethiopia, Google Uganda, Google Poland, Google Bolivia, Google Hungary, Google Russia, Diario Noticias, |
608
|
|
|
|
|
|
|
Google Puerto Rico, Google Montserrat, Yahoo! Japan, Google Seychelles, Mamma, Google Pitcairn Islands, Google South Africa, Paglo, Google Malta, |
609
|
|
|
|
|
|
|
Google Azerbaijan, Google New Zeland, Google China, Google Norway, Google Bosnia and Herzegovina, Google Indonesia, SpeedyBusca, Entrada, Google Anguilla, |
610
|
|
|
|
|
|
|
Google Rep of Congo, Google Dominica, Google Finland, Altavista UK, Google Guyana, MSN UK, Yahoo Answers, Google British Virgin Islands, Google Guadeloupe, |
611
|
|
|
|
|
|
|
Google Lithuania, Google Antiqua and Barbuda, Google Bahamas, Google Malawi, MSN Prodigy, Bing, Google Bolivia, Google Djubouti, Google Uzbekistan, Fastweb IT, |
612
|
|
|
|
|
|
|
Google Tajikistan, Virgin Search, Google Nigeria, Yahoo Japan, Pesquisa Clix, Google Grenada, Google Haiti, Google American Samoa, Google Pakistan, |
613
|
|
|
|
|
|
|
Google Cocos Islands, Google Hong Kong, NTLWorld, ilMotore, Google Belize, Google Guernsey, Google Sweden, Google Anguilla, Google Bangladesh, Google Isle of Man, |
614
|
|
|
|
|
|
|
Google Guernsey, Google Kyrgyzstan, Google Dem Rep of Congo, Google Malawi, Orange Search, Google Seychelles, Google Guyana, Google Gibraltar, |
615
|
|
|
|
|
|
|
Google Italy, Google Kiribati, TheSpider IT, Google Nicaragua, Google Russia, Google Venezuela, Google Poland, Google Brazil, Google Senegal, Conduit, Lycos, |
616
|
|
|
|
|
|
|
Google Isle of Man, Live.com, Google Italy, Libero IT, Google Canada, Google Nauru, Google Liechtenstein, Google Afghanistan, Cuil, Google Zimbabwe, Google Mauritius, |
617
|
|
|
|
|
|
|
Orange ES, Google Burundi, Google Portugal, ACBusca, Bem Rapido, Atalho Certo, Excite, Clusty, Yahoo Brazil, My Web Search, Google Spain, Google Uzbekistan, Google, |
618
|
|
|
|
|
|
|
Google Mexico, T-Online, Google dot mobi, Google Luxembourg, Google Austria, Yahoo!, Google Kiribati, Sweetim, Vai Busca, Google Mongolia, Google Saudi Arabia, Google dot net, |
619
|
|
|
|
|
|
|
Google Maldives, Google Trinidad and Tobago, Google Jersey, Feedster, Google Turkmenistan, Google Switzerland, Google Norfolk Island, Suche DE, Google Malawi, Google Rwanda, |
620
|
|
|
|
|
|
|
Lycos ES, Google Burundi, Google French Guiana, Google Kyrgyzstan, Google Saint Helena, VirginMedia, Google Iceland, SAPO sabores, Google India, Google Cuba, |
621
|
|
|
|
|
|
|
Google US Virgin Islands, Google Taiwan, Google Sao Tome, Google Slovenia, Starware, Google Estonia, Conduit, Yahoo India, Rediff, Guruji |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=head1 AUTHOR |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
Spiros Denaxas, C<< <s.denaxas at gmail.com> >> |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=head1 SOURCE CODE |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
The source code can be found on github L<https://github.com/spiros/URI-ParseSearchString> |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=head1 BUGS |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
This is my first CPAN module so I encourage you to send all comments, especially bad ones, |
634
|
|
|
|
|
|
|
to my email address. |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
This could not have been possible without the support of my co-workers at |
637
|
|
|
|
|
|
|
http://nestoria.co.uk - the easiest way of finding UK property. |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=head1 SUPPORT |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
For more information, you could also visit my blog: |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
http://blog.ffffruit.com |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
=over 4 |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
=back |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
Copyright 2011 Spiros Denaxas, all rights reserved. |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
654
|
|
|
|
|
|
|
under the same terms as Perl itself. |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
=cut |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
1; # End of URI::ParseSearchString |