line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# $Id$ |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# www::google Brik |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
package Metabrik::Www::Google; |
7
|
1
|
|
|
1
|
|
1071
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
8
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
4
|
use base qw(Metabrik::Client::Www); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
519
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
#
# Describe this Brik: metadata, attributes (with their defaults), the
# single exposed command and the modules it requires.
#
# Returns a hashref. The values are static; nothing is read from $self.
#
sub brik_properties {
    # Attributes a user may set, each with its usage hint.
    my %attributes = (
        language => [ 'fr|uk|de|ch' ],
        page     => [ 'number' ],
        filter   => [ '0|1' ],
    );

    # Defaults applied to the attributes.
    my %attributes_default = (
        language      => 'fr',
        page          => 1,
        do_javascript => 1,
        filter        => 0,
    );

    # Commands exposed by this Brik, each with its argument list.
    my %commands = (
        search => [ 'keywords' ],
    );

    # Modules that must be available for this Brik to work.
    my %require_modules = map { $_ => [ ] } (
        'WWW::Mechanize::PhantomJS',
        'Metabrik::String::Html',
        'Metabrik::String::Uri',
    );

    return {
        revision => '$Revision$',
        tags     => [ 'unstable' ],
        # NOTE(review): the trailing space suggests an e-mail address was
        # stripped from this string at some point -- confirm upstream.
        author   => 'GomoR ',
        license  => 'http://opensource.org/licenses/BSD-3-Clause',
        attributes         => \%attributes,
        attributes_default => \%attributes_default,
        commands           => \%commands,
        require_modules    => \%require_modules,
    };
}
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Search last 24 hours: &tbs=qdr:d |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# my $url = 'http://www.google.fr/#q=gomor' |
43
|
|
|
|
|
|
|
# set client::www do_javascript 1 |
44
|
|
|
|
|
|
|
# run client::www get $url |
45
|
|
|
|
|
|
|
# my $content = $RUN->{content} |
46
|
|
|
|
|
|
|
# run client::www parse $content |
47
|
|
|
|
|
|
|
# my $body = $RUN->content |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
#
# Run a Google search for the given keywords and return the result links.
#
# Arguments:
#   @args - one or more keywords; they are joined with a single space and
#           URI-encoded through Metabrik::String::Uri before being sent.
#
# Attributes read from $self: language (fr|uk|de|ch), page (1-based; values
# below 1 are treated as page 1) and filter (appended verbatim to the URL).
#
# Returns an arrayref of hashrefs { url, title [, cache_url] } when the
# request answers with code 200. A link whose title starts with the
# language's "cached" label is not a result of its own: its URL is stored
# as cache_url on the result that precedes it.
#
# On failure, returns either nothing (helper Brik creation, encoding, get or
# parse failed) or whatever $self->log->error returns (unsupported language,
# non-200 answer, parsed document without a second top-level node).
#
sub search {
    my $self = shift;
    my @args = @_;

    if (@args <= 0) {
        $self->brik_help_run_undef_arg('search', undef) or return;
    }

    my $language = $self->language;
    my $page = $self->page;
    my $filter = $self->filter;

    my $keywords = join(' ', @args);

    my $si = Metabrik::String::Uri->new_from_brik_init($self) or return;
    $keywords = $si->encode($keywords) or return;

    # Label of the "cached" link per supported language. This table is also
    # the single source of truth for which languages are supported.
    my %cache_label_for = (
        fr => 'en cache',
        de => 'im cache',
        ch => 'im cache',
        uk => 'cached',
    );

    if (! exists $cache_label_for{$language}) {
        return $self->log->error("search: unsupported language [$language]");
    }

    # Quote the label so it is always matched literally.
    my $cache_re = qr/^\Q$cache_label_for{$language}\E/i;

    # Google UK is google.co.uk
    my $domain = $language eq 'uk' ? 'co.uk' : $language;
    my $url = 'http://www.google.'.$domain.'/#q=';

    # Result offset: 10 results per page, never negative.
    my $start = $page - 1;
    $start = 0 if $start < 0;
    $start *= 10;

    my $search = $url.$keywords.'&start='.$start.'&filter='.$filter;

    $self->log->verbose("search: [$search]");

    my $get = $self->get($search) or return;
    if ($get->{code} != 200) {
        return $self->log->error("search: unhandled error");
    }

    my $tree = $self->parse($get->{content}) or return;
    my $body = $tree->content;

    # The links are looked up under the second top-level node (presumably
    # <body> -- confirm against parse()). Without it there is nothing to
    # traverse, and calling _traverse on undef would die.
    if (ref($body) ne 'ARRAY' || ! defined $body->[1]) {
        return $self->log->error("search: unable to find body in parsed content");
    }

    my $links = $self->_traverse($body->[1]);

    # We merge cache stuff within results
    my @merged = ();
    for my $link (@$links) {
        $self->log->debug("url: [".$link->{url}."]");
        $self->log->debug("title: [".$link->{title}."]");

        if ($link->{title} =~ $cache_re) {
            $self->log->debug("cache: [".$link->{url}."]");
            if (@merged) {
                $merged[-1]->{cache_url} = $link->{url};
            }
            else {
                # $merged[-1] on an empty array is a fatal error, so a cache
                # link that shows up before any result is dropped instead.
                $self->log->debug("search: cache link without preceding result, skipping");
            }
            next;
        }

        push @merged, {
            url => $link->{url},
            title => $link->{title},
        };
    }

    return \@merged;
}
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
#
# Walk the children of $node depth-first and collect every link that
# _href_to_hash turns into a non-empty hash.
#
# Only children whose ref() is exactly 'HTML::Element' are inspected; any
# other child is skipped. An 'a' element is handed to _href_to_hash and is
# not descended into; any other element is traversed recursively.
#
# Returns an arrayref of the collected hashes, in document order.
#
sub _traverse {
    my ($self, $node) = @_;

    my @found;

    for my $child ($node->content_list) {
        next if ref($child) ne 'HTML::Element';

        # Anything but a link: collect whatever lives below it.
        if ($child->tag ne 'a') {
            my $below = $self->_traverse($child);
            push @found, @$below;
            next;
        }

        # A link: keep it only when it converted to a non-empty hash.
        my $link = $self->_href_to_hash($child);
        push @found, $link if $link && keys %$link > 0;
    }

    return \@found;
}
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
#
# Convert a Google result anchor into a { url, title } hash.
#
# Arguments:
#   $element - the anchor element; it must provide attr(), content() and
#              content_list().
#
# Only anchors whose href starts with '/url?q=' are considered, e.g.:
#   /url?q=http://www.example.org/page.html&sa=U&ved=...&usg=...
# The '/url?q=' prefix and everything from '&sa=' on are removed, and the
# remainder is decoded with Metabrik::String::Uri. The title is built from
# the anchor's text children and from the joined content of its direct
# element children, then decoded with Metabrik::String::Html.
#
# Returns:
#   - a hashref { url => ..., title => ... } on success;
#   - an empty hashref when the anchor has no children, or when one of its
#     element children has no content (the link is considered incomplete);
#   - nothing when the anchor has no href, the href is not a '/url?q='
#     link, or a helper Brik cannot be created.
#
sub _href_to_hash {
    my $self = shift;
    my ($element) = @_;

    # Anchors without an href attribute yield undef here; testing definedness
    # first avoids an "uninitialized value" warning in the match below.
    my $href = $element->attr('href');
    return unless defined $href && $href =~ m{^/url\?q=};

    my $url = $href;
    $url =~ s{^/url\?q=}{};
    $url =~ s{&sa=.+?$}{};

    # content_list() returns an empty list for a childless element, whereas
    # dereferencing content() would die on undef under strict refs.
    my @children = $element->content_list;
    return {} unless @children;

    my $title = '';
    for my $child (@children) {
        if (ref($child) eq 'HTML::Element') {
            my $content = $child->content;
            return {} unless defined $content;
            $title .= join(' ', @$content);
        }
        else {
            $title .= $child;
        }
    }

    my $sh = Metabrik::String::Html->new_from_brik_init($self) or return;
    my $si = Metabrik::String::Uri->new_from_brik_init($self) or return;

    $title = $sh->decode($title);
    $url = $si->decode($url);

    return {
        url => $url,
        title => $title,
    };
}
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
1; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
__END__ |