| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#WWW/monitor.pm. Written in 2007 by Yaron Kahanovitch. This |
|
2
|
|
|
|
|
|
|
# source code has been placed in the public domain by the author. |
|
3
|
|
|
|
|
|
|
# Please be kind and preserve the documentation. |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package WWW::Monitor::Task; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#use 5.008; |
|
10
|
1
|
|
|
1
|
|
20573
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
28
|
|
|
11
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
30
|
|
|
12
|
1
|
|
|
1
|
|
958
|
use HTTP::Response; |
|
|
1
|
|
|
|
|
31425
|
|
|
|
1
|
|
|
|
|
32
|
|
|
13
|
1
|
|
|
1
|
|
956
|
use HTTP::Request; |
|
|
1
|
|
|
|
|
689
|
|
|
|
1
|
|
|
|
|
22
|
|
|
14
|
1
|
|
|
1
|
|
6
|
use HTTP::Headers; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
20
|
|
|
15
|
1
|
|
|
1
|
|
4
|
use HTTP::Status; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
314
|
|
|
16
|
1
|
|
|
1
|
|
1268
|
use HTML::TreeBuilder; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
13
|
|
|
17
|
|
|
|
|
|
|
#use Carp; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our(@ISA, @EXPORT, @EXPORT_OK, $VERSION); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
$VERSION = 0.24; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
@ISA = qw( |
|
26
|
|
|
|
|
|
|
Exporter |
|
27
|
|
|
|
|
|
|
); |
|
28
|
|
|
|
|
|
|
@EXPORT = qw (); |
|
29
|
|
|
|
|
|
|
@EXPORT_OK = qw (); |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
our $HASH_SEPARATOR = "\n"; |
|
32
|
|
|
|
|
|
|
our $HASH_KEY_PREFIX = "__HASH_KEY__"; |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 NAME |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
WWW::Monitor::Task - A Task class for monitoring single web page |
|
37
|
|
|
|
|
|
|
against a cached version. |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 VERSION |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Version 0.24 |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 Description |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
This class is responsible for tracking a single web page and reporting |
|
48
|
|
|
|
|
|
|
changes. This class should be considered as a private asset of |
|
49
|
|
|
|
|
|
|
L. For details please refer to |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head1 EXPORT |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head2 new |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
A constructor. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub new {
    # Constructor. Accepts a list of OPTION => VALUE pairs:
    #   URL   - address of the page to monitor; a warning is issued when absent.
    #   CACHE - cache object stored in the new task.
    # May be invoked on a class name or on an existing object (the object's
    # class is reused). Returns the blessed task object.
    my ($this, @options) = @_;

    # The file-level "use Carp" is commented out, so Carp is loaded here and
    # called fully qualified; a bare carp() would die as an undefined sub.
    require Carp;

    my %arg;
    if (@options % 2) {
        # An odd number of values cannot form pairs: warn and ignore them all.
        Carp::carp("Parameters for WWW::Monitor::Task should be given as pair of 'OPTION'=>'VAL'");
    }
    else {
        %arg = @options;
    }
    Carp::carp("Url is not given") unless exists $arg{URL};

    my $class = ref($this) || $this;
    my $self  = {
        url   => $arg{URL},
        cache => $arg{CACHE},
    };
    return bless $self, $class;
}
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 run ( mechanize, carrier, ) |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Executes Task. Parameters: |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
mechanize - Web mechanize object. |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
L assumes that the given object implements or |
|
84
|
|
|
|
|
|
|
inherits the WWW::Mechanize abstraction. See |
|
85
|
|
|
|
|
|
|
L. |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
carrier- Object which will conduct the notification; see L for details |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
cache - optional - A cache class. |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub run {
    # Executes the task: retrieves the monitored url, synchronises the result
    # with the cache and, for urls already known to the cache, hands over to
    # be_notified.
    #   $mechanize - object passed on to get_url_data for retrieving pages.
    #   $carrier   - object passed on to be_notified.
    #   $cache     - optional; when true it replaces the cache kept in the task.
    # Returns 0 when get_url_data reports a failure, 1 otherwise.
    my ($self, $mechanize, $carrier, @extra) = @_;

    $self->{error} = "";
    my $new_cache = @extra ? $extra[0] : "";
    my $target    = $self->{url};
    $self->{cache} = $new_cache if $new_cache;

    # Retrieved pages are collected in %fetched, keyed by url.
    my %fetched;
    return 0 unless $self->get_url_data($mechanize, $target, \%fetched);

    # Split the retrieved pages into comparable / missing / added sets.
    my ($compare_keys, $old_pages, $new_pages, $missing, $added, $known)
        = $self->sync_cache($target, \%fetched);

    # A url seen for the first time is only cached, never reported.
    if ($known) {
        $self->be_notified($carrier, $target, $missing, $added,
                           $old_pages, $new_pages, $compare_keys);
    }
    return 1;
}
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head2 be_notified |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
(Private method) |
|
120
|
|
|
|
|
|
|
Tests if a page has changed. If yes, notification call back is being called. |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=cut |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
sub be_notified {
    # (Private method) Decides whether anything changed and, if so, calls the
    # carrier's notify method.
    #   $carrier              - object whose notify($url, $task) is invoked.
    #   $url                  - the monitored url.
    #   $missing_pages        - hash ref of pages found only in the cache.
    #   $added_pages          - hash ref of pages found only in the new data.
    #   $old_pages_to_compare,
    #   $new_pages_to_compare,
    #   $url_keys_for_comapre - parallel array refs: cached page, new page and
    #                           cache key for every page present in both sets.
    # Returns 1 when nothing changed, otherwise whatever notify returns.
    my ($self, $carrier, $url, $missing_pages, $added_pages,
        $old_pages_to_compare, $new_pages_to_compare, $url_keys_for_comapre) = @_;
    my $cache = $self->{cache};

    # Expose the missing / added sets through the accessor methods.
    $self->{missing_parts} = $missing_pages;
    $self->{added_parts}   = $added_pages;

    # Compare the textual rendering of every page that exists on both sides.
    $self->{changed} = {};
    my $changed_count = 0;
    for my $index (0 .. $#{$old_pages_to_compare}) {
        my $old_page = $old_pages_to_compare->[$index];
        my $new_page = $new_pages_to_compare->[$index];
        my $old_text = $self->format_html($old_page);
        my $new_text = $self->format_html($new_page);
        next if $$old_text eq $$new_text;

        # Remember the old/new pair and refresh the cached copy.
        my $key = $url_keys_for_comapre->[$index];
        $self->{changed}{$key} = [ $old_page, $new_page ];
        $cache->set($key, $new_page->as_string);
        ++$changed_count;
    }

    # Nothing changed, disappeared or appeared: no notification.
    return 1
        unless $changed_count
            or scalar(keys %$missing_pages)
            or scalar(keys %$added_pages);

    # This file never loads HTTP::Date itself, so make sure it is available.
    require HTTP::Date;
    $self->{time1} = HTTP::Date::time2str($self->validity($url));
    $self->{time2} = HTTP::Date::time2str(time());
    $self->store_validity($url, time());
    return $carrier->notify($url, $self);
}
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 is_html |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
(Private method) |
|
169
|
|
|
|
|
|
|
Return true if page is html |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub is_html {
    # (Private method) Tells whether a response carries an HTML document.
    #   $response - object offering header(); its Content-Type header is read.
    # Returns true when Content-Type starts with "text/html" (compared
    # case-insensitively), false otherwise -- including when the header is
    # absent, which previously reached the regex as undef.
    my ($self, $response) = @_;
    my $content_type = $response->header('Content-Type');
    return unless defined $content_type;
    return $content_type =~ m%^text/html%i;
}
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 missing_parts |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Return a hash reference which includes the parts that exist only in the old cached version. Every entry in the returned hash is a reference to an HTTP::Response object. |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=cut |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
sub missing_parts {
    # Accessor: the value stored under the object's missing_parts key.
    my ($self) = @_;
    return $self->{missing_parts};
}
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 added_parts |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Return a hash reference which includes the parts that exist only in the new version. Every entry in the returned hash is a reference to an HTTP::Response object. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub added_parts {
    # Accessor: the value stored under the object's added_parts key.
    my ($self) = @_;
    return $self->{added_parts};
}
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head2 old_version_time_stamp |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Return the time when the url was previously cached. The time is returned as a date string formatted by HTTP::Date::time2str. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=cut |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub old_version_time_stamp {
    # Accessor: the value stored under the object's time1 key.
    my ($self) = @_;
    return $self->{time1};
}
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 new_version_time_stamp |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
Return the time when the url was queried. The time is returned as a date string formatted by HTTP::Date::time2str. |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub new_version_time_stamp {
    # Accessor: the value stored under the object's time2 key.
    my ($self) = @_;
    return $self->{time2};
}
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head2 changed_parts |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Return a list that consists of the url keys of all changed parts. |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=cut |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
sub changed_parts {
    # Returns the keys of the hash stored under the object's changed key.
    my ($self) = @_;
    my $changed = \%{ $self->{changed} };
    return keys %$changed;
}
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 get_old_new_pair [ urls key ] |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Return a list of two elements. The first one is the old cached version and the second one is the new version. |
|
237
|
|
|
|
|
|
|
The given url key must be one of the keys returned by changed_parts method. |
|
238
|
|
|
|
|
|
|
Each element of the pair is a reference to an HTTP::Response object. |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=cut |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
sub get_old_new_pair {
    # Returns the list stored for $url_key in the object's changed hash
    # (be_notified stores an [old, new] pair there), or 0 when the key is
    # not present.
    my ($self, $url_key) = @_;
    return 0 unless exists $self->{changed}{$url_key};
    my $pair = $self->{changed}{$url_key};
    return @$pair;
}
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 format_html [ leftmargin, rightmargin] |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Return a textual version of HTML |
|
257
|
|
|
|
|
|
|
left and right margins set the margin for the returned data. |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=cut |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub format_html {
    # Returns a reference to a string holding the text to compare for a page.
    #   $response_ref - either an object offering content() that is_html can
    #                   inspect, or a plain reference to a raw HTML string
    #                   (the form get_cache_hash produces for cache entries
    #                   it could not parse; kept for compatibility with
    #                   version 0.126).
    #   $leftmargin   - optional left margin for the rendered text, default 0.
    #   $rightmargin  - optional right margin, default 120.
    # HTML input is rendered to plain text; any other content is returned
    # unchanged.
    my ($self, $response_ref, $leftmargin, $rightmargin) = @_;

    # Each margin falls back to its default on its own, so passing only the
    # left margin no longer leaves the right one undefined.
    $leftmargin  = 0   unless defined $leftmargin;
    $rightmargin = 120 unless defined $rightmargin;

    # A reference to a plain string reports ref type 'SCALAR' ('REF' only when
    # the referenced value is itself a reference), so both are accepted and
    # unwrapped until the string itself is reached.
    my $reftype = ref($response_ref);
    if ($reftype eq 'SCALAR' or $reftype eq 'REF') {
        my $html = $response_ref;
        $html = $$html while ref($html) eq 'SCALAR' or ref($html) eq 'REF';
        return _html_to_text($html, $leftmargin, $rightmargin);
    }

    if ($self->is_html($response_ref)) {
        return _html_to_text($response_ref->content, $leftmargin, $rightmargin);
    }

    # Non-HTML data is compared as it is.
    my $content = $response_ref->content;
    return \$content;
}

# Renders an HTML string as plain text within the given margins and returns
# a reference to the result.
sub _html_to_text {
    my ($html, $leftmargin, $rightmargin) = @_;

    # The formatter is used by this file but not loaded by it.
    require HTML::FormatText;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    $tree->eof;    # signal end of input so pending text is flushed into the tree
    my $formatter = HTML::FormatText->new(leftmargin  => $leftmargin,
                                          rightmargin => $rightmargin);
    my $text = $formatter->format($tree);
    $tree->delete;    # release the tree explicitly
    return \$text;
}
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=head2 get_hash_cache_key |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
(Private method) |
|
292
|
|
|
|
|
|
|
Return a hash key that stores information about the entire visible part or the URL. |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=cut |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub get_hash_cache_key {
    # (Private method) Builds the cache key of a url's index entry by
    # prepending $HASH_KEY_PREFIX to the url.
    my ($self, $url) = @_;
    my $key = $HASH_KEY_PREFIX . $url;
    return $key;
}
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head2 get_cache_hash |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
(Private Method) |
|
305
|
|
|
|
|
|
|
Returns all urls which were last cached. |
|
306
|
|
|
|
|
|
|
return true if the url was previously hashed. |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=cut |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub get_cache_hash {
    # (Private method) Loads every page recorded in the cache for $url.
    #   $url            - the monitored url.
    #   $is_cached_site - scalar reference; set to 1 when the url has an index
    #                     entry in the cache and to 0 otherwise.
    # Returns 0 when the url has no index entry. Otherwise returns a hash ref
    # mapping each page key listed in the index entry to the parsed
    # HTTP::Response, or to a reference to the raw cached string when parsing
    # yields a false value (compatibility with version 0.126).
    my ($self, $url, $is_cached_site) = @_;
    my $cache = $self->{cache};

    $$is_cached_site = 1;
    my $index_key = $self->get_hash_cache_key($url);
    unless ($cache->exists($index_key)) {
        $$is_cached_site = 0;
        return 0;
    }

    # The index entry lists the page keys joined by $HASH_SEPARATOR.
    my %pages;
    for my $page_key (split($HASH_SEPARATOR, $cache->get($index_key))) {
        my $raw    = $cache->get($page_key);
        my $parsed = HTTP::Response->parse($raw);
        $pages{$page_key} = $parsed ? $parsed : \$raw;
    }
    return \%pages;
}
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=head2 store_validity |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
(Private method) |
|
332
|
|
|
|
|
|
|
Store current time in the main hash key |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub store_validity {
    # (Private method) Stamps the url's index entry in the cache with the
    # current time, provided that entry exists. Always returns 1.
    my ($self, $url) = @_;
    my $cache     = $self->{cache};
    my $index_key = $self->get_hash_cache_key($url);

    if ($cache->exists($index_key)) {
        $cache->set_validity($index_key, time());
    }
    return 1;
}
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head2 validity |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
(private method) |
|
348
|
|
|
|
|
|
|
Retrieve the validity date of a stored url. |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
sub validity {
    # (Private method) Returns what the cache's validity method reports for
    # the url's index entry, or 0 when the cache has no such entry.
    my ($self, $url) = @_;
    my $cache     = $self->{cache};
    my $index_key = $self->get_hash_cache_key($url);

    return 0 unless $cache->exists($index_key);
    return $cache->validity($index_key);
}
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=head2 store_cache_hash |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
Store General information of a web address, including all frames and dates. |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=cut |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
sub store_cache_hash {
    # Records the current state of a url in the cache.
    #   $url          - the monitored url.
    #   $data         - hash ref of all current pages; its keys, joined by
    #                   $HASH_SEPARATOR, become the url's index entry.
    #   $added_data   - hash ref of pages to store; each value must offer
    #                   as_string().
    #   $deleted_data - hash ref of pages to purge from the cache.
    # Always returns 1.
    my ($self, $url, $data, $added_data, $deleted_data) = @_;
    my $cache     = $self->{cache};
    my $index_key = $self->get_hash_cache_key($url);

    # The index entry is built once (it used to be joined twice, once into an
    # unused variable).
    $cache->set($index_key, join($HASH_SEPARATOR, keys %$data));

    # Iterating over keys() starts from a fresh iterator, so no entry can be
    # skipped because of an earlier, unfinished each() on the same hash.
    for my $key (keys %$added_data) {
        $cache->set($key, $added_data->{$key}->as_string);
        $cache->set_validity($key, time());
    }
    for my $key (keys %$deleted_data) {
        $cache->purge($key, $deleted_data->{$key});
    }
    return 1;
}
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 sync_cache |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
(Private method) |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=cut |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
#sync_cache (Private method) takes newly retrieved data, and stores and compares it with |
|
392
|
|
|
|
|
|
|
# the cache data. That is, It returns as follows: |
|
393
|
|
|
|
|
|
|
# might_be_changed - Urls that are included in the retrieved pages and are in the cache. |
|
394
|
|
|
|
|
|
|
# Those pages are potentially changed, and therefore should be examined by HTML comparison. |
|
395
|
|
|
|
|
|
|
# deleted_data - Pages which exist in the cache and not in the new set. |
|
396
|
|
|
|
|
|
|
# added_data - Pages which exist only in the new version. |
|
397
|
|
|
|
|
|
|
# In addition, the sub purges all deleted pages from cache and stores the added pages. |
|
398
|
|
|
|
|
|
|
# Due to performance reasons, all the "might_be_changed" pages are not cached. |
|
399
|
|
|
|
|
|
|
# This is left for the caller to do. |
|
400
|
|
|
|
|
|
|
sub sync_cache {
    # (Private method) Merges freshly retrieved pages with the cached ones.
    #   $url           - the monitored url.
    #   $new_data_http - hash ref of retrieved responses keyed by url.
    # Stores the result through store_cache_hash (dies when that fails) and
    # returns, in this order:
    #   array ref of keys present both in the cache and in the new data,
    #   array ref of the cached pages for those keys,
    #   array ref of the new responses for those keys,
    #   hash ref of pages found only in the cache,
    #   hash ref of pages found only in the new data,
    #   the "url was already cached" flag set by get_cache_hash.
    my ($self, $url, $new_data_http) = @_;

    my $is_cached_site;
    my $old_data = $self->get_cache_hash($url, \$is_cached_site);

    my (%added, %deleted);
    my (@compare_keys, @old_pages, @new_pages);

    my @new_keys = sort keys %$new_data_http;
    my @old_keys = $old_data ? (sort keys %$old_data) : ();

    # Walk both sorted key lists in parallel.
    my ($n, $o) = (0, 0);
    while ($n < @new_keys and $o < @old_keys) {
        my $new_key = $new_keys[$n];
        my $old_key = $old_keys[$o];

        if ($new_key eq $old_key) {
            # Present on both sides: queue for comparison unless the
            # response says it was not modified.
            my $fresh = $new_data_http->{$new_key};
            if ($fresh->code() != RC_NOT_MODIFIED) {
                push @old_pages,    $old_data->{$old_key};
                push @new_pages,    $fresh;
                push @compare_keys, $new_key;
            }
            ++$o;
            ++$n;
        }
        elsif ($new_key lt $old_key) {
            # Only in the new data.
            $added{$new_key} = $new_data_http->{$new_key};
            ++$n;
        }
        else {
            # Only in the cache.
            $deleted{$old_key} = $old_data->{$old_key};
            ++$o;
        }
    }

    # Whatever is left on either side has no counterpart.
    for my $new_key (@new_keys[$n .. $#new_keys]) {
        $added{$new_key} = $new_data_http->{$new_key};
    }
    for my $old_key (@old_keys[$o .. $#old_keys]) {
        $deleted{$old_key} = $old_data->{$old_key};
    }

    $self->store_cache_hash($url, $new_data_http, \%added, \%deleted)
        or die ("Cannot store $url in cache");

    return (\@compare_keys, \@old_pages, \@new_pages,
            \%deleted, \%added, $is_cached_site);
}
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 get_url_data |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
(Private method) |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=cut |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# get_url_data recurses over all pages which construct a given web page--including all type |
|
453
|
|
|
|
|
|
|
# of included frames and dynamic pages--and retrieves them into a given hash reference |
|
454
|
|
|
|
|
|
|
# $response. |
|
455
|
|
|
|
|
|
|
sub get_url_data {
    # (Private method) Retrieves $url and, recursively, every page linked
    # from it through frame/iframe tags or tags matching /meta/.
    #   $mechanize - object used for the requests; request(), _update_page()
    #                and find_all_links() are called on it.
    #   $url       - address to retrieve.
    #   $responses - hash ref filled with the responses, keyed by url.
    # Sets $self->{status}; on failure stores the status line in
    # $self->{error}. Returns 1 on success, 0 as soon as any retrieval fails.
    my ($self, $mechanize, $url, $responses) = @_;
    my $cache = $self->{cache};

    my $r = HTTP::Request->new('GET', $url);
    # Only allow "identity" for the time being
    $r->header('Accept-Encoding', 'identity');

    # Ask only for changes when the cache knows when the page was stored.
    if ($cache->exists($url)) {
        my $validity = $cache->validity($url);
        if ($validity) {
            # This file never loads HTTP::Date itself.
            require HTTP::Date;
            $r->header('If-Modified-Since' => HTTP::Date::time2str($validity));
        }
    }
    my $response = $mechanize->request($r);

    if ($response->code() == 304) {
        # Not modified: continue with the cached copy of the page.
        $response = HTTP::Response->parse($cache->get($url));
        $mechanize->_update_page($r, $response);
        # This path used to leave the status untouched, so success() could
        # report a stale or undefined value after a successful run.
        $self->{status} = 1;
    }
    elsif (!($self->{status} = $response->is_success())) {
        $self->{error} = $response->status_line;
        return 0;
    }
    $responses->{$url} = $response;

    # Collect the links of embedded pages.
    my @frames;
    for my $tag_regex (qr/^([ia]?frame)$/i, qr/meta/) {
        my $links = $mechanize->find_all_links(tag_regex => $tag_regex);
        push @frames, @$links if $links;
    }

    for my $link (@frames) {
        my $frame_url = $link->url_abs;
        next unless $frame_url =~ m%^http.*//%;
        next if exists $responses->{$frame_url};    # already retrieved
        $self->get_url_data($mechanize, $frame_url, $responses) or return 0;
    }
    return 1;
}
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head2 success |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
return true upon success of the last run execution. |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=cut |
|
498
|
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
sub success {
    # Accessor: the value stored under the object's status key.
    my ($self) = @_;
    return $self->{status};
}
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=head1 AUTHOR |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
Yaron Kahanovitch, C<< >> |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=head1 BUGS |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
|
512
|
|
|
|
|
|
|
C, or through the web interface at |
|
513
|
|
|
|
|
|
|
L. |
|
514
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
|
515
|
|
|
|
|
|
|
your bug as I make changes. |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head1 SUPPORT |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. perldoc WWW::Monitor |
|
520
|
|
|
|
|
|
|
You can also look for information at: |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=over 4 |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
L |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=item * CPAN Ratings |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
L |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
L |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=item * Search CPAN |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
L |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=back |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=head1 ACKNOWLEDGMENTS |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Copyright 2007 Yaron Kahanovitch, all rights reserved. |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
549
|
|
|
|
|
|
|
under the same terms as Perl itself. |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
1; # End of WWW::Monitor::Task |