line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#WWW/monitor.pm. Written in 2007 by Yaron Kahanovitch. This |
2
|
|
|
|
|
|
|
# source code has been placed in the public domain by the author. |
3
|
|
|
|
|
|
|
# Please be kind and preserve the documentation. |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package WWW::Monitor::Task; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#use 5.008; |
10
|
1
|
|
|
1
|
|
20573
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
11
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
30
|
|
12
|
1
|
|
|
1
|
|
958
|
use HTTP::Response; |
|
1
|
|
|
|
|
31425
|
|
|
1
|
|
|
|
|
32
|
|
13
|
1
|
|
|
1
|
|
956
|
use HTTP::Request; |
|
1
|
|
|
|
|
689
|
|
|
1
|
|
|
|
|
22
|
|
14
|
1
|
|
|
1
|
|
6
|
use HTTP::Headers; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
20
|
|
15
|
1
|
|
|
1
|
|
4
|
use HTTP::Status; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
314
|
|
16
|
1
|
|
|
1
|
|
1268
|
use HTML::TreeBuilder; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
13
|
|
17
|
|
|
|
|
|
|
#use Carp; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Module metadata. The class inherits from Exporter but exports nothing.
our $VERSION   = 0.24;
our @ISA       = ('Exporter');
our @EXPORT    = ();
our @EXPORT_OK = ();

# $HASH_SEPARATOR joins the page keys stored in a URL's index entry;
# $HASH_KEY_PREFIX is prepended to a URL to form the key of that index entry.
our $HASH_SEPARATOR  = "\n";
our $HASH_KEY_PREFIX = "__HASH_KEY__";
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 NAME |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
WWW::Monitor::Task - A Task class for monitoring single web page |
37
|
|
|
|
|
|
|
against a cached version. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 VERSION |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Version 0.24 |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 Description |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
This class is responsible for tracking a single web page and reporting |
48
|
|
|
|
|
|
|
changes. This class should be considered as a private asset of |
49
|
|
|
|
|
|
|
L. For details please refer to |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head1 EXPORT |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 FUNCTIONS |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head2 new |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
A constructor. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# Constructor.
#
# Arguments are given as OPTION => VALUE pairs:
#   URL   - address of the page to monitor; a warning is issued when absent.
#   CACHE - cache object, stored as-is for the other methods of this class.
# An odd-length argument list triggers a warning and is ignored entirely.
# Returns the new blessed object. May be called on a class name or on an
# existing instance.
sub new {
    my ($this, @options) = @_;

    # This file never imports Carp, so a bare carp() call dies with
    # "Undefined subroutine". Load Carp on demand and call it fully
    # qualified so that bad arguments only produce the intended warning.
    require Carp;

    my %arg;
    if (@options % 2) {
        Carp::carp("Parameters for WWW::Monitor::Task should be given as pair of 'OPTION'=>'VAL'");
    }
    else {
        %arg = @options;
    }
    Carp::carp("Url is not given") unless exists $arg{URL};

    my $class = ref($this) || $this;
    my $self  = {
        url   => $arg{URL},
        cache => $arg{CACHE},
    };
    return bless $self, $class;
}
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 run ( mechanize, carrier, ) |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Executes Task. Parameters: |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
mechanize - Web mechanize object. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
L assumes that the given object implements or |
84
|
|
|
|
|
|
|
inherits the WWW::Mechanize abstraction. See |
85
|
|
|
|
|
|
|
L. |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
carrier- Object which will conduct the notification; see L for details |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
cache - optional - A cache class. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Execute the task once.
#
#   $mechanize - object handed to get_url_data to fetch the pages.
#   $carrier   - object handed to be_notified to deliver notifications.
#   optional third argument - cache object; when true it replaces the one
#                             stored on the task.
#
# Returns 0 when get_url_data fails and 1 otherwise. Nothing is notified
# when sync_cache reports that the URL was not cached before.
sub run {
    my ($self, $mechanize, $carrier, @rest) = @_;
    $self->{error} = "";

    my $cache = @rest ? $rest[0] : "";
    $self->{cache} = $cache if $cache;

    my $url = $self->{url};

    # Fetched pages are collected in %response_for.
    my %response_for;
    return 0 unless $self->get_url_data($mechanize, $url, \%response_for);

    # Compare the fetched pages with the cache.
    my ($compare_keys, $old_pages, $new_pages, $missing_pages, $added_pages, $was_cached)
        = $self->sync_cache($url, \%response_for);

    # A page seen for the first time is not reported.
    return 1 unless $was_cached;

    $self->be_notified($carrier, $url, $missing_pages, $added_pages,
                       $old_pages, $new_pages, $compare_keys);
    return 1;
}
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head2 be_notified |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
(Private method) |
120
|
|
|
|
|
|
|
Tests if a page has changed. If yes, notification call back is being called. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=cut |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# (Private method) Decide whether anything changed and, if so, notify.
#
#   $carrier               - object whose notify($url, $self) is called.
#   $url                   - the monitored URL.
#   $missing_pages         - hash ref, stored as missing_parts.
#   $added_pages           - hash ref, stored as added_parts.
#   $old_pages_to_compare  - array ref of cached pages.
#   $new_pages_to_compare  - array ref of fetched pages (same indexes).
#   $url_keys_for_comapre  - array ref of the cache keys (same indexes).
#
# Pages whose format_html text differs are recorded in $self->{changed} as
# [old, new] and the new page is written to the cache. Returns 1 when there
# is nothing to report, otherwise the return value of $carrier->notify.
sub be_notified {
    my ($self, $carrier, $url, $missing_pages, $added_pages,
        $old_pages_to_compare, $new_pages_to_compare, $url_keys_for_comapre) = @_;
    my $cache = $self->{cache};

    # Any missing or added page is enough to notify.
    $self->{missing_parts} = $missing_pages;
    my $has_missing = scalar keys %$missing_pages;

    $self->{added_parts} = $added_pages;
    my $has_added = scalar keys %$added_pages;

    # Textual comparison of every page present on both sides.
    $self->{changed} = {};
    my $has_changed = 0;
    for my $index (0 .. $#{$old_pages_to_compare}) {
        my $old_page = $old_pages_to_compare->[$index];
        my $new_page = $new_pages_to_compare->[$index];
        my $url_key  = $url_keys_for_comapre->[$index];

        my $old_text = $self->format_html($old_page);
        my $new_text = $self->format_html($new_page);
        next if $$old_text eq $$new_text;

        $self->{changed}{$url_key} = [ $old_page, $new_page ];
        $cache->set($url_key, $new_page->as_string);
        $has_changed = 1;
    }

    return 1 unless $has_changed or $has_missing or $has_added;

    # This file never loads HTTP::Date itself; without this the calls below
    # only work if some other module happened to load it first.
    require HTTP::Date;
    $self->{time1} = HTTP::Date::time2str($self->validity($url));
    $self->{time2} = HTTP::Date::time2str(time());
    $self->store_validity($url, time());
    return $carrier->notify($url, $self);
}
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 is_html |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
(Private method) |
169
|
|
|
|
|
|
|
Return true if page is html |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# (Private method) Tell whether a response carries HTML.
#
#   $response - object providing header('Content-Type').
#
# Returns true when the Content-Type starts with "text/html". A response
# without a Content-Type header is reported as not HTML instead of matching
# against undef, and the comparison ignores case because media type names
# are case-insensitive.
sub is_html {
    my ($self, $response) = @_;
    my $content_type = $response->header('Content-Type');
    return unless defined $content_type;
    return $content_type =~ m%^text/html%i;
}
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 missing_parts |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Returns a hash reference holding the parts that exist only in the old cached version. Every value in the returned hash is a reference to an HTTP::Response object. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=cut |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# Accessor: the value be_notified stored under missing_parts.
sub missing_parts {
    my ($self) = @_;
    return $self->{missing_parts};
}
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 added_parts |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Returns a hash reference holding the parts that exist only in the new version. Every value in the returned hash is a reference to an HTTP::Response object. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# Accessor: the value be_notified stored under added_parts.
sub added_parts {
    my ($self) = @_;
    return $self->{added_parts};
}
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head2 old_version_time_stamp |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Return the time when the url was previously cached. The time is returned as a date string formatted by HTTP::Date::time2str. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=cut |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# Accessor: the time1 value (set by be_notified when it notifies).
sub old_version_time_stamp {
    my ($self) = @_;
    return $self->{time1};
}
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 new_version_time_stamp |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
Return the time when the url was queried. The time is returned as a date string formatted by HTTP::Date::time2str. |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# Accessor: the time2 value (set by be_notified when it notifies).
sub new_version_time_stamp {
    my ($self) = @_;
    return $self->{time2};
}
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head2 changed_parts |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Return a list of the url keys of all changed parts. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=cut |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Return the keys of the changed-pages hash filled in by be_notified.
# Each key can be passed to get_old_new_pair.
sub changed_parts {
    my ($self) = @_;
    return keys %{ $self->{changed} };
}
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 get_old_new_pair [ urls key ] |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Return a list of two elements. The first one is the old cached version and the second one is the new version. |
237
|
|
|
|
|
|
|
The given url key must be one of the keys returned by changed_parts method. |
238
|
|
|
|
|
|
|
Each element of the pair is a reference to an HTTP::Response object. |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=cut |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# Return the (old, new) pair recorded for $url_key, or 0 when no change
# was recorded under that key.
sub get_old_new_pair {
    my ($self, $url_key) = @_;
    return 0 unless exists $self->{changed}{$url_key};
    return @{ $self->{changed}{$url_key} };
}
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 format_html [ leftmargin, rightmargin] |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Return a textual version of HTML |
257
|
|
|
|
|
|
|
left and right margins set the margin for the returned data. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=cut |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Return a reference to a textual rendering of a page.
#
#   $response_ref - either a response object (is_html and content are
#                   called on it) or, for entries cached by version 0.126,
#                   a plain reference to the stored HTML string.
#   $leftmargin   - optional left margin, default 0.
#   $rightmargin  - optional right margin, default 120.
#
# HTML is rendered with HTML::FormatText; any other content is returned
# unchanged. The result is always a scalar reference.
sub format_html {
    my ($self, $response_ref, $leftmargin, $rightmargin) = @_;

    # Fall back to the defaults per argument, so that passing only a left
    # margin no longer replaces the right margin with undef.
    $leftmargin  = 0   unless defined $leftmargin;
    $rightmargin = 120 unless defined $rightmargin;

    # get_cache_hash stores legacy entries as \$string, for which ref()
    # reports 'SCALAR', not 'REF'. Testing only for 'REF' sent those
    # entries down the object path, where ->header is called on an
    # unblessed reference. 'REF' is still accepted for compatibility.
    my $reftype   = ref($response_ref);
    my $is_legacy = ($reftype eq 'SCALAR' or $reftype eq 'REF');

    my $html;
    if ($is_legacy) {
        # Parse the string itself rather than the stringified reference.
        $html = $$response_ref;
    }
    elsif ($self->is_html($response_ref)) {
        $html = $response_ref->content;
    }
    else {
        # We have non html data
        my $content = $response_ref->content;
        return \$content;
    }
    $html = '' unless defined $html;

    # This file never loads HTML::FormatText itself.
    require HTML::FormatText;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    $tree->eof;    # tell the parser the document is complete
    my $formatter = HTML::FormatText->new(leftmargin => $leftmargin, rightmargin => $rightmargin);
    my $text = $formatter->format($tree);
    $tree->delete; # release the tree once it has been rendered
    return \$text;
}
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=head2 get_hash_cache_key |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
(Private method) |
292
|
|
|
|
|
|
|
Return the cache key that stores information about the entire visible part of the URL. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=cut |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# (Private method) Build the cache key of the index entry for $url:
# the package-level $HASH_KEY_PREFIX followed by the URL.
sub get_hash_cache_key {
    my ($self, $url) = @_;
    return $HASH_KEY_PREFIX . $url;
}
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head2 get_cache_hash |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
(Private Method) |
305
|
|
|
|
|
|
|
Returns all urls which were last cached. |
306
|
|
|
|
|
|
|
return true if the url was previously hashed. |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=cut |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# (Private method) Load every page cached for $url.
#
#   $url            - the monitored URL.
#   $is_cached_site - scalar reference; set to 1 when the URL has an index
#                     entry in the cache and to 0 otherwise.
#
# Returns 0 when the URL has no index entry. Otherwise returns a hash ref
# mapping each page key listed in the index to the parsed HTTP::Response,
# or to a reference to the raw cached string when parsing yields a false
# value (backward compatibility with version 0.126).
sub get_cache_hash {
    my ($self, $url, $is_cached_site) = @_;
    my $cache = $self->{cache};

    $$is_cached_site = 1;
    my $index_key = $self->get_hash_cache_key($url);
    unless ($cache->exists($index_key)) {
        $$is_cached_site = 0;
        return 0;
    }

    my %page_for;
    for my $page_key (split($HASH_SEPARATOR, $cache->get($index_key))) {
        my $stored = $cache->get($page_key);
        my $parsed = HTTP::Response->parse($stored);
        $page_for{$page_key} = $parsed ? $parsed : \$stored;
    }
    return \%page_for;
}
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=head2 store_validity |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
(Private method) |
332
|
|
|
|
|
|
|
Store current time in the main hash key |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# (Private method) Record a validity time on the index entry of $url.
#
#   $url  - the monitored URL.
#   $time - optional time to store; defaults to the current time. The
#           caller in this file already passes one, which used to be
#           ignored.
#
# Nothing is written when the URL has no index entry. Always returns 1.
sub store_validity {
    my ($self, $url, $time) = @_;
    $time = time() unless defined $time;

    my $cache    = $self->{cache};
    my $hash_key = $self->get_hash_cache_key($url);
    $cache->set_validity($hash_key, $time) if $cache->exists($hash_key);
    return 1;
}
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head2 validity |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
(private method) |
348
|
|
|
|
|
|
|
Retrieve the validity date stored for the given url. |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# (Private method) Return the validity the cache holds for the index entry
# of $url, or 0 when the URL has no index entry.
sub validity {
    my ($self, $url) = @_;
    my $cache    = $self->{cache};
    my $hash_key = $self->get_hash_cache_key($url);
    return $cache->exists($hash_key) ? $cache->validity($hash_key) : 0;
}
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=head2 store_cache_hash |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
Store General information of a web address, including all frames and dates. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=cut |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# Write the state of a URL to the cache.
#
#   $url          - the monitored URL.
#   $data         - hash ref; its keys become the URL's index entry.
#   $added_data   - hash ref of key => response; each response is stored
#                   via as_string and stamped with the current time.
#   $deleted_data - hash ref of key => value; each pair is passed to the
#                   cache's purge method.
#
# Always returns 1.
sub store_cache_hash {
    my ($self, $url, $data, $added_data, $deleted_data) = @_;
    my $cache     = $self->{cache};
    my $index_key = $self->get_hash_cache_key($url);

    # The index entry lists every page key of the URL.
    my $index = join($HASH_SEPARATOR, keys %$data);
    $cache->set($index_key, $index);

    for my $added_key (keys %$added_data) {
        $cache->set($added_key, $added_data->{$added_key}->as_string);
        $cache->set_validity($added_key, time());
    }

    for my $deleted_key (keys %$deleted_data) {
        $cache->purge($deleted_key, $deleted_data->{$deleted_key});
    }
    return 1;
}
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 sync_cache |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
(Private method) |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=cut |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# sync_cache (private method) compares newly retrieved pages with the
# cached ones and updates the cache index.
#
#   $url           - the monitored URL.
#   $new_data_http - hash ref of key => freshly fetched response.
#
# Returns, in this order:
#   - array ref of keys present on both sides whose new response code is
#     not RC_NOT_MODIFIED; these pages may have changed and are left for
#     the caller to compare and to cache,
#   - array ref of the cached pages for those keys,
#   - array ref of the new pages for those keys,
#   - hash ref of deleted pages (in the cache only),
#   - hash ref of added pages (in the new set only),
#   - the "was cached" flag reported by get_cache_hash.
# Added and deleted pages are handed to store_cache_hash, which stores and
# purges them; the method dies if that call returns false.
sub sync_cache {
    my ($self, $url, $new_data_http) = @_;

    my $is_cached_site;
    my $old_data = $self->get_cache_hash($url, \$is_cached_site);

    # get_cache_hash returns 0 for an unknown URL: treat it as no pages.
    my %old_page_for = $old_data ? %$old_data : ();

    my (%added_data, %deleted_data);
    my (@url_keys_for_comapre, @old_pages_to_compare, @new_pages_to_compare);

    # Classify every fetched page, in sorted key order.
    for my $new_key (sort keys %$new_data_http) {
        my $a_response = $new_data_http->{$new_key};
        unless (exists $old_page_for{$new_key}) {
            $added_data{$new_key} = $a_response;
            next;
        }
        next if $a_response->code() == RC_NOT_MODIFIED;
        push @old_pages_to_compare, $old_page_for{$new_key};
        push @new_pages_to_compare, $a_response;
        push @url_keys_for_comapre, $new_key;
    }

    # Whatever is cached but was not fetched again has been deleted.
    for my $old_key (sort keys %old_page_for) {
        next if exists $new_data_http->{$old_key};
        $deleted_data{$old_key} = $old_page_for{$old_key};
    }

    $self->store_cache_hash($url, $new_data_http, \%added_data, \%deleted_data)
        or die ("Cannot store $url in cache");
    return (\@url_keys_for_comapre, \@old_pages_to_compare, \@new_pages_to_compare,
            \%deleted_data, \%added_data, $is_cached_site);
}
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 get_url_data |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
(Private method) |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=cut |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# get_url_data (private method) fetches $url and, recursively, every page
# linked from it through frame, iframe and meta tags, storing each response
# in the hash reference $responses under its URL.
#
#   $mechanize - object used to issue the requests and to list the links.
#   $url       - address to fetch.
#   $responses - hash ref filled with url => response.
#
# When the cache holds a validity for the URL, the request is sent with an
# If-Modified-Since header, and a 304 answer is replaced by the cached
# response. Returns 1 on success. On a failed request it stores the status
# line in $self->{error} and returns 0.
sub get_url_data {
    my ($self, $mechanize, $url, $responses) = @_;
    my $cache = $self->{cache};

    my $r = HTTP::Request->new('GET', $url);
    # Only allow "identity" for the time being
    $r->header('Accept-Encoding', 'identity');
    if ($cache->exists($url)) {
        my $validity = $cache->validity($url);
        if ($validity) {
            # This file never loads HTTP::Date itself.
            require HTTP::Date;
            $r->header('If-Modified-Since' => HTTP::Date::time2str($validity));
        }
    }
    my $response = $mechanize->request($r);

    if ($response->code() == 304) {
        # Not modified: continue with the cached copy.
        $response = HTTP::Response->parse($cache->get($url));
        # NOTE(review): _update_page looks like a private method of the
        # mechanize object; presumably it makes the cached page current so
        # that find_all_links below inspects it. Confirm against its docs.
        $mechanize->_update_page($r, $response);
        # Previously this branch left the status of an earlier request in
        # place, so success() could report failure for a page served from
        # the cache.
        $self->{status} = 1;
    }
    elsif (!($self->{status} = $response->is_success())) {
        $self->{error} = $response->status_line;
        return 0;
    }
    $responses->{$url} = $response;

    # Collect the links of the page just fetched before recursing.
    my @frames;
    for my $tag_regex (qr/^([ia]?frame)$/i, qr/meta/) {
        my $output = $mechanize->find_all_links(tag_regex => $tag_regex);
        push @frames, @$output if ($output);
    }

    foreach my $link (@frames) {
        my $link_url = $link->url_abs;
        next unless ($link_url =~ m%^http.*//%);
        next if exists $responses->{$link_url};    # already fetched
        $self->get_url_data($mechanize, $link_url, $responses) or return 0;
    }
    return 1;
}
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head2 success |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
return true upon success of the last run execution. |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=cut |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# Accessor: the status flag recorded by get_url_data.
sub success {
    my ($self) = @_;
    return $self->{status};
}
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=head1 AUTHOR |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
Yaron Kahanovitch, C<< >> |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=head1 BUGS |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
512
|
|
|
|
|
|
|
C, or through the web interface at |
513
|
|
|
|
|
|
|
L. |
514
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
515
|
|
|
|
|
|
|
your bug as I make changes. |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head1 SUPPORT |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. perldoc WWW::Monitor |
520
|
|
|
|
|
|
|
You can also look for information at: |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=over 4 |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
L |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=item * CPAN Ratings |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
L |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
L |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=item * Search CPAN |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
L |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=back |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=head1 ACKNOWLEDGMENTS |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Copyright 2007 Yaron Kahanovitch, all rights reserved. |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
549
|
|
|
|
|
|
|
under the same terms as Perl itself. |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
1; # End of WWW::Monitor::Task |