blib/lib/WWW/Sucksub/Attila.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 19 | 21 | 90.4 |
branch | n/a | ||
condition | n/a | ||
subroutine | 7 | 7 | 100.0 |
pod | n/a | ||
total | 26 | 28 | 92.8 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package WWW::Sucksub::Attila; | ||||||
2 | |||||||
3 | =head1 NAME | ||||||
4 | |||||||
5 | WWW::Sucksub::Attila - automated access to attila french subtitles database | ||||||
6 | |||||||
7 | =head1 VERSION | ||||||
8 | |||||||
9 | Version 0.06 | ||||||
10 | |||||||
11 | =cut | ||||||
12 | |||||||
13 | our $VERSION = '0.06'; | ||||||
14 | |||||||
15 | =head1 SYNOPSIS | ||||||
16 | |||||||
17 | WWW::SuckSub::Attila is a web robot based on the WWW::Mechanize Module. | ||||||
18 | it parses distant web database specialised on french subtitles and build a dbm file | ||||||
19 | to store result ( film title - http link for subtitle file ). | ||||||
20 | The dbm file is used like a dictionnary you can update and use to do quick search. | ||||||
21 | |||||||
22 | use WWW::Sucksub::Attila; | ||||||
23 | my $test=WWW::Sucksub::Attila>new( | ||||||
24 | motif => $mot, | ||||||
25 | debug =>1, | ||||||
26 | logout => '/where/debug/file/is/written.txt', | ||||||
27 | dbfile=>'/where/dbm/file/is.db', | ||||||
28 | html=>'/where/html/report/will/be/written.html' | ||||||
29 | ); | ||||||
30 | $test->update(); #parse all site and collect subtitles http link | ||||||
31 | $test->search(); #search on local dbm file and produce html report | ||||||
32 | |||||||
33 | =head1 CONSTRUCTOR AND STARTUP | ||||||
34 | |||||||
35 | =head2 Attila Constructor | ||||||
36 | |||||||
37 | The new() constructor, is associated to default values : | ||||||
38 | you can modify these one as shown in the synopsis example. | ||||||
39 | Default value are these : | ||||||
40 | |||||||
41 | my $foo = WWW::Sucksub::Divxstation->new( | ||||||
42 | dbfile => "$ENV{HOME}"."/attila.db"; | ||||||
43 | html => "$ENV{HOME}"."/attila_repport.html"; | ||||||
44 | motif=> undef, | ||||||
45 | tempfile=> "$ENV{HOME}"."/.attila_tmp.html"; | ||||||
46 | debug=> 0, | ||||||
47 | logout => \*STDOUT | ||||||
48 | useragent=> "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007" | ||||||
49 | ); | ||||||
50 | |||||||
51 | The environnement variable $ENV{HOME} must exist unless you redefine the constructor value which need it. | ||||||
52 | |||||||
53 | |||||||
54 | =head3 new() constructor attributes and associated methods | ||||||
55 | |||||||
56 | All listed attributes can be modified by corresponding methods : | ||||||
57 | - set the attributes value when calling equivalent method whith args. | ||||||
58 | - get the attribute value when calling equivalent method whithout args. | ||||||
59 | |||||||
60 | $foo->WWW::Sucksub::Attila->new() | ||||||
61 | $foo->useragent() # get the useragent attribute value | ||||||
62 | $foo->useragent('tructruc') # set the useragent attribute value to 'tructruc' | ||||||
63 | |||||||
64 | =head4 motif() | ||||||
65 | |||||||
66 | you should here give a real value to this function : | ||||||
67 | if $foo->motif is undef, the package execution will be aborted | ||||||
68 | |||||||
69 | $foo->motif('xxx') | ||||||
70 | |||||||
71 | allows to precise that you're searching a word that contains 'xxx' | ||||||
72 | |||||||
73 | $foo->motif() | ||||||
74 | |||||||
75 | return the current value of the string you search. | ||||||
76 | |||||||
77 | =head4 debug() | ||||||
78 | |||||||
79 | WWW-Sucksub-Divxstation can produce a lot of interresting informations | ||||||
80 | The default value is "0" : that means that any debug informations will be written | ||||||
81 | on the output ( see the logout() method too.) | ||||||
82 | |||||||
83 | $foo->debug(0) # stop the product of debbugging informations | ||||||
84 | $foo->debug(1) # debug info will be written to the log file ( see logout() method) . | ||||||
85 | |||||||
86 | =head4 logout() | ||||||
87 | |||||||
88 | A log file can be defined to keep a trace of website parsing | ||||||
89 | You have to set $obj->debug(1) to get more detailled informations. | ||||||
90 | |||||||
91 | $foo->logout(); #get the current logout() value | ||||||
92 | $foo->logout('/home/xxx/log.txt') #set logout() value. | ||||||
93 | |||||||
94 | Note that default value is STDOUT | ||||||
95 | the logout() value can only be set in the new constructor. | ||||||
96 | |||||||
97 | =head4 dbfile() | ||||||
98 | |||||||
99 | define dbm file for store and retrieving extracted informations | ||||||
100 | you must provide au full path to the db file to store results | ||||||
101 | |||||||
102 | dbfile('/where/your/db/is.db') | ||||||
103 | |||||||
104 | The file will should be readable/writable. | ||||||
105 | |||||||
106 | =head4 html() | ||||||
107 | |||||||
108 | Define simple html output where to write search report. | ||||||
109 | you must provide au full path to the html file if you want to get an html output. | ||||||
110 | |||||||
111 | html('/where/the html/repport/is/written.html') | ||||||
112 | |||||||
113 | If $foo->html() is defined. you can get the value of this attribute like this : | ||||||
114 | |||||||
115 | my $html_page = $foo->html | ||||||
116 | |||||||
117 | Default value is automatically defined on the new() call. | ||||||
118 | |||||||
119 | html => "$ENV{HOME}"."/attila_report.html"; | ||||||
120 | |||||||
121 | html file will be used for reporting with search() methods | ||||||
122 | |||||||
123 | =head4 useragent() | ||||||
124 | |||||||
125 | arg should be a valid useragent. There's no reason to change this default value. | ||||||
126 | |||||||
127 | $foo->useragent() | ||||||
128 | |||||||
129 | return the value of the current useragent | ||||||
130 | |||||||
131 | $foo->useragent('xxxxxxxx') | ||||||
132 | |||||||
133 | set the useragent() value to ''xxxxxxxx'. | ||||||
134 | |||||||
135 | =head1 FUNCTIONS | ||||||
136 | |||||||
137 | these functions use the precedent attributes value. | ||||||
138 | |||||||
139 | =head2 search() | ||||||
140 | |||||||
141 | this function takes no arguments. | ||||||
142 | it allows to launch a local dbm search. | ||||||
143 | |||||||
144 | $foo-> search() | ||||||
145 | |||||||
146 | the dbm file is read to give you every couple (title,link) which corresponds to | ||||||
147 | the motif() pattern you defined before. | ||||||
148 | |||||||
149 | =head2 update() | ||||||
150 | |||||||
151 | this function takes no arguments. | ||||||
152 | it allows to initiate the distant search on the web site http://davidbillemont5.free.fr/ ( attila website) | ||||||
153 | the local dbm file is automatically written. Results are accumulated to the dbm file | ||||||
154 | you define on new() call . | ||||||
155 | Note that the update can take a while. | ||||||
156 | |||||||
157 | =head1 AUTHOR | ||||||
158 | |||||||
159 | Timothée Foucart, C<< |
||||||
160 | |||||||
161 | =head1 BUGS | ||||||
162 | |||||||
163 | Please report any bugs or feature requests to | ||||||
164 | C |
||||||
165 | L |
||||||
166 | I will be notified, and then you'll automatically be notified of progress on | ||||||
167 | your bug as I make changes. | ||||||
168 | |||||||
169 | =head1 SEE ALSO | ||||||
170 | |||||||
171 | =over 4 | ||||||
172 | |||||||
173 | =item * L |
||||||
174 | |||||||
175 | =item * L |
||||||
176 | |||||||
177 | =item * L |
||||||
178 | |||||||
179 | =item * L |
||||||
180 | |||||||
181 | =item * L |
||||||
182 | |||||||
183 | =back | ||||||
184 | |||||||
185 | =head1 COPYRIGHT & LICENSE | ||||||
186 | |||||||
187 | Copyright 2005 Timothée Foucart, all rights reserved. | ||||||
188 | |||||||
189 | This program is free software; you can redistribute it and/or modify it | ||||||
190 | under the same terms as Perl itself. | ||||||
191 | |||||||
192 | =cut | ||||||
193 | |||||||
194 | require Exporter; | ||||||
195 | 1 | 1 | 27797 | use vars qw(@ISA @EXPORT $VERSION); | |||
1 | 3 | ||||||
1 | 111 | ||||||
196 | @ISA = qw(Exporter); | ||||||
197 | @EXPORT=qw( debug dbfile | ||||||
198 | get_all_result html logout | ||||||
199 | motif search update useragent ); | ||||||
200 | |||||||
201 | #use warnings; | ||||||
202 | 1 | 1 | 980 | use utf8; | |||
1 | 10 | ||||||
1 | 7 | ||||||
203 | 1 | 1 | 32 | use warnings; | |||
1 | 6 | ||||||
1 | 28 | ||||||
204 | 1 | 1 | 5 | use strict; | |||
1 | 1 | ||||||
1 | 34 | ||||||
205 | 1 | 1 | 7 | use Carp; | |||
1 | 1 | ||||||
1 | 95 | ||||||
206 | 1 | 1 | 3280 | use WWW::Mechanize; | |||
1 | 253389 | ||||||
1 | 49 | ||||||
207 | # | ||||||
208 | 1 | 1 | 559 | use Alias qw(attr); | |||
0 | |||||||
0 | |||||||
209 | use vars qw( $site $nbres $base $debug $useragent $motif %sstsav $logout $fh $tempfile $dbfile $html ); | ||||||
210 | sub new{ | ||||||
211 | my $attila=shift; | ||||||
212 | my $classe= ref($attila) || $attila; | ||||||
213 | my $self={ }; | ||||||
214 | bless($self,$classe); | ||||||
215 | $self=$self->_init(@_); | ||||||
216 | logout($self->{logout}); | ||||||
217 | return $self; | ||||||
218 | }; | ||||||
219 | |||||||
220 | sub _init{ | ||||||
221 | my $self= attr shift; | ||||||
222 | # | ||||||
223 | # -- init default values | ||||||
224 | # | ||||||
225 | $self->{base} ="http://davidbillemont5.free.fr/"; | ||||||
226 | $self->{site} = "http://davidbillemont5.free.fr/Sous-Titres%200.htm"; | ||||||
227 | $self->{tempfile} = "$ENV{HOME}"."/.attila_tmp.html"; | ||||||
228 | $self->{useragent} ="Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007"; | ||||||
229 | $self->{motif} = undef; | ||||||
230 | $self->{debug} = 0; | ||||||
231 | $self->{logout} = \*STDOUT; | ||||||
232 | $self->{nbres} = 0; | ||||||
233 | $self->{sstsav} = {}; | ||||||
234 | $self->{dbfile} = "$ENV{HOME}"."/attila.db"; | ||||||
235 | $self->{html} = "$ENV{HOME}"."/attila_repport.html"; | ||||||
236 | # | ||||||
237 | # -- replace "forced" values | ||||||
238 | # | ||||||
239 | if (@_) | ||||||
240 | { | ||||||
241 | my %param=@_; | ||||||
242 | while (my($x,$y) =each(%param)){$self->{$x}=$y;}; | ||||||
243 | } | ||||||
244 | return $self; | ||||||
245 | }; | ||||||
246 | sub useragent { | ||||||
247 | my $self =attr shift; | ||||||
248 | if (@_) {$useragent=shift;} | ||||||
249 | return $useragent; | ||||||
250 | } | ||||||
251 | sub dbfile { | ||||||
252 | my $self =attr shift; | ||||||
253 | if (@_) {$dbfile=shift;} | ||||||
254 | return $dbfile; | ||||||
255 | } | ||||||
256 | sub html { | ||||||
257 | my $self =attr shift; | ||||||
258 | if (@_) {$html=shift;} | ||||||
259 | return $html; | ||||||
260 | } | ||||||
261 | sub debug { | ||||||
262 | my $self =attr shift; | ||||||
263 | if (@_) {$debug=shift;} | ||||||
264 | return $debug; | ||||||
265 | } | ||||||
266 | sub sstsav { | ||||||
267 | my $self =attr shift; | ||||||
268 | if (@_) {%sstsav=shift;} | ||||||
269 | return %sstsav; | ||||||
270 | } | ||||||
271 | sub get_all_result { | ||||||
272 | my $self =attr shift; | ||||||
273 | %sstsav=$self->sstsav(); | ||||||
274 | return %sstsav; | ||||||
275 | } | ||||||
276 | sub motif { | ||||||
277 | my $self =attr shift; | ||||||
278 | if (@_) {$motif=shift;} | ||||||
279 | return $motif; | ||||||
280 | } | ||||||
281 | sub logout { | ||||||
282 | #no update after first init | ||||||
283 | if (@_){$logout=shift; } | ||||||
284 | if ($logout) | ||||||
285 | { open(FH , ">>", $logout) or croak "$logout : $!\n"; | ||||||
286 | $fh=(\*FH);} | ||||||
287 | else | ||||||
288 | { $fh=(\*STDOUT);}; | ||||||
289 | return $logout; | ||||||
290 | } | ||||||
291 | sub update{ | ||||||
292 | my $self =attr shift; | ||||||
293 | my $mech = WWW::Mechanize->new(agent=>$useragent, | ||||||
294 | stack_depth => 1, | ||||||
295 | ); | ||||||
296 | print $fh "------------------------------------------------------------------------------------------------\n" if ($debug); | ||||||
297 | print $fh "[DEBUG] begin updating local database from $site at : ".localtime()."\n" if ($debug); | ||||||
298 | print $fh "------------------------------------------------------------------------------------------------\n" if ($debug); | ||||||
299 | |||||||
300 | my @update_base; | ||||||
301 | my $ipage=0; | ||||||
302 | my $attila_page; | ||||||
303 | $mech->get($site) or warn "[WARNING] http get problem on : $site !! \n"; | ||||||
304 | my $links=$mech->find_all_links(); | ||||||
305 | for ( my $ind=0; $ind <= $#{$links} ; $ind++) | ||||||
306 | { | ||||||
307 | if ($links->[$ind]->url_abs()=~m/Sous-Titres/m) | ||||||
308 | { | ||||||
309 | $ipage++; | ||||||
310 | print $fh "[DEBUG][SUBTITLE PAGE : $ipage ]\t".$links->[$ind]->url_abs()."\n" if $debug; | ||||||
311 | push @update_base,$links->[$ind]->url_abs(); | ||||||
312 | }; | ||||||
313 | }; | ||||||
314 | foreach $attila_page (@update_base) | ||||||
315 | { | ||||||
316 | if (-e ($tempfile)) | ||||||
317 | {unlink $tempfile or croak "can not suppress $tempfile : $!\n";}; | ||||||
318 | print $fh "[DEBUG] parsing : ".$attila_page ."\n"; | ||||||
319 | $mech->get($attila_page); | ||||||
320 | open (TAMPON,'>', $tempfile) or croak "can not open $tempfile:$!\n"; | ||||||
321 | print TAMPON $mech->response->as_string; | ||||||
322 | close TAMPON; | ||||||
323 | my %x=parse_attila($tempfile); | ||||||
324 | while (my ($k, $v) = each %x) | ||||||
325 | { $sstsav{$k}=$v;}; | ||||||
326 | save_dbm(%sstsav); | ||||||
327 | }; | ||||||
328 | |||||||
329 | print $fh "[DEBUG] update finished\n"; | ||||||
330 | return; | ||||||
331 | }; | ||||||
332 | # | ||||||
333 | # --- recherche du motif dans la db | ||||||
334 | sub search{ | ||||||
335 | my $self =attr shift; | ||||||
336 | $motif=$self->motif(); | ||||||
337 | if ($html) | ||||||
338 | { | ||||||
339 | open (HTMLFILE,">>",$html) or warn "can not access $html : $! \n"; | ||||||
340 | print HTMLFILE " html generated by suckSub perl module \n"; |
||||||
341 | print HTMLFILE "searching : ".$motif." on ".$site." \n"; |
||||||
342 | print HTMLFILE " ".localtime()." \n"; |
||||||
343 | }; | ||||||
344 | my %hashread; | ||||||
345 | return unless $motif; | ||||||
346 | print $fh " file db is : ". $dbfile."\n"; | ||||||
347 | unless (-e ($dbfile)) | ||||||
348 | {croak "[DEBUG SEARCH] db file ".$dbfile." not found \n maybe you should use update() method to build it ! \n";}; | ||||||
349 | use DB_File; | ||||||
350 | tie(%hashread,'DB_File',$dbfile) | ||||||
351 | or croak "can not access : $dbfile : $!\n"; | ||||||
352 | while (my ($k,$v)=each(%hashread)) | ||||||
353 | { | ||||||
354 | if ($v =~ m/$motif/i) | ||||||
355 | { | ||||||
356 | print $fh "[FOUND Libelle ] $v \n[FOUND LINK]".$k ."\n"; | ||||||
357 | if ($html) | ||||||
358 | { | ||||||
359 | print HTMLFILE "".$v." \n"; |
||||||
360 | $nbres++ | ||||||
361 | }; | ||||||
362 | |||||||
363 | }; | ||||||
364 | }; | ||||||
365 | untie(%hashread); | ||||||
366 | if ($html) | ||||||
367 | { | ||||||
368 | print HTMLFILE " ".$nbres." result(s) found \n"; |
||||||
369 | print HTMLFILE " html finished at ".localtime()." \n"; |
||||||
370 | close HTMLFILE; | ||||||
371 | }; | ||||||
372 | return; | ||||||
373 | }; | ||||||
374 | |||||||
375 | #--------------------------------------------------------------------------- | ||||||
376 | #-- save updated hash into dbm file | ||||||
377 | #-- internal use only | ||||||
378 | #--------------------------------------------------------------------------- | ||||||
379 | sub save_dbm{ | ||||||
380 | my $self =attr shift; | ||||||
381 | my %hashtosave; | ||||||
382 | use DB_File; | ||||||
383 | tie (%hashtosave,'DB_File',$dbfile ) | ||||||
384 | or croak "can not use $dbfile : $!\n"; | ||||||
385 | while (my ($k, $v) = each %sstsav) | ||||||
386 | { $hashtosave{$k}=$v;}; | ||||||
387 | untie(%hashtosave); | ||||||
388 | return; | ||||||
389 | }; | ||||||
390 | #--------------------------------------------------------------------------- | ||||||
391 | #--- parse one .htm page and extract label + info + link into memo hash | ||||||
392 | #-- internal use only | ||||||
393 | #--------------------------------------------------------------------------- | ||||||
394 | sub parse_attila{ | ||||||
395 | use HTML::Parser; | ||||||
396 | use vars qw( %hsav $top_label1 $label $endor ); | ||||||
397 | my $file=$_[0]; | ||||||
398 | $label=""; | ||||||
399 | $top_label1=0; #flag begin label or text to get | ||||||
400 | $endor=0;# flag end of row => re-init counters for states analyse | ||||||
401 | my $p = HTML::Parser->new(); | ||||||
402 | # | ||||||
403 | $p->handler( start => \&start_attila, "tagname,attr" ); | ||||||
404 | $p->handler( text => \&text_attila, "text" ); | ||||||
405 | $p->unbroken_text( 1 ); | ||||||
406 | $p->marked_sections( 0 ); | ||||||
407 | $p->ignore_elements(qw(script style)); | ||||||
408 | # | ||||||
409 | $p->parse_file($file); | ||||||
410 | $p->eof; | ||||||
411 | # | ||||||
412 | # | ||||||
413 | # | ||||||
414 | sub start_attila { | ||||||
415 | my ( $tag, $args ) = @_; | ||||||
416 | #--- searching 'td' tag -> verify width of each column | ||||||
417 | if ( $tag eq 'td' ) | ||||||
418 | { | ||||||
419 | return unless $args->{width}; | ||||||
420 | # french label and orig title in the array | ||||||
421 | if ( ($args->{width} eq '39%') && ($top_label1==0) ) | ||||||
422 | { $top_label1++;}; | ||||||
423 | if ( ($args->{width} eq '39%') && ($top_label1==1) ) | ||||||
424 | { $top_label1++;}; | ||||||
425 | # possible width variation | ||||||
426 | if ( ($args->{width} eq '38%') && ($top_label1==0) ) | ||||||
427 | { $top_label1++;}; | ||||||
428 | if ( ($args->{width} eq '38%') && ($top_label1==1) ) | ||||||
429 | { $top_label1++;}; | ||||||
430 | # number of cd width = 10-11% | ||||||
431 | if ( ($args->{width} eq '10%') && ($top_label1>0)) | ||||||
432 | { $top_label1++;$endor=1}; | ||||||
433 | if ( ($args->{width} eq '11%') && ($top_label1>0)) | ||||||
434 | { $top_label1++;$endor=1}; | ||||||
435 | } | ||||||
436 | #---searching sub links in html page | ||||||
437 | if (( $tag eq 'a' ) && ($args->{href})) | ||||||
438 | { | ||||||
439 | if ($args->{href} =~ m/Subs\// ) | ||||||
440 | { | ||||||
441 | $hsav{$base.$args->{href}}=$label; | ||||||
442 | #DEBUG#print "[DEBUG PARSER]". $args->{href} ." ===>".$label."\n"; | ||||||
443 | $label="";$top_label1=0; | ||||||
444 | |||||||
445 | }; | ||||||
446 | }; | ||||||
447 | }; | ||||||
448 | sub text_attila { | ||||||
449 | my $text= shift; | ||||||
450 | $text =~ tr/ //s; # nbsp html | ||||||
451 | $text =~ tr/ /_/s; # | ||||||
452 | $text =~ s/-/_/gi; | ||||||
453 | $text =~ s/\n//gi; # | ||||||
454 | $text =~ tr/_/_/s; # | ||||||
455 | if ($top_label1>0) | ||||||
456 | { | ||||||
457 | return if ($text eq "_"); # texte parasite | ||||||
458 | $label=$label."[".$text."]"; | ||||||
459 | $top_label1++; | ||||||
460 | if ($endor==1){$top_label1=0;$endor=0}; | ||||||
461 | #DEBUG#print "[DEBUG PARSER LABEL] ". $label ."\n"; | ||||||
462 | }; | ||||||
463 | return $label | ||||||
464 | }; | ||||||
465 | return %hsav; | ||||||
466 | } | ||||||
467 | # | ||||||
468 | |||||||
469 | |||||||
470 | |||||||
471 | 1; # End of WWW::Sucksub::Attila |