line
stmt
bran
cond
sub
pod
time
code
1
package HTML::Miner ;
2
3
6
6
32613
use 5.006 ;
6
56
6
259
4
5
6
6
66
use strict ;
6
11
6
256
6
6
6
31
use warnings FATAL => 'all' ;
6
16
6
636
7
8
6
6
42
use Carp ;
6
10
6
658
9
10
6
6
34
use Exporter ;
6
10
6
38272
11
12
13
=head1 NAME
14
15
HTML::Miner - This Module 'Mines' (hopefully) useful information for an URL or HTML snippet.
16
17
=head1 VERSION
18
19
Version 1.02
20
21
=cut
22
23
our $VERSION = '1.03';
24
25
=head1 SYNOPSIS
26
27
HTML::Miner 'Mines' (hopefully) useful information for an URL or HTML snippet. The following is a
28
list of HTML elements that can be extracted:
29
30
=over 5
31
32
=item *
33
34
Find all links and for each link extract:
35
36
=over 7
37
38
=item URL Title
39
40
=item URL href
41
42
=item URL Anchor Text
43
44
=item URL Domain
45
46
=item URL Protocol
47
48
=item URL URI
49
50
=item URL Absolute location
51
52
=back
53
54
=item *
55
56
Find all images and for each image extract:
57
58
=over 3
59
60
=item IMG Source URL
61
62
=item IMG Absolute Source URL
63
64
=item IMG Source Domain
65
66
=back
67
68
=item *
69
70
Extracts Meta Elements such as
71
72
=over 4
73
74
=item Page Title
75
76
=item Page Description
77
78
=item Page Keywords
79
80
=item Page RSS Feeds
81
82
=back
83
84
=item *
85
86
Finds the final destination URL of a potentially redirecting URL.
87
88
=item *
89
90
Find all JS and CSS files used within the HTML and find their absolute URL if required.
91
92
=back
93
94
95
=head2 Example ( Object Oriented Usage )
96
97
use HTML::Miner;
98
99
my $html = "some html";
100
# or $html = do{local $/;}; with __DATA__ provided
101
102
my $html_miner = HTML::Miner->new (
103
104
CURRENT_URL => 'www.perl.org' ,
105
CURRENT_URL_HTML => $html
106
107
);
108
109
110
my $meta_data = $html_miner->get_meta_elements() ;
111
my $links = $html_miner->get_links() ;
112
my $images = $html_miner->get_images() ;
113
114
my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url();
115
116
my $css_and_js = $html_miner->get_page_css_and_js() ;
117
118
my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ;
119
120
my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
121
122
123
=head2 Example ( Direct access of Methods )
124
125
use HTML::Miner;
126
127
my $html = "some html";
128
# or $html = do{local $/;}; with __DATA__ provided
129
130
my $url = "http://www.perl.org";
131
132
my $meta_data = HTML::Miner::get_meta_elements( $url, $html ) ;
133
my $links = HTML::Miner::get_links( $url, $html ) ;
134
my $images = HTML::Miner::get_images( $url, $html ) ;
135
136
my ( $clear_url, $protocol, $domain, $uri ) = HTML::Minerbreak_url( $url );
137
138
my $css_and_js = get_page_css_and_js(
139
URL => $url ,
140
HTML => $optionally_html_of_url ,
141
CONVERT_URLS_TO_ABS => 0/1 , [ Optional argument, default is 1 ]
142
);
143
144
my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ;
145
146
my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
147
148
149
150
151
=head2 Test Data
152
153
__DATA__
154
155
156
157
SiteTitle
158
159
160
161
162
163
164
165
166
167
168
169
170
171
Link1
172
Link2
173
Link3
174
175
176
177
178
179
180
181
182
183
184
185
186
=head2 Example Output:
187
188
189
my $meta_data = $html_miner->get_meta_elements() ;
190
191
# $meta_data->{ TITLE } => "SiteTitle"
192
# $meta_data->{ DESC } => "desc of site"
193
# $meta_data->{ KEYWORDS }->[0] => "kw1"
194
# $meta_data->{ RSS }->[0]->{TYPE} => "application/atom+xml"
195
196
197
198
my $links = $html_miner->get_links();
199
200
# $links->[0]->{ DOMAIN } => "linkone.com"
201
# $links->[0]->{ ANCHOR } => "Link1"
202
# $links->[2]->{ ABS_URL } => "http://my_domain_to_mine.com/link3"
203
# $links->[1]->{ DOMAIN_IS_BASE } => 1
204
# $links->[1]->{ TITLE } => "title2"
205
206
207
208
my $images = $html_miner->get_images();
209
210
# $images->[0]->{ IMG_LOC } => "http://my_domain_to_mine.com/logo_plain.jpg"
211
# $images->[2]->{ ALT } => "link3"
212
# $images->[0]->{ IMG_DOMAIN } => "my_domain_to_mine.com"
213
# $images->[3]->{ ABS_LOC } => "http://my_domain_to_mine.com/image3.jpg"
214
215
216
217
my $css_and_js = $html_miner->get_page_css_and_js(
218
CONVERT_URLS_TO_ABS => 0
219
);
220
221
# $css_and_js will contain:
222
# {
223
# CSS => [
224
# "http://static.mycssdomain.com/frameworks/style/main.css",
225
# "/rel_cssfile.css",
226
# ],
227
# JS => [
228
# "http://static.myjsdomain.com/frameworks/barlesque.js",
229
# "http://js.revsci.net/gateway/gw.js?csid=J08781",
230
# "/about/rel_jsfile.js",
231
# ],
232
# }
233
234
235
my $css_and_js = $html_miner->get_page_css_and_js(
236
CONVERT_URLS_TO_ABS => 1
237
);
238
239
# $css_and_js will contain:
240
# {
241
# CSS => [
242
# "http://static.mycssdomain.com/frameworks/style/main.css",
243
# "http://www.perl.org/rel_cssfile.css",
244
# ],
245
# JS => [
246
# "http://static.myjsdomain.com/frameworks/barlesque.js",
247
# "http://js.revsci.net/gateway/gw.js?csid=J08781",
248
# "http://www.perl.org/about/rel_jsfile.js",
249
# ],
250
# }
251
252
253
254
my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url();
255
256
# $clear_url => "http://my_domain_to_mine.com/my_page_to_mine.pl"
257
# $protocol => "http"
258
# $domain => "my_domain_to_mine.com"
259
# $uri => "/my_page_to_mine.pl"
260
261
262
HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) => 'redirected_to'
263
264
265
266
my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
267
# $out => "http://www.perl.com/about/"
268
269
$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/index.html", "index2.html" );
270
# $out => "http://www.perl.com/help/faq/index2.html"
271
272
$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../index.html" );
273
# $out => "http://www.perl.com/index.html"
274
275
$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "/about/" );
276
# $out => "http://www.perl.com/about/"
277
278
$out = HTML::Miner::get_absolute_url( "www.perl.comhelp/faq/", "http://othersite.com" );
279
# $out => "http://othersite.com/"
280
281
282
283
284
=head1 EXPORT
285
286
This Module does not export anything through @EXPORT, however does export all externally
287
available functions through @EXPORT_OK
288
289
=cut
290
291
our @ISA = qw(Exporter);
292
293
our @EXPORT_OK = qw( get_links get_absolute_url break_url get_redirect_destination get_redirect_destination_thread_safe get_images get_meta_elements get_page_css_and_js );
294
295
=head1 SUBROUTINES/METHODS
296
297
The following functions are all available directly and through the HTML::Miner Object.
298
299
=head2 new
300
301
The constructor validates the input data and retrieves a URL if the HTML is not provided.
302
303
The constructor takes the following parameters:
304
305
my $foo = HTML::Miner->new (
306
CURRENT_URL => 'www.site_i_am_crawling.com/page_i_am_crawling.html' , # REQUIRED - 'new' will croak
307
# if this is not provided.
308
CURRENT_URL_HTML => 'long string here' , # Optional - Will be extracted
309
# from CURRENT_URL if not provided.
310
USER_AGENT => 'Perl_HTML_Miner/$VERSION' , # Optional - default:
311
# 'Perl_HTML_Miner/$VERSION'
312
TIMEOUT => 5 , # Optional - default: 5 ( Seconds )
313
314
DEBUG => 0 , # Optional - default: 0
315
316
);
317
318
=cut
319
320
sub new {
321
322
4
4
1
14088
my $class = shift;
323
324
4
10
my %parameter_hash;
325
326
4
11
my $count = @_;
327
328
4
25
my $useage_howto = "
329
330
Usage:
331
332
333
my \$foo = HTML::Miner->new (
334
CURRENT_URL => 'www.site_i_am_crawling.com/page_i_am_crawling.html' , # REQUIRED - 'new' will croak
335
# if this is not provided.
336
CURRENT_URL_HTML => 'long string here' , # Optional - Will be extracted
337
# from CURRENT_URL if not provided.
338
USER_AGENT => 'Perl_HTML_Miner/$VERSION' , # Optional - default:
339
# 'Perl_HTML_Miner/$VERSION'
340
TIMEOUT => 5 , # Optional - default: 5 ( Seconds )
341
342
DEBUG => 0 , # Optional - default: 0
343
344
);
345
346
";
347
348
4
50
18
unless( $count > 1 ) {
349
0
0
croak( $useage_howto );
350
} else {
351
4
24
%parameter_hash = @_;
352
}
353
354
355
## Require parameter.
356
croak( $useage_howto )
357
4
50
20
unless( $parameter_hash{ CURRENT_URL } ) ;
358
359
## Setting defaults unless parameters are set.
360
my $require_extract = 1
361
4
50
19
unless( $parameter_hash{ CURRENT_URL_HTML } ) ;
362
363
$parameter_hash{USER_AGENT} = 'Perl_HTML_Miner/'.$VERSION
364
4
50
46
unless( $parameter_hash{ USER_AGENT } ) ;
365
$parameter_hash{TIMEOUT} = 60
366
4
50
25
unless( $parameter_hash{ TIMEOUT } ) ;
367
368
$parameter_hash{DEBUG} = 0
369
4
50
23
unless( $parameter_hash{ DEBUG } ) ;
370
371
$parameter_hash{ABSOLUTE_ALL_CONTAINED_URLS} = 0
372
4
50
15
unless( $parameter_hash{ ABSOLUTE_ALL_CONTAINED_URLS } ) ;
373
374
375
## Require additional modules.
376
377
4
50
21
if( $require_extract ) {
378
379
0
0
eval {
380
0
0
require LWP::UserAgent ;
381
0
0
require HTTP::Request ;
382
0
0
0
}; croak( "LWP::UserAgent and HTTP::Request are required if the url is to be fetched!" )
383
if( $@ );
384
385
0
0
my $tmp;
386
0
0
( $parameter_hash{ CURRENT_URL }, $tmp, $tmp, $tmp ) = _convert_to_valid_url( $parameter_hash{ CURRENT_URL } );
387
388
$parameter_hash{ CURRENT_URL_HTML } =
389
_get_url_html(
390
$parameter_hash{ CURRENT_URL },
391
$parameter_hash{ USERAGENT },
392
$parameter_hash{ TIMEOUT }
393
0
0
);
394
395
}
396
397
## Check on the correctness of the input url.
398
399
my ( $url, $protocol, $domain_name, $uri ) =
400
4
23
_convert_to_valid_url( $parameter_hash{ CURRENT_URL } );
401
402
4
674
$parameter_hash{ CURRENT_URL } = $url;
403
404
my $self = {
405
406
CURRENT_URL => $parameter_hash{ CURRENT_URL } ,
407
408
CURRENT_URL_HTML => $parameter_hash{ CURRENT_URL_HTML } ,
409
410
USER_AGENT => $parameter_hash{ USER_AGENT } ,
411
TIMEOUT => $parameter_hash{ TIMEOUT } ,
412
413
DEBUG => $parameter_hash{ DEBUG } ,
414
415
ABSOLUTE_ALL_CONTAINED_URLS => $parameter_hash{ ABSOLUTE_ALL_CONTAINED_URLS } ,
416
417
4
62
_REQUIRE_EXTRACT => $require_extract ,
418
_BASE_PROTOCOL => $protocol ,
419
_BASE_DOMAIN => $domain_name ,
420
_BASE_URI => $uri
421
422
};
423
424
425
## Private and class data here.
426
427
## NONE
428
429
430
4
683
bless( $self, $class );
431
432
4
50
589
if( $self->{ DEBUG } == 1 ) {
433
0
0
print STDERR "HTML::Miner Object: \n" ;
434
0
0
print "$self"; ;
435
}
436
437
4
37
return $self;
438
439
}
440
441
442
=head2 get_links
443
444
This function extracts all URLs from a web page.
445
446
B
447
448
When called on an HTML::Miner Object :
449
450
$retun_element = $html_miner->get_links();
451
452
When called directly :
453
454
$retun_element = get_links( $url, $optionally_html_of_url );
455
456
The direct call is intended to be a simplified version of OO call
457
and so does not allow for customization of the useragent and so on!
458
459
460
B
461
462
This function ( regardless of how its called ) returns a pointer to an Array of Hashes who's structure is as follows:
463
464
$->Array(
465
Hash->{
466
"URL" => "extracted url" ,
467
"ABS_EXISTS" => "0_if_abs_url_extraction_failed" ,
468
"ABS_URL" => "absolute_location_of_extracted_url" ,
469
"TITLE" => "title_of_this_url" ,
470
"ANCHOR" => "anchor_text_of_this_url" ,
471
"DOMAIN" => "domain_of_this_url" ,
472
"DOMAIN_IS_BASE" => "1_if_this_domain_same_as_base_domain ,
473
"PROTOCOL" => "protocol_of_this_domain" ,
474
"URI" => "URI_of_this_url" ,
475
},
476
...
477
)
478
479
So, to access the title of the second URL found you would use (yes the order is maintained):
480
481
@{ $retun_element }[1]->{ TITLE }
482
483
B
484
485
If ABS_EXISTS is 0 then DOMAIN, DOMAIN_IS_BASE, PROTOCOL and URI will be undefined
486
487
To extract URLs from a HTML snippet when one does not care about the url of that page, simply pass some garbage as the URL
488
and ignore everything except URL, TITLE and ANCHOR
489
490
"ANCHOR" might contain HTML such as , use HTML::Strip if required.
491
492
=cut
493
494
sub get_links {
495
496
5
5
1
22757
my $tmp = shift ;
497
498
5
13
my $self ;
499
my $url ;
500
0
0
my $html ;
501
502
0
0
my @result_arr ;
503
504
5
19
my $user_agent = "Html_Miner/$VERSION" ;
505
5
8
my $timeout = 60 ;
506
507
508
## First extract all required information.
509
510
5
100
42
if( UNIVERSAL::isa( $tmp, 'HTML::Miner' ) ) {
511
512
3
6
$self = $tmp ;
513
514
3
9
$url = $self->{ CURRENT_URL } ;
515
3
5
$html = $self->{ CURRENT_URL_HTML } ;
516
517
} else {
518
519
2
6
$url = $tmp ;
520
521
## Check for validity of url!
522
2
10
my ( $tmp, $protocol, $domain_name, $uri ) =
523
_convert_to_valid_url( $url ) ;
524
2
5
$url = $tmp ;
525
526
2
8
my @params = @_ ;
527
2
4
my $html_has_been_passed = @params ;
528
529
530
2
50
9
if( $html_has_been_passed ) {
531
2
7
$html = shift ;
532
} else {
533
534
## Need to retrieve html
535
536
0
0
eval {
537
0
0
require LWP::UserAgent ;
538
0
0
require HTTP::Request ;
539
};
540
0
0
0
croak( "LWP::UserAgent and HTTP::Request are required if the url is to be fetched!" )
541
if( $@ );
542
543
544
0
0
$html = _get_url_html( $url, $user_agent, $timeout ) ;
545
546
} ## HTML Not passed
547
548
549
} ## Not called on Object.
550
551
552
## Now start extracting the URLs
553
554
5
504
while( $html =~ m/(<\s*?a\s+?href\s*?=(\"|\')([^(\"|\')]*?)(\"|\')([^>]*?)>(.*?)<\s*?\/a\s*?>)/gis ){
555
556
797
1387
my $this_url = $3 ;
557
797
1097
my $this_anchor = $6 ;
558
559
797
1542
my $match = $1 ;
560
797
876
my $this_title = "" ;
561
797
100
3119
if( $match =~ m/title=(\"|\')([^(\"|\')]*?)(\"|\')/is ) {
562
139
246
$this_title = $2;
563
}
564
565
797
865
my $this_abs_url = "" ;
566
797
787
my $this_abs_url_exists = 1 ;
567
797
813
eval{
568
569
797
1245
$this_abs_url = get_absolute_url( $url, $this_url );
570
571
797
50
1484
}; $this_abs_url_exists = 0 if( $@ );
572
573
797
769
my $this_domain ;
574
my $this_domain_is_base_domain ;
575
0
0
my $this_protocol ;
576
0
0
my $this_uri ;
577
797
50
1349
if( $this_abs_url_exists ) {
578
579
797
726
my $tmp;
580
797
814
eval {
581
797
1202
( $tmp, $this_protocol, $this_domain, $this_uri ) =
582
_convert_to_valid_url( $this_abs_url ) ;
583
797
50
1980
}; $this_abs_url_exists = 0 if( $@ );
584
585
586
797
733
my ( $protocol, $domain, $uri );
587
797
799
eval {
588
797
1178
( $tmp, $protocol, $domain, $uri ) =
589
_convert_to_valid_url( $url ) ;
590
797
50
1990
}; croak( "Unexpected Error - Giving up!" ) if( $@ );
591
592
593
797
100
1691
$this_domain_is_base_domain = ( $domain eq $this_domain ) ? 1 : 0;
594
595
}
596
597
797
5624
my %this_url_hash = (
598
"URL" => $this_url ,
599
"ABS_EXISTS" => $this_abs_url_exists ,
600
"ABS_URL" => $this_abs_url ,
601
"TITLE" => $this_title ,
602
"ANCHOR" => $this_anchor ,
603
"DOMAIN" => $this_domain ,
604
"DOMAIN_IS_BASE" => $this_domain_is_base_domain ,
605
"PROTOCOL" => $this_protocol ,
606
"URI" => $this_uri
607
);
608
609
797
24569
push( @result_arr, \%this_url_hash );
610
611
}
612
613
614
5
171
return \@result_arr;
615
616
}
617
618
619
=head2 get_page_css_and_js
620
621
This function extracts all CSS style sheets and JS Script files use on a web page.
622
623
B
624
625
When called on an HTML::Miner Object :
626
627
$retun_element = $html_miner->get_page_css_and_js(
628
CONVERT_URLS_TO_ABS => 0/1 [ B argument, default is 1 ]
629
);
630
631
When called directly :
632
633
$retun_element = get_page_css_and_js(
634
URL => $url ,
635
HTML => $optionally_html_of_url , [ B argument, html extracted if not provided ]
636
CONVERT_URLS_TO_ABS => 0/1 , [ B argument, default is 1 ]
637
);
638
639
The direct call is intended to be a simplified version of OO call
640
and so does not allow for customization of the useragent and so on!
641
642
643
B
644
645
This function ( regardless of how its called ) returns a pointer to a Hash [ JS or CSS ] of Arrays containing the URLs
646
647
$->HASH->{
648
"CSS" => Array( "extracted url1", "extracted url2", .. )
649
"JS" => Array( "extracted url1", "extracted url2", .. )
650
}
651
652
So, to access the URL of the second CSS style sheet found you would use (again the order is maintained):
653
654
$$retun_element{ "CSS" }[1];
655
656
Or
657
$css_data = @{ $retun_element->{ "CSS" } } ;
658
$second_css_url_found = $css_data[1] ;
659
660
B
661
662
To extract CSS and JS links from a HTML snippet when one does not care about the url of that page, simply set CONVERT_URLS_TO_ABS to 0 and everything should be fine.
663
664
665
=cut
666
667
sub get_page_css_and_js {
668
669
4
4
1
2174
my $number_of_arguments = @_ ;
670
671
4
5
my $self ;
672
4
100
17
unless( int( $number_of_arguments / 2 ) * 2 == $number_of_arguments ) { # Odd number of elems, Must have been called on Obj.
673
2
3
$self = shift ;
674
}
675
676
4
12
my %params = @_ ;
677
678
4
50
11
$params{ CONVERT_URLS_TO_ABS } = 1 unless( defined( $params{ CONVERT_URLS_TO_ABS } ) );
679
680
4
4
my $url ;
681
my $html ;
682
683
4
9
my $user_agent = "Perl_Html_Miner/$VERSION" ;
684
4
5
my $timeout = 60 ;
685
686
## First extract all required information.
687
688
4
100
9
if( defined( $self ) ) {
689
2
50
8
if( UNIVERSAL::isa( $self, 'HTML::Miner' ) ) {
690
2
5
$url = $self->{ CURRENT_URL } ;
691
2
3
$html = $self->{ CURRENT_URL_HTML } ;
692
} else {
693
0
0
croak( "get_page_css_and_js called with params I can't understand!" );
694
}
695
} else {
696
697
2
4
$url = $params{ URL } ;
698
699
## Check for validity of url!
700
2
5
my ( $tmp, $protocol, $domain_name, $uri ) =
701
_convert_to_valid_url( $url ) ;
702
2
4
$url = $tmp ;
703
704
2
50
5
my $html_has_been_passed = defined( $params{ HTML } ) ? 1 : 0 ;
705
706
707
2
50
5
if( $html_has_been_passed ) {
708
2
4
$html = $params{ HTML } ;
709
} else {
710
711
## Need to retrieve html
712
713
0
0
eval {
714
0
0
require LWP::UserAgent ;
715
0
0
require HTTP::Request ;
716
};
717
0
0
0
croak( "LWP::UserAgent and HTTP::Request are required if the url is to be fetched!" )
718
if( $@ );
719
720
0
0
$html = _get_url_html( $url, $user_agent, $timeout ) ;
721
722
} ## HTML Not passed
723
724
725
} ## Not called on Object.
726
727
728
## Now start extracting the URLs
729
730
## CSS
731
732
4
5
my @css_files ;
733
4
110
while ( $html =~ m/(
734
8
48
my $css_url = $2 ;
735
8
100
22
if( $params{ CONVERT_URLS_TO_ABS } ) {
736
4
10
$css_url = get_absolute_url( $url, $2 ) ;
737
}
738
8
59
push @css_files, $css_url ;
739
}
740
741
742
743
## JS
744
745
4
5
my @js_files ;
746
4
57
while ( $html =~ m/(