File Coverage

blib/lib/HTML/Miner.pm

Criterion	Covered	Total	%
statement	258	395	65.3
branch	67	154	43.5
condition	10	30	33.3
subroutine	15	18	83.3
pod	9	9	100.0
total	359	606	59.2

line	stmt	bran	sub	pod	time	code
1						package HTML::Miner ;
2
3	6		6		32613	use 5.006 ;
	6				56
	6				259
4
5	6		6		66	use strict ;
	6				11
	6				256
6	6		6		31	use warnings FATAL => 'all' ;
	6				16
	6				636
7
8	6		6		42	use Carp ;
	6				10
	6				658
9
10	6		6		34	use Exporter ;
	6				10
	6				38272
11
12
13						=head1 NAME
14
15						HTML::Miner - This Module 'Mines' (hopefully) useful information for an URL or HTML snippet.
16
17						=head1 VERSION
18
19						Version 1.02
20
21						=cut
22
23						our $VERSION = '1.03';
24
25						=head1 SYNOPSIS
26
27						HTML::Miner 'Mines' (hopefully) useful information for an URL or HTML snippet. The following is a
28						list of HTML elements that can be extracted:
29
30						=over 5
31
32						=item *
33
34						Find all links and for each link extract:
35
36						=over 7
37
38						=item URL Title
39
40						=item URL href
41
42						=item URL Anchor Text
43
44						=item URL Domain
45
46						=item URL Protocol
47
48						=item URL URI
49
50						=item URL Absolute location
51
52						=back
53
54						=item *
55
56						Find all images and for each image extract:
57
58						=over 3
59
60						=item IMG Source URL
61
62						=item IMG Absolute Source URL
63
64						=item IMG Source Domain
65
66						=back
67
68						=item *
69
70						Extracts Meta Elements such as
71
72						=over 4
73
74						=item Page Title
75
76						=item Page Description
77
78						=item Page Keywords
79
80						=item Page RSS Feeds
81
82						=back
83
84						=item *
85
86						Finds the final destination URL of a potentially redirecting URL.
87
88						=item *
89
90						Find all JS and CSS files used within the HTML and find their absolute URL if required.
91
92						=back
93
94
95						=head2 Example ( Object Oriented Usage )
96
97						use HTML::Miner;
98
99						my $html = "some html";
100						# or $html = do{local $/;}; with __DATA__ provided
101
102						my $html_miner = HTML::Miner->new (
103
104						CURRENT_URL => 'www.perl.org' ,
105						CURRENT_URL_HTML => $html
106
107						);
108
109
110						my $meta_data = $html_miner->get_meta_elements() ;
111						my $links = $html_miner->get_links() ;
112						my $images = $html_miner->get_images() ;
113
114						my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url();
115
116						my $css_and_js = $html_miner->get_page_css_and_js() ;
117
118						my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ;
119
120						my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
121
122
123						=head2 Example ( Direct access of Methods )
124
125						use HTML::Miner;
126
127						my $html = "some html";
128						# or $html = do{local $/;}; with __DATA__ provided
129
130						my $url = "http://www.perl.org";
131
132						my $meta_data = HTML::Miner::get_meta_elements( $url, $html ) ;
133						my $links = HTML::Miner::get_links( $url, $html ) ;
134						my $images = HTML::Miner::get_images( $url, $html ) ;
135
136						my ( $clear_url, $protocol, $domain, $uri ) = HTML::Minerbreak_url( $url );
137
138						my $css_and_js = get_page_css_and_js(
139						URL => $url ,
140						HTML => $optionally_html_of_url ,
141						CONVERT_URLS_TO_ABS => 0/1 , [ Optional argument, default is 1 ]
142						);
143
144						my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ;
145
146						my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
147
148
149
150
151						=head2 Test Data
152
153						__DATA__
154
155
156
157						SiteTitle
158
159
160
161
162
163
164
165
166
167
168
169
170
171						Link1
172						Link2
173						Link3
174
175
176
177
178
179
180
181
182
183
184
185
186						=head2 Example Output:
187
188
189						my $meta_data = $html_miner->get_meta_elements() ;
190
191						# $meta_data->{ TITLE } => "SiteTitle"
192						# $meta_data->{ DESC } => "desc of site"
193						# $meta_data->{ KEYWORDS }->[0] => "kw1"
194						# $meta_data->{ RSS }->[0]->{TYPE} => "application/atom+xml"
195
196
197
198						my $links = $html_miner->get_links();
199
200						# $links->[0]->{ DOMAIN } => "linkone.com"
201						# $links->[0]->{ ANCHOR } => "Link1"
202						# $links->[2]->{ ABS_URL } => "http://my_domain_to_mine.com/link3"
203						# $links->[1]->{ DOMAIN_IS_BASE } => 1
204						# $links->[1]->{ TITLE } => "title2"
205
206
207
208						my $images = $html_miner->get_images();
209
210						# $images->[0]->{ IMG_LOC } => "http://my_domain_to_mine.com/logo_plain.jpg"
211						# $images->[2]->{ ALT } => "link3"
212						# $images->[0]->{ IMG_DOMAIN } => "my_domain_to_mine.com"
213						# $images->[3]->{ ABS_LOC } => "http://my_domain_to_mine.com/image3.jpg"
214
215
216
217						my $css_and_js = $html_miner->get_page_css_and_js(
218						CONVERT_URLS_TO_ABS => 0
219						);
220
221						# $css_and_js will contain:
222						# {
223						# CSS => [
224						# "http://static.mycssdomain.com/frameworks/style/main.css",
225						# "/rel_cssfile.css",
226						# ],
227						# JS => [
228						# "http://static.myjsdomain.com/frameworks/barlesque.js",
229						# "http://js.revsci.net/gateway/gw.js?csid=J08781",
230						# "/about/rel_jsfile.js",
231						# ],
232						# }
233
234
235						my $css_and_js = $html_miner->get_page_css_and_js(
236						CONVERT_URLS_TO_ABS => 1
237						);
238
239						# $css_and_js will contain:
240						# {
241						# CSS => [
242						# "http://static.mycssdomain.com/frameworks/style/main.css",
243						# "http://www.perl.org/rel_cssfile.css",
244						# ],
245						# JS => [
246						# "http://static.myjsdomain.com/frameworks/barlesque.js",
247						# "http://js.revsci.net/gateway/gw.js?csid=J08781",
248						# "http://www.perl.org/about/rel_jsfile.js",
249						# ],
250						# }
251
252
253
254						my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url();
255
256						# $clear_url => "http://my_domain_to_mine.com/my_page_to_mine.pl"
257						# $protocol => "http"
258						# $domain => "my_domain_to_mine.com"
259						# $uri => "/my_page_to_mine.pl"
260
261
262						HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) => 'redirected_to'
263
264
265
266						my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
267						# $out => "http://www.perl.com/about/"
268
269						$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/index.html", "index2.html" );
270						# $out => "http://www.perl.com/help/faq/index2.html"
271
272						$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../index.html" );
273						# $out => "http://www.perl.com/index.html"
274
275						$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "/about/" );
276						# $out => "http://www.perl.com/about/"
277
278						$out = HTML::Miner::get_absolute_url( "www.perl.comhelp/faq/", "http://othersite.com" );
279						# $out => "http://othersite.com/"
280
281
282
283
284						=head1 EXPORT
285
286						This Module does not export anything through @EXPORT, however does export all externally
287						available functions through @EXPORT_OK
288
289						=cut
290
291						our @ISA = qw(Exporter);
292
293						our @EXPORT_OK = qw( get_links get_absolute_url break_url get_redirect_destination get_redirect_destination_thread_safe get_images get_meta_elements get_page_css_and_js );
294
295						=head1 SUBROUTINES/METHODS
296
297						The following functions are all available directly and through the HTML::Miner Object.
298
299						=head2 new
300
301						The constructor validates the input data and retrieves a URL if the HTML is not provided.
302
303						The constructor takes the following parameters:
304
305						my $foo = HTML::Miner->new (
306						CURRENT_URL => 'www.site_i_am_crawling.com/page_i_am_crawling.html' , # REQUIRED - 'new' will croak
307						# if this is not provided.
308						CURRENT_URL_HTML => 'long string here' , # Optional - Will be extracted
309						# from CURRENT_URL if not provided.
310						USER_AGENT => 'Perl_HTML_Miner/$VERSION' , # Optional - default:
311						# 'Perl_HTML_Miner/$VERSION'
312						TIMEOUT => 5 , # Optional - default: 5 ( Seconds )
313
314						DEBUG => 0 , # Optional - default: 0
315
316						);
317
318						=cut
319
320						sub new {
321
322	4		4	1	14088	my $class = shift;
323
324	4				10	my %parameter_hash;
325
326	4				11	my $count = @_;
327
328	4				25	my $useage_howto = "
329
330						Usage:
331
332
333						my \$foo = HTML::Miner->new (
334						CURRENT_URL => 'www.site_i_am_crawling.com/page_i_am_crawling.html' , # REQUIRED - 'new' will croak
335						# if this is not provided.
336						CURRENT_URL_HTML => 'long string here' , # Optional - Will be extracted
337						# from CURRENT_URL if not provided.
338						USER_AGENT => 'Perl_HTML_Miner/$VERSION' , # Optional - default:
339						# 'Perl_HTML_Miner/$VERSION'
340						TIMEOUT => 5 , # Optional - default: 5 ( Seconds )
341
342						DEBUG => 0 , # Optional - default: 0
343
344						);
345
346						";
347
348	4	50			18	unless( $count > 1 ) {
349	0				0	croak( $useage_howto );
350						} else {
351	4				24	%parameter_hash = @_;
352						}
353
354
355						## Require parameter.
356						croak( $useage_howto )
357	4	50			20	unless( $parameter_hash{ CURRENT_URL } ) ;
358
359						## Setting defaults unless parameters are set.
360						my $require_extract = 1
361	4	50			19	unless( $parameter_hash{ CURRENT_URL_HTML } ) ;
362
363						$parameter_hash{USER_AGENT} = 'Perl_HTML_Miner/'.$VERSION
364	4	50			46	unless( $parameter_hash{ USER_AGENT } ) ;
365						$parameter_hash{TIMEOUT} = 60
366	4	50			25	unless( $parameter_hash{ TIMEOUT } ) ;
367
368						$parameter_hash{DEBUG} = 0
369	4	50			23	unless( $parameter_hash{ DEBUG } ) ;
370
371						$parameter_hash{ABSOLUTE_ALL_CONTAINED_URLS} = 0
372	4	50			15	unless( $parameter_hash{ ABSOLUTE_ALL_CONTAINED_URLS } ) ;
373
374
375						## Require additional modules.
376
377	4	50			21	if( $require_extract ) {
378
379	0				0	eval {
380	0				0	require LWP::UserAgent ;
381	0				0	require HTTP::Request ;
382	0	0			0	}; croak( "LWP::UserAgent and HTTP::Request are required if the url is to be fetched!" )
383						if( $@ );
384
385	0				0	my $tmp;
386	0				0	( $parameter_hash{ CURRENT_URL }, $tmp, $tmp, $tmp ) = _convert_to_valid_url( $parameter_hash{ CURRENT_URL } );
387
388						$parameter_hash{ CURRENT_URL_HTML } =
389						_get_url_html(
390						$parameter_hash{ CURRENT_URL },
391						$parameter_hash{ USERAGENT },
392						$parameter_hash{ TIMEOUT }
393	0				0	);
394
395						}
396
397						## Check on the correctness of the input url.
398
399						my ( $url, $protocol, $domain_name, $uri ) =
400	4				23	_convert_to_valid_url( $parameter_hash{ CURRENT_URL } );
401
402	4				674	$parameter_hash{ CURRENT_URL } = $url;
403
404						my $self = {
405
406						CURRENT_URL => $parameter_hash{ CURRENT_URL } ,
407
408						CURRENT_URL_HTML => $parameter_hash{ CURRENT_URL_HTML } ,
409
410						USER_AGENT => $parameter_hash{ USER_AGENT } ,
411						TIMEOUT => $parameter_hash{ TIMEOUT } ,
412
413						DEBUG => $parameter_hash{ DEBUG } ,
414
415						ABSOLUTE_ALL_CONTAINED_URLS => $parameter_hash{ ABSOLUTE_ALL_CONTAINED_URLS } ,
416
417	4				62	_REQUIRE_EXTRACT => $require_extract ,
418						_BASE_PROTOCOL => $protocol ,
419						_BASE_DOMAIN => $domain_name ,
420						_BASE_URI => $uri
421
422						};
423
424
425						## Private and class data here.
426
427						## NONE
428
429
430	4				683	bless( $self, $class );
431
432	4	50			589	if( $self->{ DEBUG } == 1 ) {
433	0				0	print STDERR "HTML::Miner Object: \n" ;
434	0				0	print "$self"; ;
435						}
436
437	4				37	return $self;
438
439						}
440
441
442						=head2 get_links
443
444						This function extracts all URLs from a web page.
445
446						B
447
448						When called on an HTML::Miner Object :
449
450						$retun_element = $html_miner->get_links();
451
452						When called directly :
453
454						$retun_element = get_links( $url, $optionally_html_of_url );
455
456						The direct call is intended to be a simplified version of OO call
457						and so does not allow for customization of the useragent and so on!
458
459
460						B
461
462						This function ( regardless of how its called ) returns a pointer to an Array of Hashes who's structure is as follows:
463
464						$->Array(
465						Hash->{
466						"URL" => "extracted url" ,
467						"ABS_EXISTS" => "0_if_abs_url_extraction_failed" ,
468						"ABS_URL" => "absolute_location_of_extracted_url" ,
469						"TITLE" => "title_of_this_url" ,
470						"ANCHOR" => "anchor_text_of_this_url" ,
471						"DOMAIN" => "domain_of_this_url" ,
472						"DOMAIN_IS_BASE" => "1_if_this_domain_same_as_base_domain ,
473						"PROTOCOL" => "protocol_of_this_domain" ,
474						"URI" => "URI_of_this_url" ,
475						},
476						...
477						)
478
479						So, to access the title of the second URL found you would use (yes the order is maintained):
480
481						@{ $retun_element }[1]->{ TITLE }
482
483						B
484
485						If ABS_EXISTS is 0 then DOMAIN, DOMAIN_IS_BASE, PROTOCOL and URI will be undefined
486
487						To extract URLs from a HTML snippet when one does not care about the url of that page, simply pass some garbage as the URL
488						and ignore everything except URL, TITLE and ANCHOR
489
490						"ANCHOR" might contain HTML such as , use HTML::Strip if required.
491
492						=cut
493
494						sub get_links {
495
496	5		5	1	22757	my $tmp = shift ;
497
498	5				13	my $self ;
499						my $url ;
500	0				0	my $html ;
501
502	0				0	my @result_arr ;
503
504	5				19	my $user_agent = "Html_Miner/$VERSION" ;
505	5				8	my $timeout = 60 ;
506
507
508						## First extract all required information.
509
510	5	100			42	if( UNIVERSAL::isa( $tmp, 'HTML::Miner' ) ) {
511
512	3				6	$self = $tmp ;
513
514	3				9	$url = $self->{ CURRENT_URL } ;
515	3				5	$html = $self->{ CURRENT_URL_HTML } ;
516
517						} else {
518
519	2				6	$url = $tmp ;
520
521						## Check for validity of url!
522	2				10	my ( $tmp, $protocol, $domain_name, $uri ) =
523						_convert_to_valid_url( $url ) ;
524	2				5	$url = $tmp ;
525
526	2				8	my @params = @_ ;
527	2				4	my $html_has_been_passed = @params ;
528
529
530	2	50			9	if( $html_has_been_passed ) {
531	2				7	$html = shift ;
532						} else {
533
534						## Need to retrieve html
535
536	0				0	eval {
537	0				0	require LWP::UserAgent ;
538	0				0	require HTTP::Request ;
539						};
540	0	0			0	croak( "LWP::UserAgent and HTTP::Request are required if the url is to be fetched!" )
541						if( $@ );
542
543
544	0				0	$html = _get_url_html( $url, $user_agent, $timeout ) ;
545
546						} ## HTML Not passed
547
548
549						} ## Not called on Object.
550
551
552						## Now start extracting the URLs
553
554	5				504	while( $html =~ m/(<\s?a\s+?href\s?=(\"\|\')([^(\"\|\')]?)(\"\|\')([^>]?)>(.?)<\s?\/a\s*?>)/gis ){
555
556	797				1387	my $this_url = $3 ;
557	797				1097	my $this_anchor = $6 ;
558
559	797				1542	my $match = $1 ;
560	797				876	my $this_title = "" ;
561	797	100			3119	if( $match =~ m/title=(\"\|\')([^(\"\|\')]*?)(\"\|\')/is ) {
562	139				246	$this_title = $2;
563						}
564
565	797				865	my $this_abs_url = "" ;
566	797				787	my $this_abs_url_exists = 1 ;
567	797				813	eval{
568
569	797				1245	$this_abs_url = get_absolute_url( $url, $this_url );
570
571	797	50			1484	}; $this_abs_url_exists = 0 if( $@ );
572
573	797				769	my $this_domain ;
574						my $this_domain_is_base_domain ;
575	0				0	my $this_protocol ;
576	0				0	my $this_uri ;
577	797	50			1349	if( $this_abs_url_exists ) {
578
579	797				726	my $tmp;
580	797				814	eval {
581	797				1202	( $tmp, $this_protocol, $this_domain, $this_uri ) =
582						_convert_to_valid_url( $this_abs_url ) ;
583	797	50			1980	}; $this_abs_url_exists = 0 if( $@ );
584
585
586	797				733	my ( $protocol, $domain, $uri );
587	797				799	eval {
588	797				1178	( $tmp, $protocol, $domain, $uri ) =
589						_convert_to_valid_url( $url ) ;
590	797	50			1990	}; croak( "Unexpected Error - Giving up!" ) if( $@ );
591
592
593	797	100			1691	$this_domain_is_base_domain = ( $domain eq $this_domain ) ? 1 : 0;
594
595						}
596
597	797				5624	my %this_url_hash = (
598						"URL" => $this_url ,
599						"ABS_EXISTS" => $this_abs_url_exists ,
600						"ABS_URL" => $this_abs_url ,
601						"TITLE" => $this_title ,
602						"ANCHOR" => $this_anchor ,
603						"DOMAIN" => $this_domain ,
604						"DOMAIN_IS_BASE" => $this_domain_is_base_domain ,
605						"PROTOCOL" => $this_protocol ,
606						"URI" => $this_uri
607						);
608
609	797				24569	push( @result_arr, \%this_url_hash );
610
611						}
612
613
614	5				171	return \@result_arr;
615
616						}
617
618
619						=head2 get_page_css_and_js
620
621						This function extracts all CSS style sheets and JS Script files use on a web page.
622
623						B
624
625						When called on an HTML::Miner Object :
626
627						$retun_element = $html_miner->get_page_css_and_js(
628						CONVERT_URLS_TO_ABS => 0/1 [ B argument, default is 1 ]
629						);
630
631						When called directly :
632
633						$retun_element = get_page_css_and_js(
634						URL => $url ,
635						HTML => $optionally_html_of_url , [ B argument, html extracted if not provided ]
636						CONVERT_URLS_TO_ABS => 0/1 , [ B argument, default is 1 ]
637						);
638
639						The direct call is intended to be a simplified version of OO call
640						and so does not allow for customization of the useragent and so on!
641
642
643						B
644
645						This function ( regardless of how its called ) returns a pointer to a Hash [ JS or CSS ] of Arrays containing the URLs
646
647						$->HASH->{
648						"CSS" => Array( "extracted url1", "extracted url2", .. )
649						"JS" => Array( "extracted url1", "extracted url2", .. )
650						}
651
652						So, to access the URL of the second CSS style sheet found you would use (again the order is maintained):
653
654						$$retun_element{ "CSS" }[1];
655
656						Or
657						$css_data = @{ $retun_element->{ "CSS" } } ;
658						$second_css_url_found = $css_data[1] ;
659
660						B
661
662						To extract CSS and JS links from a HTML snippet when one does not care about the url of that page, simply set CONVERT_URLS_TO_ABS to 0 and everything should be fine.
663
664
665						=cut
666
667						sub get_page_css_and_js {
668
669	4		4	1	2174	my $number_of_arguments = @_ ;
670
671	4				5	my $self ;
672	4	100			17	unless( int( $number_of_arguments / 2 ) * 2 == $number_of_arguments ) { # Odd number of elems, Must have been called on Obj.
673	2				3	$self = shift ;
674						}
675
676	4				12	my %params = @_ ;
677
678	4	50			11	$params{ CONVERT_URLS_TO_ABS } = 1 unless( defined( $params{ CONVERT_URLS_TO_ABS } ) );
679
680	4				4	my $url ;
681						my $html ;
682
683	4				9	my $user_agent = "Perl_Html_Miner/$VERSION" ;
684	4				5	my $timeout = 60 ;
685
686						## First extract all required information.
687
688	4	100			9	if( defined( $self ) ) {
689	2	50			8	if( UNIVERSAL::isa( $self, 'HTML::Miner' ) ) {
690	2				5	$url = $self->{ CURRENT_URL } ;
691	2				3	$html = $self->{ CURRENT_URL_HTML } ;
692						} else {
693	0				0	croak( "get_page_css_and_js called with params I can't understand!" );
694						}
695						} else {
696
697	2				4	$url = $params{ URL } ;
698
699						## Check for validity of url!
700	2				5	my ( $tmp, $protocol, $domain_name, $uri ) =
701						_convert_to_valid_url( $url ) ;
702	2				4	$url = $tmp ;
703
704	2	50			5	my $html_has_been_passed = defined( $params{ HTML } ) ? 1 : 0 ;
705
706
707	2	50			5	if( $html_has_been_passed ) {
708	2				4	$html = $params{ HTML } ;
709						} else {
710
711						## Need to retrieve html
712
713	0				0	eval {
714	0				0	require LWP::UserAgent ;
715	0				0	require HTTP::Request ;
716						};
717	0	0			0	croak( "LWP::UserAgent and HTTP::Request are required if the url is to be fetched!" )
718						if( $@ );
719
720	0				0	$html = _get_url_html( $url, $user_agent, $timeout ) ;
721
722						} ## HTML Not passed
723
724
725						} ## Not called on Object.
726
727
728						## Now start extracting the URLs
729
730						## CSS
731
732	4				5	my @css_files ;
733	4				110	while ( $html =~ m/(
734	8				48	my $css_url = $2 ;
735	8	100			22	if( $params{ CONVERT_URLS_TO_ABS } ) {
736	4				10	$css_url = get_absolute_url( $url, $2 ) ;
737						}
738	8				59	push @css_files, $css_url ;
739						}
740
741
742
743						## JS
744
745	4				5	my @js_files ;
746	4				57	while ( $html =~ m/(