File Coverage

blib/lib/Text/HTML/ExtractInfo.pm

Criterion	Covered	Total	%
statement	35	38	92.1
branch	10	12	83.3
condition			n/a
subroutine	7	7	100.0
pod	0	1	0.0
total	52	58	89.6

line	stmt	bran	sub	pod	time	code
1						package Text::HTML::ExtractInfo 0.12;
2	7		7		339077	use 5.020;
	7				30
3	7		7		652	use stable 'postderef';
	7				6625
	7				44
4	7		7		1394	use experimental 'signatures';
	7				19
	7				31
5	7		7		417	use Carp 'croak';
	7				17
	7				536
6
7	7		7		83	use Exporter 'import';
	7				14
	7				6282
8
9						our @EXPORT_OK = (qw(extract_info));
10
11						=head1 SYNOPSIS
12
13						use Text::HTML::ExtractInfo 'extract_info';
14
15						my $tree = XML::LibXML->new->parse_html_string(
16						$input,
17						{ recover => 2, encoding => 'UTF-8' }
18						);
19						say Dumper extract_info($tree, url => 'https://example.com' );
20						# {
21						# title => '...',
22						# external => {
23						# image => [ 'https://example.com/img1.jpg', ... ],
24						# }
25						# }
26
27						=cut
28
29						our %elements = (
30						title => {
31						single => 1,
32						q => [
33						'//title',
34						'//meta[@property="og:title"]/@content',
35						'//meta[@property="twitter:title"]/@content',
36						'//h1[1]',
37						],
38						},
39
40						url => {
41						default => 'url',
42						single => 1,
43						q => [
44						'//link[@rel="canonical"]/@href',
45						'//meta[@property="og:url"]/@content',
46						'//meta[@property="twitter:url"]/@content',
47						],
48						},
49
50						image => {
51						q => [
52						'//meta[@property="og:image"]/@content',
53						'//meta[@property="og:image:url"]/@content',
54						'//meta[@property="og:image:secure_url"]/@content',
55						'//meta[@property="twitter:image"]/@content',
56						],
57						},
58
59						authors =>{
60						q => [
61						],
62						},
63						);
64
65	8		8		19	sub _get_value( $node ) {
	8				14
	8				14
66	8	100			83	if( my $c = $node->can( 'value' )) {
		50
67	4				47	return $c->($node)
68						} elsif( $c = $node->can( 'textContent' )) {
69	4				33	return $c->($node)
70						} else {
71	0				0	croak "Don't know how to handle " . $node->toString
72						}
73						}
74
75	10		10	0	269967	sub extract_info( $tree, %options ) {
	10				21
	10				25
	10				18
76	10				19	my %res;
77
78	10				42	for my $k (keys( %elements )) {
79	40	100			373	if( my $d = $elements{ $k }->{default} ) {
80						$res{ $k } = $options{ $d }
81	10	100			38	if exists $options{ $d };
82						}
83	40				107	for my $q ( $elements{ $k }->{q}->@* ) {
84	97				251	my @nodes = $tree->findnodes($q);
85	97	100			2068	if( @nodes ) {
86	8	50			30	if( $elements{ $k }->{single} ) {
87	8				23	$res{ $k } = _get_value( $nodes[0] )
88						} else {
89	0				0	$res{ $k } = [ map { _get_value( $_ )} @nodes ];
	0				0
90						}
91						last
92	8				50	}
93						}
94						}
95
96	10				69	return \%res,
97						}
98
99						1;
100
101						=head1 SEE ALSO
102
103						L - extract information from C<< > tags
104
105						=cut