File Coverage

blib/lib/Text/HTML/ExtractInfo.pm

Criterion	Covered	Total	%
statement	35	38	92.1
branch	10	12	83.3
condition			n/a
subroutine	7	7	100.0
pod	0	1	0.0
total	52	58	89.6

line	stmt	bran	sub	pod	time	code
1						package Text::HTML::ExtractInfo 0.11;
2	6		6		194325	use 5.020;
	6				25
3	6		6		691	use stable 'postderef';
	6				6424
	6				63
4	6		6		1055	use experimental 'signatures';
	6				12
	6				26
5	6		6		387	use Carp 'croak';
	6				13
	6				376
6
7	6		6		58	use Exporter 'import';
	6				29
	6				6382
8
9						our @EXPORT_OK = (qw(extract_info));
10
11						=head1 SYNOPSIS
12
13						use Text::HTML::ExtractInfo 'extract_info';
14
15						my $tree = XML::LibXML->new->parse_html_string(
16						$input,
17						{ recover => 2, encoding => 'UTF-8' }
18						);
19						say Dumper extract_info($tree, url => 'https://example.com' );
20						# {
21						# title => '...',
22						# external => {
23						# image => [ 'https://example.com/img1.jpg', ... ],
24						# }
25						# }
26
27						=cut
28
29						our %elements = (
30						title => {
31						single => 1,
32						q => [
33						'//title',
34						'//meta[@property="og:title"]/@content',
35						'//meta[@property="twitter:title"]/@content',
36						'//h1[1]',
37						],
38						},
39
40						url => {
41						default => 'url',
42						single => 1,
43						q => [
44						'//link[@rel="canonical"]/@href',
45						'//meta[@property="og:url"]/@content',
46						'//meta[@property="twitter:url"]/@content',
47						],
48						},
49
50						image => {
51						q => [
52						'//meta[@property="og:image"]/@content',
53						'//meta[@property="og:image:url"]/@content',
54						'//meta[@property="og:image:secure_url"]/@content',
55						'//meta[@property="twitter:image"]/@content',
56						],
57						},
58
59						authors =>{
60						q => [
61						],
62						},
63						);
64
65	8		8		15	sub _get_value( $node ) {
	8				16
	8				14
66	8	100			90	if( my $c = $node->can( 'value' )) {
		50
67	4				30	return $c->($node)
68						} elsif( $c = $node->can( 'textContent' )) {
69	4				30	return $c->($node)
70						} else {
71	0				0	croak "Don't know how to handle " . $node->toString
72						}
73						}
74
75	10		10	0	264539	sub extract_info( $tree, %options ) {
	10				21
	10				46
	10				17
76	10				20	my %res;
77
78	10				43	for my $k (keys( %elements )) {
79	40	100			363	if( my $d = $elements{ $k }->{default} ) {
80						$res{ $k } = $options{ $d }
81	10	100			36	if exists $options{ $d };
82						}
83	40				110	for my $q ( $elements{ $k }->{q}->@* ) {
84	97				243	my @nodes = $tree->findnodes($q);
85	97	100			2022	if( @nodes ) {
86	8	50			28	if( $elements{ $k }->{single} ) {
87	8				27	$res{ $k } = _get_value( $nodes[0] )
88						} else {
89	0				0	$res{ $k } = [ map { _get_value( $_ )} @nodes ];
	0				0
90						}
91						last
92	8				47	}
93						}
94						}
95
96	10				67	return \%res,
97						}
98
99						1;
100
101						=head1 SEE ALSO
102
103						L - extract information from C<< > tags
104
105						=cut