File Coverage

blib/lib/App/FeedDeduplicator/Deduplicator.pm

Criterion	Covered	Total	%
statement	17	17	100.0
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	23	23	100.0

line	stmt	sub	time	code
1				=head1 NAME
2
3				App::FeedDeduplicator::Deduplicator - Deduplicator class for
4				App::FeedDeduplicator
5
6				=head1 DESCRIPTION
7
8				This module is part of the App::FeedDeduplicator application. It is
9				responsible for deduplicating entries from aggregated feeds.
10
11				It uses the LWP::UserAgent to fetch the feeds and HTML::TreeBuilder::XPath
12				to parse the HTML content for canonical links.
13
14				The deduplication process is based on the entry's, id canonical link or title.
15				The deduplicated entries are stored in an array for further processing. It is
16				designed to be used in conjunction with the Aggregator and Publisher classes
17				to provide a complete feed deduplication and publishing solution.
18
19				=head1 SYNOPSIS
20
21				use App::FeedDeduplicator::Deduplicator;
22
23				my $deduplicator = App::FeedDeduplicator::Deduplicator->new(
24				entries => $aggregated_entries,
25				ua => LWP::UserAgent->new(),
26				);
27
28				$deduplicator->deduplicate();
29
30				=head1 METHODS
31
32				=head2 new
33
34				Creates a new instance of App::FeedDeduplicator::Deduplicator. The constructor
35				accepts a list of entries and a user agent as parameters.
36
37				The entries should be an array reference containing hash references with the
38				keys 'entry' and 'feed'.
39
40				The 'entry' key should contain an XML::Feed::Entry object, and the 'feed' key
41				should contain a hash reference with the feed information.
42
43				The user agent should be an instance of LWP::UserAgent.
44
45				=head2 deduplicate
46
47				The main method that deduplicates entries from the aggregated feeds. It
48				iterates through the entries and checks for duplicates based on the id,
49				canonical link or title.
50
51				It uses a hash to keep track of seen entries and filters out duplicates. The
52				deduplicated entries are stored in the $deduplicated attribute.
53
54				It is designed to be used in conjunction with the Aggregator and Publisher
55				classes to provide a complete feed deduplication and publishing solution.
56
57				=head2 find_canonical
58
59				Finds the canonical link for a given entry. It fetches the entry's link using
60				LWP::UserAgent and parses the HTML content using HTML::TreeBuilder::XPath.
61
62				It looks for the <link rel="canonical"> tag in the HTML content and returns
63				the canonical URL if found. If the canonical link is not found, it returns
64				undef.
65
66				It is used during the deduplication process to determine the unique
67				identifier for each entry.
68
69				=cut
70
71				package App::FeedDeduplicator::Deduplicator; # For MetaCPAN
72
73	1	1	8	use v5.40;
	1		3
74	1	1	4	use feature 'class';
	1		1
	1		162
75	1	1	5	no warnings 'experimental::class';
	1		1
	1		72
76
77				class App::FeedDeduplicator::Deduplicator {
78	1	1	581	use HTML::TreeBuilder::XPath;
	1		52794
	1		5
79	1	1	36	use LWP::UserAgent;
	1		1
	1		16
80	1	1	4	use URI;
	1		1
	1		360
81
82				field $entries :param;
83				field $deduplicated :reader;
84				field $ua :param;
85
86				method deduplicate {
87				my %seen;
88				my @result;
89
90				for my $entry (@$entries) {
91				# warn ref($entry) . "\n" . ref($entry->{entry}) . "\n";
92				my $canonical = $self->find_canonical($entry->{entry}) // '';
93				my $title = $entry->{entry}->title // '';
94
95				push @result, $entry
96				unless ($canonical and $seen{$canonical})
97				or ($title and $seen{$title});
98
99				++$seen{$canonical} if $canonical;
100				++$seen{$title} if $title;
101				}
102
103				$deduplicated = \@result;
104				}
105
106				method find_canonical ($entry) {
107				my $link = $entry->link;
108				return unless $link;
109
110				my $response = $ua->get($link);
111				return unless $response->is_success;
112
113				my $tree = HTML::TreeBuilder::XPath->new_from_content(
114				$response->decoded_content
115				);
116				my $node = $tree->findnodes('//link[@rel="canonical"]')->[0];
117
118				return unless $node;
119				return URI->new($node->attr('href'))->as_string;
120				}
121				}
122
123				=head1 AUTHOR
124
125				Dave Cross <dave@perlhacks.com>
126
127				=head1 COPYRIGHT AND LICENSE
128
129				This software is copyright (c) 2025 Magnum Solutions Ltd.
130				All rights reserved.
131
132				This program is free software; you can redistribute it and/or modify it under
133				the same terms as Perl itself.
134
135				See L<http://dev.perl.org/licenses/artistic.html> for more details.
136
137				=cut
138
139				1;