File Coverage

blib/lib/App/FeedDeduplicator/Deduplicator.pm
Criterion Covered Total %
statement 17 17 100.0
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 23 23 100.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             App::FeedDeduplicator::Deduplicator - Deduplicator class for
4             App::FeedDeduplicator
5              
6             =head1 DESCRIPTION
7              
8             This module is part of the App::FeedDeduplicator application. It is
9             responsible for deduplicating entries from aggregated feeds.
10              
11             It uses the LWP::UserAgent to fetch the feeds and HTML::TreeBuilder::XPath
12             to parse the HTML content for canonical links.
13              
14             The deduplication process is based on the entry's, id canonical link or title.
15             The deduplicated entries are stored in an array for further processing. It is
16             designed to be used in conjunction with the Aggregator and Publisher classes
17             to provide a complete feed deduplication and publishing solution.
18              
19             =head1 SYNOPSIS
20              
21             use App::FeedDeduplicator::Deduplicator;
22              
23             my $deduplicator = App::FeedDeduplicator::Deduplicator->new(
24             entries => $aggregated_entries,
25             ua => LWP::UserAgent->new(),
26             );
27              
28             $deduplicator->deduplicate();
29              
30             =head1 METHODS
31              
32             =head2 new
33              
34             Creates a new instance of App::FeedDeduplicator::Deduplicator. The constructor
35             accepts a list of entries and a user agent as parameters.
36              
37             The entries should be an array reference containing hash references with the
38             keys 'entry' and 'feed'.
39              
40             The 'entry' key should contain an XML::Feed::Entry object, and the 'feed' key
41             should contain a hash reference with the feed information.
42              
43             The user agent should be an instance of LWP::UserAgent.
44              
45             =head2 deduplicate
46              
47             The main method that deduplicates entries from the aggregated feeds. It
48             iterates through the entries and checks for duplicates based on the id,
49             canonical link or title.
50              
51             It uses a hash to keep track of seen entries and filters out duplicates. The
52             deduplicated entries are stored in the $deduplicated attribute.
53              
54             It is designed to be used in conjunction with the Aggregator and Publisher
55             classes to provide a complete feed deduplication and publishing solution.
56              
57             =head2 find_canonical
58              
59             Finds the canonical link for a given entry. It fetches the entry's link using
60             LWP::UserAgent and parses the HTML content using HTML::TreeBuilder::XPath.
61              
62             It looks for the <link rel="canonical"> tag in the HTML content and returns
63             the canonical URL if found. If the canonical link is not found, it returns
64             undef.
65              
66             It is used during the deduplication process to determine the unique
67             identifier for each entry.
68              
69             =cut
70              
71             package App::FeedDeduplicator::Deduplicator; # For MetaCPAN
72              
73 1     1   8 use v5.40;
  1         3  
74 1     1   4 use feature 'class';
  1         1  
  1         162  
75 1     1   5 no warnings 'experimental::class';
  1         1  
  1         72  
76              
77             class App::FeedDeduplicator::Deduplicator {
78 1     1   581 use HTML::TreeBuilder::XPath;
  1         52794  
  1         5  
79 1     1   36 use LWP::UserAgent;
  1         1  
  1         16  
80 1     1   4 use URI;
  1         1  
  1         360  
81              
82             field $entries :param;
83             field $deduplicated :reader;
84             field $ua :param;
85              
86             method deduplicate {
87             my %seen;
88             my @result;
89              
90             for my $entry (@$entries) {
91             # warn ref($entry) . "\n" . ref($entry->{entry}) . "\n";
92             my $canonical = $self->find_canonical($entry->{entry}) // '';
93             my $title = $entry->{entry}->title // '';
94              
95             push @result, $entry
96             unless ($canonical and $seen{$canonical})
97             or ($title and $seen{$title});
98              
99             ++$seen{$canonical} if $canonical;
100             ++$seen{$title} if $title;
101             }
102              
103             $deduplicated = \@result;
104             }
105              
106             method find_canonical ($entry) {
107             my $link = $entry->link;
108             return unless $link;
109              
110             my $response = $ua->get($link);
111             return unless $response->is_success;
112              
113             my $tree = HTML::TreeBuilder::XPath->new_from_content(
114             $response->decoded_content
115             );
116             my $node = $tree->findnodes('//link[@rel="canonical"]')->[0];
117              
118             return unless $node;
119             return URI->new($node->attr('href'))->as_string;
120             }
121             }
122              
123             =head1 AUTHOR
124              
125             Dave Cross <dave@perlhacks.com>
126              
127             =head1 COPYRIGHT AND LICENSE
128              
129             This software is copyright (c) 2025 Magnum Solutions Ltd.
130             All rights reserved.
131              
132             This program is free software; you can redistribute it and/or modify it under
133             the same terms as Perl itself.
134              
135             See L<http://dev.perl.org/licenses/artistic.html> for more details.
136              
137             =cut
138              
139             1;