File Coverage

blib/lib/WWW/Link/Selector.pm
Criterion Covered Total %
statement 29 93 31.1
branch 4 54 7.4
condition 0 3 0.0
subroutine 3 11 27.2
pod 3 4 75.0
total 39 165 23.6


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             WWW::Link::Selector - link selection functions.
4              
5             =head1 SYNOPSIS
6              
7             use MLDBM qw(DB_File);
8             use CDB_File::BiIndex;
9             use WWW::Link::Selector;
10             use WWW::Link::Reporter;
11              
12             #generate a function which uses lists of regexs to include or
13             #exclude links
14             $::include=WWW::Link::Selector::gen_include_exclude @::exclude, @::include;
15              
16             $::index = new CDB_File::BiIndex $::page_index, $::link_index
17             $::linkdbm = tie %::links, "MLDBM", $::links, O_RDONLY, 0666, $::DB_HASH
18             or die $!;
19             $::reporter=new WWW::Link::Reporter::HTML \*STDOUT, $::index;
20              
21             #generate a function which will use all
22             $::selectfunc = WWW::Link::Selector::generate_select_func
23             ( \%::links, $::reporter, $::include, $::index, );
24              
25             #report on all selectedlinks
26             &$::selectfunc;
27              
28             =head1 DESCRIPTION
29              
30             This is a package (not a class though) which builds functions for
31             selecting links to give information about to a user. So far there are
32             two ways of doing this, either scanning the entire database or using
33             an index to get the information.
34              
35             =cut
36              
37             package WWW::Link::Selector;
38             $REVISION=q$Revision: 1.10 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ );
39 1     1   734 use strict;
  1         1  
  1         36  
40 1     1   6 use Carp qw(croak carp cluck);
  1         2  
  1         1395  
41              
42             $WWW::Link::Selector::ignore_missing = 0;
43              
44             #$WWW::Link::Selector::verbose = 0xFF;
45             $WWW::Link::Selector::verbose = 0x00
46             unless defined $WWW::Link::Selector::verbose;
47              
48              
49             sub _check_link ($$$) {
50 0     0   0 my ($link,$url,$reporter)=@_;
51 0 0       0 (defined $link) or do {
52 0 0       0 $reporter->not_found( $url ) unless $WWW::Link::Selector::ignore_missing;
53 0         0 return 0;
54             };
55 0 0       0 ref $link or do {
56 0         0 cluck "Non reference $link found in database for url $url.\n";
57 0         0 return 0;
58             };
59 0         0 return 1;
60             }
61              
62             =head1 generate_url_func
63              
64             This function creates a url function which will act on each of the
65             links for the urls given in its arguments. If any of the arguments
66             have spaces it split them into different urls around that space.
67              
68             =cut
69              
70             sub generate_url_func ($$\@) {
71 0     0 0 0 my $link_database=shift;
72 0         0 my $reporter=shift;
73 0         0 my $urls=shift;
74              
75 0         0 my @urllist=();
76 0         0 foreach (@$urls) {
77 0         0 push @urllist, split /\s+/, $_;
78             }
79              
80 0 0       0 croak "empty urllist" unless @urllist;
81              
82             return sub {
83             # a closure with the link database and urllist enclosed
84 0     0   0 foreach ( @urllist ) {
85 0         0 s/\s//g;
86 0         0 my $url = $_;
87 0         0 my $link=$link_database->{$url};
88 0 0       0 next unless _check_link($link,$url,$reporter);
89 0         0 $reporter->examine( $link );
90             }
91             }
92 0         0 }
93              
94             =head2 generate_select_func(link_database, reporter, include_func, [index])
95              
96             This function generates a selector function which works in one of two modes.
97              
98             In the first, no index is given and it recurses through all of the
99             links in the database.
100              
101             In the second it generates a selection function which recurses through
102             the B working on each url.
103              
104             For each url it finds, it calls the given link B if the
105             B returns true for that url.
106              
107             =cut
108              
109             sub generate_select_func ($$$;$) {
110 0     0 1 0 my ($link_database, $reporter, $include_func, $index)=@_;
111 0 0       0 if ($index) {
112             return sub {
113              
114 0 0   0   0 print STDERR "Using index to generate list of urls to examine.\n"
115             if $WWW::Link::Selector::verbose & 16;
116              
117             #check within an infostructure
118              
119 0         0 my $url;
120 0         0 $index->second_reset();
121 0         0 URL: while($url=$index->second_next()) {
122 0         0 my $link=$link_database->{$url};
123 0 0       0 print STDERR "WWW::Link::Selector::[generated selector] Looking"
124             . " at link $url.\n"
125             if $WWW::Link::Selector::verbose & 64;
126 0 0       0 next unless _check_link($link,$url,$reporter);
127 0 0       0 $reporter->examine ( $link ) if &$include_func($url);
128             }
129              
130 0 0       0 print STDERR "Finished reporting index of urls.\n"
131             if $WWW::Link::Selector::verbose & 16;
132              
133             }
134 0         0 } else {
135             return sub {
136              
137 0 0   0   0 print STDERR "Going through all urls in database\n"
138             if $WWW::Link::Selector::verbose & 16;
139              
140             # check across the whole database of links
141              
142 0         0 my ($url,$link);
143 0         0 LINK: while(($url,$link)=each %$link_database) {
144 0 0 0     0 next if (!ref $url) and $url =~ m/^\%\+\+/;
145 0 0       0 next unless _check_link($link,$url,$reporter);
146 0 0       0 print STDERR "Looking at link $url.\n"
147             if $WWW::Link::Selector::verbose & 64;
148 0 0       0 $reporter->examine ( $link ) if &$include_func($url);
149             }
150             }
151 0         0 }
152             }
153              
154              
155             =head2 generate_index_select_func(link_database, reporter, include_func, index)
156              
157             This function returns a function which iterates through all of the
158             links found in the index, calling $reporter->examine() for each link.
159              
160             In this select function, the include_func is a function which is
161             called on each page url in our own pages to decide whether or not to
162             report the link.
163              
164             =cut
165              
166             sub generate_index_select_func ($$$$) {
167 0     0 1 0 my ($link_database, $reporter, $include_func, $index)=@_;
168             return sub {
169 0     0   0 my $url;
170 0         0 $index->second_reset();
171 0         0 URL: while($url=$index->second_next()) {
172 0         0 my $pagelist=$index->lookup_second($url);
173              
174 0         0 my $include=0;
175 0         0 foreach my $page (@$pagelist) {
176 0 0       0 &$include_func($page) || next;
177 0         0 $include=1;
178             }
179 0 0       0 next URL unless $include;
180 0         0 my $link=$link_database->{$url};
181 0 0       0 ref $link or do {
182 0         0 warn "Non reference $link found in database for url $url.\n";
183 0         0 next;
184             };
185 0 0       0 print STDERR "WWW::Link::Selector::[generated index selector] "
186             . "Looking at link $url.\n" if $WWW::Link::Selector::verbose & 64;
187 0 0       0 unless ($link) {
188 0 0       0 $reporter->not_found( $url ) unless $WWW::Link::Selector::ignore_missing;
189 0         0 next URL;
190             }
191 0         0 $reporter->examine ( $link ) ;
192             }
193 0 0       0 print STDERR "Finished reporting index of urls.\n"
194             if $WWW::Link::Selector::verbose & 16;
195             }
196 0         0 }
197              
198             =head2 gen_include_exclude (@exclude, @include)
199              
200             This function generates a function which will return false if any of
201             the regexps in the exclude_listre match and even then will return
202             false unless one of the regexps in the include listref matches.
203              
204             If the first list is empty then all links matching the include list
205             will be accepted.
206              
207             If the second list is empty, then all links not matching the exlcude
208             list will be accepted.
209              
210             The fuction generated can be used by generate_select_func (see above).
211              
212             =cut
213              
214              
215             # this following fuction could be much more efficient with a compile
216             # once single regex. See Manifest.pm for what seems to be an example.
217              
218             # after Tom Christiansen in his FMTYEWTK on regexps.
219              
220             sub gen_include_exclude (\@\@){
221 1     1 1 24 my ($excludes, $includes) = @_;
222              
223 1         2 my @excludearray=();
224 1         3 foreach my $exclude (@$excludes) {
225 2         28 $exclude =~ s,(?<=[^\\])((:\\\\)*)/,$1\\/,g;
226 2         7 push @excludearray, $exclude;
227             }
228 1         2 my @includearray=();
229 1         3 foreach my $include (@$includes) {
230 2         21 $include =~ s,(?<=[^\\])((:\\\\)*)/,$1\\/,g;
231 2         5 push @includearray, $include;
232             }
233              
234 1         2 my $code = <<"EOCODE";
235             sub {
236             EOCODE
237 1 50       5 $code .= <<"EOCODE" if @excludearray + @includearray > 5;
238             study;
239             EOCODE
240 1         2 my $pat;
241 1         3 for $pat (@excludearray) {
242 2         8 $code .= <<"EOCODE";
243             return 0 if \$_[0] =~ /$pat/;
244             EOCODE
245             }
246 1 50       3 unless (@includearray) {
247 0         0 $code .= <<"EOCODE";
248             return 1;
249             EOCODE
250             } else {
251 1         4 for $pat (@includearray) {
252 2         7 $code .= <<"EOCODE";
253             return 1 if \$_[0] =~ /$pat/;
254             EOCODE
255             }
256 1         4 $code .= <<"EOCODE";
257             return 0;
258             EOCODE
259             }
260 1         2 $code .= "}\n";
261 1 50       7 print "CODE: $code\n"
262             if $WWW::Link::Selector::verbose & 128;
263 1         212 my $func = eval $code;
264 1 50       5 die "bad pattern: $@" if $@;
265 1         4 return $func;
266             }
267              
268             1;