line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package URI::Find::Delimited; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
1787
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
43
|
|
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
5
|
use vars qw( $VERSION ); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
58
|
|
6
|
|
|
|
|
|
|
$VERSION = '0.02'; |
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
22
|
use base qw(URI::Find); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
939
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# For 5.005_03 compatibility (copied from URI::Find::Schemeless) |
11
|
1
|
|
|
1
|
|
17814
|
use URI::Find (); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
592
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
URI::Find::Delimited - Find URIs which may be wrapped in enclosing delimiters. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 DESCRIPTION |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Works like L<URI::Find>, but is prepared for URIs in your text to be |
20
|
|
|
|
|
|
|
wrapped in a pair of delimiters and optionally have a title. This will |
21
|
|
|
|
|
|
|
be useful for processing text that already has some minimal markup in |
22
|
|
|
|
|
|
|
it, like bulletin board posts or wiki text. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my $finder = URI::Find::Delimited->new; |
27
|
|
|
|
|
|
|
my $text = "This is a [http://the.earth.li/ titled link]."; |
28
|
|
|
|
|
|
|
$finder->find(\$text); |
29
|
|
|
|
|
|
|
print $text; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 METHODS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=over 4 |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=item B<new> |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $finder = URI::Find::Delimited->new( |
38
|
|
|
|
|
|
|
callback => \&callback, |
39
|
|
|
|
|
|
|
delimiter_re => [ '\[', '\]' ], |
40
|
|
|
|
|
|
|
ignore_quoted => 1 # defaults to 0 |
41
|
|
|
|
|
|
|
); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
All arguments are optional; defaults are provided (see below). |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Creates a new URI::Find::Delimited object. This object works similarly |
46
|
|
|
|
|
|
|
to a L<URI::Find> object, but as well as just looking for URIs it is also |
47
|
|
|
|
|
|
|
aware of the concept of a wrapped, titled URI. These look something like |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
[http://foo.com/ the foo website] |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
where: |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=over 4 |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item * C<[> is the opening delimiter |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=item * C<]> is the closing delimiter |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item * C<http://foo.com/> is the URI |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=item * C<the foo website> is the title |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=item * the URI and title are separated by spaces and/or tabs |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=back |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
The URI::Find::Delimited object will extract each of these parts |
68
|
|
|
|
|
|
|
separately and pass them to your callback. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=over 4 |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item B<callback> |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
C<callback> is a function which is called on each URI found. It is |
75
|
|
|
|
|
|
|
passed five arguments: the opening delimiter (if found), the closing |
76
|
|
|
|
|
|
|
delimiter (if found), the URI, the title (if found), and any |
77
|
|
|
|
|
|
|
whitespace found between the URI and title. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The return value of the callback will replace the original URI in the |
80
|
|
|
|
|
|
|
text. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
If you do not supply your own callback, the object will create a |
83
|
|
|
|
|
|
|
default one which will put your URIs in 'a href' tags using the URI |
84
|
|
|
|
|
|
|
for the target and the title for the link text. If no title is |
85
|
|
|
|
|
|
|
provided for a URI then the URI itself will be used as the title. If |
86
|
|
|
|
|
|
|
the delimiters aren't balanced (eg if the opening one is present but |
87
|
|
|
|
|
|
|
no closing one is found) then the URI is treated as not being wrapped. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Note: the default callback will not remove the delimiters from the |
90
|
|
|
|
|
|
|
text. It should be simple enough to write your own callback to remove |
91
|
|
|
|
|
|
|
them, based on the one in the source, if that's what you want. In fact |
92
|
|
|
|
|
|
|
there's an example in this distribution, in C. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item B<delimiter_re> |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
The C<delimiter_re> parameter is optional. If you do supply it then it |
97
|
|
|
|
|
|
|
should be a ref to an array containing two regexes. It defaults to |
98
|
|
|
|
|
|
|
using single square brackets as the delimiters. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Don't use capturing groupings C<( )> in your delimiters or things |
101
|
|
|
|
|
|
|
will break. Use non-capturing C<(?: )> instead. |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=item B<ignore_quoted> |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
If the C<ignore_quoted> parameter is supplied and set to a true value, |
106
|
|
|
|
|
|
|
then any URIs immediately preceded with a double-quote character will |
107
|
|
|
|
|
|
|
not be matched, ie your callback will not be executed for them and |
108
|
|
|
|
|
|
|
they'll be treated just as normal text. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
This is kinda lame but it's in here because I need to be able to |
111
|
|
|
|
|
|
|
ignore things like |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
  <img src="http://foo.com/bar.png"> |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
A better implementation may happen at some point. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=back |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Constructor for URI::Find::Delimited objects.
#
# Arguments (all optional, passed as key/value pairs):
#   callback      - code ref called for each URI found.  It receives
#                   ($open, $close, $uri, $title, $whitespace) and its
#                   return value replaces the matched text.
#   delimiter_re  - array ref holding two regexes: the opening and the
#                   closing delimiter.  Defaults to single square brackets.
#   ignore_quoted - if true, URIs immediately preceded by a double-quote
#                   character are not matched.  Defaults to false.
#
# Returns the new object, blessed into $class.
sub new {
    my ($class, %args) = @_;

    my ( $callback, $delimiter_re, $ignore_quoted ) =
        @args{ qw( callback delimiter_re ignore_quoted ) };

    unless (defined $callback) {
        # Default callback: put the URI in an 'a href' tag, leaving any
        # delimiters in place.  Values are interpolated verbatim; no HTML
        # escaping is applied here.
        $callback = sub {
            my ($open, $close, $uri, $title, $whitespace) = @_;
            if ( $open && $close ) {
                # Wrapped URI: the title is the link text, falling back to
                # the URI itself.  Test definedness and length rather than
                # truth so that a title of "0" is kept.
                $title = $uri unless defined $title && length $title;
                return qq|$open<a href="$uri">$title</a>$close|;
            } else {
                # Not (properly) wrapped: link the bare URI and leave
                # everything else where it was.
                return qq|$open<a href="$uri">$uri</a>$whitespace$title$close|;
            }
        };
    }
    $delimiter_re ||= [ '\[', '\]' ];

    my $self = bless { callback      => $callback,
                       delimiter_re  => $delimiter_re,
                       ignore_quoted => $ignore_quoted
                     }, $class;
    return $self;
}
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# Searches the text referenced by $r_text for URIs, either bare or wrapped
# in the configured delimiters with an optional title, and replaces each
# confirmed URI with the return value of the object's callback.
#
# Arguments:
#   $r_text - reference to the string to process; it is modified in place.
#
# Returns the number of URIs for which the callback was invoked.
sub find {
    my($self, $r_text) = @_;

    my $urlsfound = 0;

    # Don't assume any old thing followed by : is a scheme.  The call is
    # guarded so that find() does not die when URI::URL::strict is not
    # defined, and the value it returned is handed back afterwards so the
    # global setting is not left changed by this method.
    my $have_strict = defined &URI::URL::strict;
    my $old_strict  = $have_strict ? URI::URL::strict(1) : undef;

    my $uri_re    = $self->uri_re;
    # Negative lookbehind: skip URIs immediately preceded by a double quote.
    my $prefix_re = $self->{ignore_quoted} ? '(?<!")' : '';
    my $open_re   = $self->{delimiter_re}[0];
    my $close_re  = $self->{delimiter_re}[1];

    # Note we only allow spaces and tabs, not all whitespace, between a URI
    # and its title.  Also we disallow newlines *in* the title.  These are
    # both to avoid the bug where $uri1\n$uri2 leads to $uri2 being considered
    # as part of the title, and thus not wrapped.
    $$r_text =~ s{$prefix_re     # maybe don't match things preceded by a "
                  (?:
                      ($open_re)    # opening delimiter
                      ($uri_re)     # the URI itself
                      ([ \t]*)      # optional whitespace between URI and title
                      ((?<=[ \t])[^\n$close_re]+)? #title if there was whitespace
                      ($close_re)   # closing delimiter
                    |
                      ($uri_re)     # just the URI itself
                  )
                 }{
        my ($open, $uri_match, $whitespace, $title, $close, $just_uri) =
            ($1, $2, $3, $4, $5, $6);
        $uri_match = $just_uri if defined $just_uri;
        # Normalise unmatched captures to empty strings.  Check definedness
        # rather than truth so that a title of "0" survives.
        foreach ( $open, $whitespace, $title, $close ) {
            $_ = "" unless defined $_;
        }
        # Remember the matched text before _is_uri is handed a reference
        # to the candidate, so it can be put back verbatim on a false alarm.
        my $orig_text = qq|$open$uri_match$whitespace$title$close|;

        if ( $self->_is_uri( \$uri_match ) ) { # if not a false alarm
            $urlsfound++;
            $self->{callback}->($open,$close,$uri_match,$title,$whitespace);
        } else {
            $orig_text;
        }
    }egx;

    URI::URL::strict($old_strict) if $have_strict;

    return $urlsfound;
}
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 SEE ALSO |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
L<URI::Find>. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 AUTHOR |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Kake Pugh (kake@earth.li). |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head1 COPYRIGHT |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Copyright (C) 2003 Kake Pugh. All Rights Reserved. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
206
|
|
|
|
|
|
|
under the same terms as Perl itself. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=head1 CREDITS |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
Tim Bagot helped me stop faffing over the name, by pointing out that |
211
|
|
|
|
|
|
|
RFC 2396 Appendix E uses "delimited". Dave Hinton helped me fix the |
212
|
|
|
|
|
|
|
regex to make it work for delimited URIs with no title. Nick Cleaton |
213
|
|
|
|
|
|
|
helped me make C<ignore_quoted> work. Some of the code was taken from |
214
|
|
|
|
|
|
|
L<URI::Find::Schemeless>. |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
1; |