| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package URI::Find::Delimited; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
1787
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
43
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
5
|
use vars qw( $VERSION ); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
58
|
|
|
6
|
|
|
|
|
|
|
$VERSION = '0.02'; |
|
7
|
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
22
|
use base qw(URI::Find); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
939
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# For 5.005_03 compatibility (copied from URI::Find::Schemeless) |
|
11
|
1
|
|
|
1
|
|
17814
|
use URI::Find (); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
592
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
URI::Find::Delimited - Find URIs which may be wrapped in enclosing delimiters. |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Works like L<URI::Find>, but is prepared for URIs in your text to be |
|
20
|
|
|
|
|
|
|
wrapped in a pair of delimiters and optionally have a title. This will |
|
21
|
|
|
|
|
|
|
be useful for processing text that already has some minimal markup in |
|
22
|
|
|
|
|
|
|
it, like bulletin board posts or wiki text. |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my $finder = URI::Find::Delimited->new; |
|
27
|
|
|
|
|
|
|
my $text = "This is a [http://the.earth.li/ titled link]."; |
|
28
|
|
|
|
|
|
|
$finder->find(\$text); |
|
29
|
|
|
|
|
|
|
print $text; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 METHODS |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=over 4 |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=item B<new> |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $finder = URI::Find::Delimited->new( |
|
38
|
|
|
|
|
|
|
callback => \&callback, |
|
39
|
|
|
|
|
|
|
delimiter_re => [ '\[', '\]' ], |
|
40
|
|
|
|
|
|
|
ignore_quoted => 1 # defaults to 0 |
|
41
|
|
|
|
|
|
|
); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
All arguments are optional; defaults are provided (see below). |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Creates a new URI::Find::Delimited object. This object works similarly |
|
46
|
|
|
|
|
|
|
to a L<URI::Find> object, but as well as just looking for URIs it is also |
|
47
|
|
|
|
|
|
|
aware of the concept of a wrapped, titled URI. These look something like |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
[http://foo.com/ the foo website] |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
where: |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=over 4 |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item * C<[> is the opening delimiter |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=item * C<]> is the closing delimiter |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item * C<http://foo.com/> is the URI |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=item * C<the foo website> is the title |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=item * the URI and title are separated by spaces and/or tabs |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=back |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
The URI::Find::Delimited object will extract each of these parts |
|
68
|
|
|
|
|
|
|
separately and pass them to your callback. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=over 4 |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item B<callback> |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
C<callback> is a function which is called on each URI found. It is |
|
75
|
|
|
|
|
|
|
passed five arguments: the opening delimiter (if found), the closing |
|
76
|
|
|
|
|
|
|
delimiter (if found), the URI, the title (if found), and any |
|
77
|
|
|
|
|
|
|
whitespace found between the URI and title. |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The return value of the callback will replace the original URI in the |
|
80
|
|
|
|
|
|
|
text. |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
If you do not supply your own callback, the object will create a |
|
83
|
|
|
|
|
|
|
default one which will put your URIs in 'a href' tags using the URI |
|
84
|
|
|
|
|
|
|
for the target and the title for the link text. If no title is |
|
85
|
|
|
|
|
|
|
provided for a URI then the URI itself will be used as the title. If |
|
86
|
|
|
|
|
|
|
the delimiters aren't balanced (eg if the opening one is present but |
|
87
|
|
|
|
|
|
|
no closing one is found) then the URI is treated as not being wrapped. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Note: the default callback will not remove the delimiters from the |
|
90
|
|
|
|
|
|
|
text. It should be simple enough to write your own callback to remove |
|
91
|
|
|
|
|
|
|
them, based on the one in the source, if that's what you want. In fact |
|
92
|
|
|
|
|
|
|
there's an example in this distribution, in C<t/delimited.t>. |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item B<delimiter_re> |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
The C<delimiter_re> parameter is optional. If you do supply it then it |
|
97
|
|
|
|
|
|
|
should be a ref to an array containing two regexes. It defaults to |
|
98
|
|
|
|
|
|
|
using single square brackets as the delimiters. |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Don't use capturing groupings C<( )> in your delimiters or things |
|
101
|
|
|
|
|
|
|
will break. Use non-capturing C<(?: )> instead. |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=item B<ignore_quoted> |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
If the C<ignore_quoted> parameter is supplied and set to a true value, |
|
106
|
|
|
|
|
|
|
then any URIs immediately preceded with a double-quote character will |
|
107
|
|
|
|
|
|
|
not be matched, ie your callback will not be executed for them and |
|
108
|
|
|
|
|
|
|
they'll be treated just as normal text. |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
This is kinda lame but it's in here because I need to be able to |
|
111
|
|
|
|
|
|
|
ignore things like |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
<a href="http://foo.com/"> |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
A better implementation may happen at some point. |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=back |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Constructor.  Accepts optional named arguments:
#
#   callback      - code ref invoked for every URI found; receives
#                   ($open, $close, $uri, $title, $whitespace) and returns
#                   the replacement text.
#   delimiter_re  - array ref of two regex strings (opening and closing
#                   delimiter); defaults to single square brackets.
#   ignore_quoted - when true, URIs immediately preceded by a double quote
#                   are left alone.
#
# Returns a hash-based object blessed into $class.
sub new {
    my ($class, %args) = @_;

    my ( $callback, $delimiter_re, $ignore_quoted ) =
        @args{ qw( callback delimiter_re ignore_quoted ) };

    unless (defined $callback) {
        # Default callback: turn each URI into an 'a href' link, using the
        # URI as the target and the title (or, failing that, the URI itself)
        # as the link text.  The delimiters are kept in the output.
        $callback = sub {
            my ($open, $close, $uri, $title, $whitespace) = @_;

            # Treat missing pieces as empty strings so that interpolation
            # below never sees undef.
            foreach ( $open, $close, $title, $whitespace ) {
                $_ = "" unless defined $_;
            }

            if ( $open && $close ) {
                # Only fall back to the URI when there is no title at all;
                # a title such as "0" is false but still a real title.
                $title = $uri unless length $title;
                return qq|$open<a href="$uri">$title</a>$close|;
            }

            # Unbalanced or absent delimiters: the URI is not considered
            # wrapped, so link just the URI and emit everything else as-is.
            return qq|$open<a href="$uri">$uri</a>$whitespace$title$close|;
        };
    }

    $delimiter_re ||= [ '\[', '\]' ];

    my $self = bless { callback      => $callback,
                       delimiter_re  => $delimiter_re,
                       ignore_quoted => $ignore_quoted
                     }, $class;
    return $self;
}
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# Scan the text referenced by $r_text for URIs, optionally wrapped in the
# configured delimiters and optionally followed by a title, and replace each
# confirmed URI in place with the return value of the object's callback.
#
# Arguments:
#   $r_text - reference to the string to process (modified in place).
#
# Returns the number of URIs for which the callback was invoked.
sub find {
    my ($self, $r_text) = @_;

    my $urlsfound = 0;

    # Don't assume any old thing followed by : is a scheme.  This module
    # does not load URI::URL itself, so only call into it when it is
    # actually present; an unconditional call would die with "Undefined
    # subroutine" and abort the whole search.
    URI::URL::strict(1) if defined &URI::URL::strict;

    my $uri_re    = $self->uri_re;
    # Negative lookbehind: skip URIs immediately preceded by a double quote.
    my $prefix_re = $self->{ignore_quoted} ? '(?<!")' : '';
    my $open_re   = $self->{delimiter_re}[0];
    my $close_re  = $self->{delimiter_re}[1];

    # Note we only allow spaces and tabs, not all whitespace, between a URI
    # and its title.  Also we disallow newlines *in* the title.  These are
    # both to avoid the bug where $uri1\n$uri2 leads to $uri2 being considered
    # as part of the title, and thus not wrapped.
    $$r_text =~ s{$prefix_re   # maybe don't match things preceded by a "
                  (?:
                      ($open_re)     # opening delimiter
                      ($uri_re)      # the URI itself
                      ([ \t]*)       # optional whitespace between URI and title
                      ((?<=[ \t])[^\n$close_re]+)? #title if there was whitespace
                      ($close_re)    # closing delimiter
                    |
                      ($uri_re)      # just the URI itself
                  )
                 }{
        my ($open, $uri_match, $whitespace, $title, $close, $just_uri) =
            ($1, $2, $3, $4, $5, $6);

        # The second alternative (bare URI) captures into $6 only.
        $uri_match = $just_uri if defined $just_uri;

        # Groups that did not participate are undef; normalise them to
        # empty strings without clobbering false-but-real values like "0".
        foreach ( $open, $whitespace, $title, $close ) {
            $_ = "" unless defined $_;
        }

        # Built before _is_uri is called, since that receives a reference
        # to $uri_match.
        my $orig_text = qq|$open$uri_match$whitespace$title$close|;

        # The value of the branch taken becomes the replacement text.
        if ( $self->_is_uri( \$uri_match ) ) { # if not a false alarm
            $urlsfound++;
            $self->{callback}->($open, $close, $uri_match, $title, $whitespace);
        } else {
            $orig_text;
        }
    }egx;

    return $urlsfound;
}
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
L<URI::Find>. |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 AUTHOR |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Kake Pugh (kake@earth.li). |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Copyright (C) 2003 Kake Pugh. All Rights Reserved. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
|
206
|
|
|
|
|
|
|
under the same terms as Perl itself. |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=head1 CREDITS |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
Tim Bagot helped me stop faffing over the name, by pointing out that |
|
211
|
|
|
|
|
|
|
RFC 2396 Appendix E uses "delimited". Dave Hinton helped me fix the |
|
212
|
|
|
|
|
|
|
regex to make it work for delimited URIs with no title. Nick Cleaton |
|
213
|
|
|
|
|
|
|
helped me make C<ignore_quoted> work. Some of the code was taken from |
|
214
|
|
|
|
|
|
|
L<URI::Find>. |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
1; |