| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Lossy; |
|
2
|
|
|
|
|
|
|
|
|
3
|
13
|
|
|
13
|
|
396351
|
use 5.008; |
|
|
13
|
|
|
|
|
53
|
|
|
|
13
|
|
|
|
|
570
|
|
|
4
|
13
|
|
|
13
|
|
74
|
use strict; |
|
|
13
|
|
|
|
|
23
|
|
|
|
13
|
|
|
|
|
445
|
|
|
5
|
13
|
|
|
13
|
|
65
|
use warnings; |
|
|
13
|
|
|
|
|
27
|
|
|
|
13
|
|
|
|
|
395
|
|
|
6
|
13
|
|
|
13
|
|
3241
|
use utf8; |
|
|
13
|
|
|
|
|
49
|
|
|
|
13
|
|
|
|
|
90
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
13
|
|
|
13
|
|
466
|
use Carp; |
|
|
13
|
|
|
|
|
26
|
|
|
|
13
|
|
|
|
|
1369
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 NAME |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Text::Lossy - Lossy text compression |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 VERSION |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Version 0.40.2 |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=cut |
|
19
|
|
|
|
|
|
|
|
|
20
|
13
|
|
|
13
|
|
15467
|
use version 0.77; our $VERSION = version->declare('v0.40.2'); |
|
|
13
|
|
|
|
|
37181
|
|
|
|
13
|
|
|
|
|
132
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
use Text::Lossy; |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $lossy = Text::Lossy->new; |
|
28
|
|
|
|
|
|
|
$lossy->add('whitespace'); |
|
29
|
|
|
|
|
|
|
my $short = $lossy->process($long); |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my $lossy = Text::Lossy->new->add('lower', 'punctuation'); # Chaining usage |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
$lossy->process($long); # In place |
|
34
|
|
|
|
|
|
|
$lossy->process(); # Filters $_ in place |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
C is a collection of text filters for lossy compression. |
|
39
|
|
|
|
|
|
|
"Lossy compression" changes the data in a way which is irreversible, |
|
40
|
|
|
|
|
|
|
but results in a smaller file size after compression. One of the best |
|
41
|
|
|
|
|
|
|
known lossy compression uses is the JPEG image format. |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Note that this module does not perform the actual compression itself, |
|
44
|
|
|
|
|
|
|
it merely changes the text so that it may be compressed better. |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head2 Alpha software |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
This code is currently B. Anything can and will change, |
|
49
|
|
|
|
|
|
|
most likely in a backwards-incompatible manner. You have been warned. |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head2 Usage |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
C uses an object oriented interface. You create a new |
|
54
|
|
|
|
|
|
|
C object, set the filters you wish to use (described below), |
|
55
|
|
|
|
|
|
|
and call the L method on the object. You can call this |
|
56
|
|
|
|
|
|
|
method as often as you like. In addition, there is a method which produces |
|
57
|
|
|
|
|
|
|
a closure, an anonymous subroutine, that acts like the process method on |
|
58
|
|
|
|
|
|
|
the given object. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 Adding new filters |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
New filters can be added with the L class method. |
|
63
|
|
|
|
|
|
|
Each filter is a subroutine which takes a single string and returns this |
|
64
|
|
|
|
|
|
|
string filtered. |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=cut |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
our %filtermap; |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 new |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my $lossy = Text::Lossy->new(); |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
The constructor for a new lossy text compressor. The constructor is quite |
|
77
|
|
|
|
|
|
|
light-weight; the only purpose of a compressor object is to accept and remember |
|
78
|
|
|
|
|
|
|
a sequence of filters to apply to text. |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
The constructor takes no arguments. |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=cut |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub new { |
|
85
|
17
|
|
|
17
|
1
|
692
|
my $class = shift; |
|
86
|
17
|
|
|
|
|
71
|
my $self = { |
|
87
|
|
|
|
|
|
|
filters => [], |
|
88
|
|
|
|
|
|
|
}; |
|
89
|
17
|
|
|
|
|
113
|
return bless $self, $class; |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=head1 METHODS |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head2 process |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
my $new_text = $lossy->process( $old_text ); |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
This method takes a single text string, applies all the selected filters |
|
99
|
|
|
|
|
|
|
to it, and returns the filtered string. Filters are selected via |
|
100
|
|
|
|
|
|
|
L; see L. |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
The text is upgraded to character semantics via a call to |
|
103
|
|
|
|
|
|
|
C, see L. This will not change the text you passed |
|
104
|
|
|
|
|
|
|
in, nor should it have too surprising an effect on the output. |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
If no text is passed in, nothing is returned (the empty list or C, |
|
107
|
|
|
|
|
|
|
depending on context). |
|
108
|
|
|
|
|
|
|
If an explicit C is passed in, an explicit C is returned, even in |
|
109
|
|
|
|
|
|
|
list context. |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub process { |
|
114
|
46
|
|
|
46
|
1
|
2458
|
my ($self, $text) = @_; |
|
115
|
46
|
100
|
|
|
|
143
|
return unless @_ > 1; |
|
116
|
44
|
100
|
|
|
|
138
|
return undef unless defined $text; |
|
117
|
42
|
|
|
|
|
111
|
utf8::upgrade($text); |
|
118
|
42
|
|
|
|
|
50
|
foreach my $f (@{$self->{'filters'}}) { |
|
|
42
|
|
|
|
|
245
|
|
|
119
|
47
|
|
|
|
|
127
|
$text = $f->{'code'}->($text); |
|
120
|
|
|
|
|
|
|
} |
|
121
|
42
|
|
|
|
|
5954
|
return $text; |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head2 add |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
$lossy->add( 'lower', 'whitespace' ); |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
This method takes a list of filter names and adds them to the filter list |
|
129
|
|
|
|
|
|
|
of the filter object, in the order given. This allows a programmatic |
|
130
|
|
|
|
|
|
|
selection of filters, for example via command line. Returns the object |
|
131
|
|
|
|
|
|
|
for method chaining. |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
If the filter is unknown, an exception is thrown. This may happen when you |
|
134
|
|
|
|
|
|
|
misspell the name, or forgot to use a module which registers the filter, |
|
135
|
|
|
|
|
|
|
or forgot to register it yourself. |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
sub add { |
|
140
|
23
|
|
|
23
|
1
|
145
|
my ($self, @filters) = @_; |
|
141
|
23
|
|
|
|
|
54
|
foreach my $name (@filters) { |
|
142
|
31
|
|
|
|
|
70
|
my $code = $filtermap{$name}; |
|
143
|
31
|
100
|
|
|
|
94
|
if (not $code) { |
|
144
|
2
|
|
|
|
|
57
|
croak "Unknown filter $name (did you forget to use the right module?)"; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
29
|
|
|
|
|
41
|
push @{$self->{'filters'}}, { code => $code, name => $name }; |
|
|
29
|
|
|
|
|
181
|
|
|
147
|
|
|
|
|
|
|
} |
|
148
|
21
|
|
|
|
|
68
|
return $self; |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head2 clear |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
$lossy->clear(); |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Remove the filters from the filter object. The object will behave as |
|
156
|
|
|
|
|
|
|
if newly constructed. Returns the object for method chaining. |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=cut |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
sub clear { |
|
161
|
3
|
|
|
3
|
1
|
9
|
my ($self) = @_; |
|
162
|
3
|
|
|
|
|
7
|
@{$self->{'filters'}} = (); |
|
|
3
|
|
|
|
|
15
|
|
|
163
|
3
|
|
|
|
|
9
|
return $self; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 list |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
my @names = $lossy->list(); |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
List the filters added to this object, in order. The names (not the |
|
171
|
|
|
|
|
|
|
code) are returned in a list. |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=cut |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
sub list { |
|
176
|
5
|
|
|
5
|
1
|
12
|
my ($self) = @_; |
|
177
|
5
|
|
|
|
|
7
|
return map $_->{'name'}, @{$self->{'filters'}}; |
|
|
5
|
|
|
|
|
51
|
|
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head2 as_coderef |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
my $code = $lossy->as_coderef(); |
|
183
|
|
|
|
|
|
|
$new_text = $code->( $old_text ); |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Returns a code reference that closes over the object. This code reference |
|
186
|
|
|
|
|
|
|
acts like a bound L method on the constructed object. It |
|
187
|
|
|
|
|
|
|
can be used in places like L that expect a code reference that |
|
188
|
|
|
|
|
|
|
filters text. |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
The code reference is bound to the object, not a particular object state. |
|
191
|
|
|
|
|
|
|
Adding filters to the object after calling C will also change |
|
192
|
|
|
|
|
|
|
the behaviour of the code reference. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub as_coderef { |
|
197
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
|
198
|
|
|
|
|
|
|
return sub { |
|
199
|
0
|
|
|
0
|
|
0
|
return $self->process(@_); |
|
200
|
|
|
|
|
|
|
} |
|
201
|
0
|
|
|
|
|
0
|
} |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 FILTERS |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
The following filters are defined by this module. Other modules may define |
|
206
|
|
|
|
|
|
|
more filters. |
|
207
|
|
|
|
|
|
|
Each of these filters can be added to the set via the L method. |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head2 lower |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Corresponds exactly to the L builtin in Perl, up |
|
212
|
|
|
|
|
|
|
to and including its Unicode handling. |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=cut |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub lower { |
|
217
|
14
|
|
|
14
|
1
|
18
|
my ($text) = @_; |
|
218
|
14
|
|
|
|
|
157
|
return lc($text); |
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head2 whitespace |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Collapses any whitespace (C<\s> in regular expressions) to a single space, C. |
|
224
|
|
|
|
|
|
|
Whitespace at the beginning of the text is stripped completely. Whitespace at the end |
|
225
|
|
|
|
|
|
|
is also collapsed to a single space, to help separate lines. Text consisting only |
|
226
|
|
|
|
|
|
|
of whitespace results in an empty string. |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=cut |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
sub whitespace { |
|
231
|
8
|
|
|
8
|
1
|
12
|
my ($text) = @_; |
|
232
|
8
|
|
|
|
|
48
|
$text =~ s{ \s+ }{ }xmsg; |
|
233
|
|
|
|
|
|
|
# the above line also works for the end of the text |
|
234
|
8
|
|
|
|
|
26
|
$text =~ s{ \A \s+ }{}xms; |
|
235
|
8
|
|
|
|
|
20
|
return $text; |
|
236
|
|
|
|
|
|
|
} |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=head2 whitespace_nl |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
A variant of the L filter that leaves newlines on the end of the text |
|
241
|
|
|
|
|
|
|
alone. Other whitespace at the end will get collapsed into a single newline. |
|
242
|
|
|
|
|
|
|
If the text ends in whitespace that does not contain a new line, it is replaced |
|
243
|
|
|
|
|
|
|
by a space, as before. |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
This filter is most useful if you are creating a Unix-style text filter, and do not |
|
246
|
|
|
|
|
|
|
want to buffer the entire input before writing the (only) line to C. The |
|
247
|
|
|
|
|
|
|
newline at the end will allow downstream processes to work on new lines, too. |
|
248
|
|
|
|
|
|
|
Otherwise, this filter is not quite as efficient as the L filter. |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
Any newlines in the middle of text are collapsed to a space, too. This is especially |
|
251
|
|
|
|
|
|
|
useful if you are reading in "paragraph mode", e.g. C<$/ = ''>, as you will get |
|
252
|
|
|
|
|
|
|
one long line per former paragraph. |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=cut |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
sub whitespace_nl { |
|
257
|
7
|
|
|
7
|
1
|
14
|
my ($text) = @_; |
|
258
|
|
|
|
|
|
|
# Remember whether a newline was present |
|
259
|
7
|
100
|
|
|
|
35
|
my $has_nl = ($text =~ m{ \n \s* \z }xms) ? 1 : 0; |
|
260
|
7
|
|
|
|
|
55
|
$text =~ s{ \s+ }{ }xmsg; |
|
261
|
7
|
|
|
|
|
24
|
$text =~ s{ \A \s+ }{}xms; |
|
262
|
|
|
|
|
|
|
# whitespace-at-end is now a space |
|
263
|
7
|
100
|
|
|
|
17
|
if ($has_nl) { |
|
264
|
|
|
|
|
|
|
# replace this space with a newline |
|
265
|
2
|
|
|
|
|
11
|
$text =~ s{ \s+ \z }{\n}xms; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
7
|
|
|
|
|
26
|
return $text; |
|
268
|
|
|
|
|
|
|
} |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=head2 punctuation |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
Strips punctuation, that is anything matching C<\p{Punctuation}>. It is replaced by |
|
273
|
|
|
|
|
|
|
nothing, removing it completely. |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=cut |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub punctuation { |
|
278
|
6
|
|
|
6
|
1
|
12
|
my ($text) = @_; |
|
279
|
|
|
|
|
|
|
# Turns out '\p{Punctuation}' fails on Perl 5.6, use the abbreviation '\pP' instead |
|
280
|
13
|
|
|
13
|
|
11223
|
$text =~ s{ \pP }{}xmsg; |
|
|
13
|
|
|
|
|
35
|
|
|
|
13
|
|
|
|
|
211
|
|
|
|
6
|
|
|
|
|
77
|
|
|
281
|
6
|
|
|
|
|
18
|
return $text; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=head2 punctuation_sp |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
A variant of L that replaces punctuation with a space character, C, |
|
287
|
|
|
|
|
|
|
instead of removing it completely. This is usually less efficient for compression, but |
|
288
|
|
|
|
|
|
|
retains more readability, for example in the presence of URLs or email addresses. |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub punctuation_sp { |
|
293
|
3
|
|
|
3
|
1
|
5
|
my ($text) = @_; |
|
294
|
|
|
|
|
|
|
# Turns out '\p{Punctuation}' fails on Perl 5.6, use the abbreviation '\pP' instead |
|
295
|
3
|
|
|
|
|
25
|
$text =~ s{ \pP }{ }xmsg; |
|
296
|
3
|
|
|
|
|
10
|
return $text; |
|
297
|
|
|
|
|
|
|
} |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=head2 alphabetize |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
Leaves the first and last letters of a word alone, but replaces the interior letters with |
|
302
|
|
|
|
|
|
|
the same set, sorted by the L function. This is done on the observation |
|
303
|
|
|
|
|
|
|
(source uncertain at the time) that words can still be made out if the letters are present, but |
|
304
|
|
|
|
|
|
|
in a different order, as long as the outer ones remain the same. |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
This filter may not work as proposed with every language or writing system. Specifically, it |
|
307
|
|
|
|
|
|
|
uses end-of-word matches C<\b> to determine which letters to leave alone. |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=cut |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
sub alphabetize { |
|
312
|
6
|
|
|
6
|
1
|
7
|
my ($text) = @_; |
|
313
|
6
|
|
|
|
|
81
|
$text =~ s{ \b (\p{Alpha}) (\p{Alpha}+) (\p{Alpha}) \b }{ $1 . join('', sort split(//,$2)) . $3 }xmseg; |
|
|
6
|
|
|
|
|
52
|
|
|
314
|
6
|
|
|
|
|
23
|
return $text; |
|
315
|
|
|
|
|
|
|
} |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# TODO: |
|
318
|
|
|
|
|
|
|
# - unidecode (separate module) |
|
319
|
|
|
|
|
|
|
# - normalize (separate module) |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=head1 CLASS METHODS |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
These methods are not called on a filter object, but on the class C |
|
324
|
|
|
|
|
|
|
itself. They are typically concerned with the filters that can be added to filter |
|
325
|
|
|
|
|
|
|
objects. |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=head2 register_filters |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
Text::Lossy->register_filters( |
|
330
|
|
|
|
|
|
|
change_stuff => \&Other::Module::change_text, |
|
331
|
|
|
|
|
|
|
remove_ps => sub { my ($text) = @_; $text =~ s{[Pp]}{}; return $text; }, |
|
332
|
|
|
|
|
|
|
); |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Adds one or more named filters to the set of available filters. Filters are |
|
335
|
|
|
|
|
|
|
passed in an anonymous hash. |
|
336
|
|
|
|
|
|
|
Previously defined mappings may be overwritten by this function. |
|
337
|
|
|
|
|
|
|
Specifically, passing C as the code reference removes the filter. |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
=cut |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
%filtermap = ( |
|
342
|
|
|
|
|
|
|
'lower' => \&lower, |
|
343
|
|
|
|
|
|
|
'whitespace' => \&whitespace, |
|
344
|
|
|
|
|
|
|
'whitespace_nl' => \&whitespace_nl, |
|
345
|
|
|
|
|
|
|
'punctuation' => \&punctuation, |
|
346
|
|
|
|
|
|
|
'punctuation_sp' => \&punctuation_sp, |
|
347
|
|
|
|
|
|
|
'alphabetize' => \&alphabetize, |
|
348
|
|
|
|
|
|
|
); |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
sub register_filters { |
|
351
|
7
|
|
|
7
|
1
|
1507
|
my ($class, %mapping) = @_; |
|
352
|
7
|
|
|
|
|
101
|
foreach my $name (keys %mapping) { |
|
353
|
11
|
100
|
|
|
|
121
|
if (defined $mapping{$name}) { |
|
354
|
3
|
|
|
|
|
11
|
$filtermap{$name} = $mapping{$name}; |
|
355
|
|
|
|
|
|
|
} else { |
|
356
|
8
|
|
|
|
|
30
|
delete $filtermap{$name}; |
|
357
|
|
|
|
|
|
|
} |
|
358
|
|
|
|
|
|
|
} |
|
359
|
7
|
|
|
|
|
29
|
return; |
|
360
|
|
|
|
|
|
|
} |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=head2 available_filters |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
my @filters = Text::Lossy->available_filters(); |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Lists the available filters at this point in time, specifically their names |
|
367
|
|
|
|
|
|
|
as used by L and L. The list is sorted alphabetically. |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub available_filters { |
|
372
|
4
|
|
|
4
|
1
|
1410
|
my ($class) = @_; |
|
373
|
4
|
|
|
|
|
62
|
return sort keys %filtermap; |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head1 CREATING FILTERS |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
A filter is a subroutine which takes a single parameter (the text to be converted) and |
|
379
|
|
|
|
|
|
|
returns the filtered text. The text may also be changed in-place, as long as it is |
|
380
|
|
|
|
|
|
|
returned again. |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
These filters are then made available to the rest of the system via the |
|
383
|
|
|
|
|
|
|
L function. |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=head1 USAGE WITH Text::Filter |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
The L module provides an infrastructure for filtering text, but no actual filters. |
|
388
|
|
|
|
|
|
|
It can be used with C by passing the result of L as the C |
|
389
|
|
|
|
|
|
|
parameter. |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
It is recommended to set L to leave line endings alone when using the L |
|
392
|
|
|
|
|
|
|
filter, i.e. the L and |
|
393
|
|
|
|
|
|
|
L should be C<0>. This is the default |
|
394
|
|
|
|
|
|
|
for L. It will allow L to perform its assigned task on line endings. |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
One thing to note is that the C filters do not follow the L's convention |
|
397
|
|
|
|
|
|
|
that lines "to be skipped" should result in an C. |
|
398
|
|
|
|
|
|
|
This means you need to expect completely empty lines (C, not even a newline character) in |
|
399
|
|
|
|
|
|
|
your output. |
|
400
|
|
|
|
|
|
|
This should be no problem if you print to a file handle or append to a string, but may be surprising |
|
401
|
|
|
|
|
|
|
if you are filtering an array of lines. |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 EXPORT |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Nothing exported or exportable; use the OO interface instead. |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=head1 UNICODE |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
This code strives to be completely Unicode compatible. All filters aim to "do the right thing" on non-ASCII strings. |
|
410
|
|
|
|
|
|
|
Any failure to handle Unicode should be considered a bug; please report it. |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=head1 AUTHOR |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
Ben Deutsch, C<< >> |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=head1 BUGS |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
None known so far. |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through |
|
421
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll |
|
422
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=head1 SUPPORT |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
perldoc Text::Lossy |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
You can also look for information at: |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=over 4 |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here) |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
L |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
L |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=item * CPAN Ratings |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
L |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=item * Search CPAN |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
L |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=back |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
Copyright 2012 Ben Deutsch. |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
462
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
|
463
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=cut |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
1; # End of Text::Lossy |