File Coverage

blib/lib/Text/Lossy.pm
Criterion Covered Total %
statement 78 81 96.3
branch 12 12 100.0
condition n/a
subroutine 20 22 90.9
pod 14 14 100.0
total 124 129 96.1


line stmt bran cond sub pod time code
1             package Text::Lossy;
2              
3 13     13   396351 use 5.008;
  13         53  
  13         570  
4 13     13   74 use strict;
  13         23  
  13         445  
5 13     13   65 use warnings;
  13         27  
  13         395  
6 13     13   3241 use utf8;
  13         49  
  13         90  
7              
8 13     13   466 use Carp;
  13         26  
  13         1369  
9              
10             =head1 NAME
11              
12             Text::Lossy - Lossy text compression
13              
14             =head1 VERSION
15              
16             Version 0.40.2
17              
18             =cut
19              
20 13     13   15467 use version 0.77; our $VERSION = version->declare('v0.40.2');
  13         37181  
  13         132  
21              
22              
23             =head1 SYNOPSIS
24              
25             use Text::Lossy;
26              
27             my $lossy = Text::Lossy->new;
28             $lossy->add('whitespace');
29             my $short = $lossy->process($long);
30              
31             my $lossy = Text::Lossy->new->add('lower', 'punctuation'); # Chaining usage
32              
33             $lossy->process($long); # In place
34             $lossy->process(); # Filters $_ in place
35              
36             =head1 DESCRIPTION
37              
38             C is a collection of text filters for lossy compression.
39             "Lossy compression" changes the data in a way which is irreversible,
40             but results in a smaller file size after compression. One of the best
41             known lossy compression uses is the JPEG image format.
42              
43             Note that this module does not perform the actual compression itself,
44             it merely changes the text so that it may be compressed better.
45              
46             =head2 Alpha software
47              
48             This code is currently B. Anything can and will change,
49             most likely in a backwards-incompatible manner. You have been warned.
50              
51             =head2 Usage
52              
53             C uses an object oriented interface. You create a new
54             C object, set the filters you wish to use (described below),
55             and call the L method on the object. You can call this
56             method as often as you like. In addition, there is a method which produces
57             a closure, an anonymous subroutine, that acts like the process method on
58             the given object.
59              
60             =head2 Adding new filters
61              
62             New filters can be added with the L class method.
63             Each filter is a subroutine which takes a single string and returns this
64             string filtered.
65              
66             =cut
67              
68             our %filtermap;
69              
70             =head1 CONSTRUCTORS
71              
72             =head2 new
73              
74             my $lossy = Text::Lossy->new();
75              
76             The constructor for a new lossy text compressor. The constructor is quite
77             light-weight; the only purpose of a compressor object is to accept and remember
78             a sequence of filters to apply to text.
79              
80             The constructor takes no arguments.
81              
82             =cut
83              
84             sub new {
85 17     17 1 692 my $class = shift;
86 17         71 my $self = {
87             filters => [],
88             };
89 17         113 return bless $self, $class;
90             }
91              
92             =head1 METHODS
93              
94             =head2 process
95              
96             my $new_text = $lossy->process( $old_text );
97              
98             This method takes a single text string, applies all the selected filters
99             to it, and returns the filtered string. Filters are selected via
100             L; see L.
101              
102             The text is upgraded to character semantics via a call to
103             C, see L. This will not change the text you passed
104             in, nor should it have too surprising an effect on the output.
105              
106             If no text is passed in, nothing is returned (the empty list or C,
107             depending on context).
108             If an explicit C is passed in, an explicit C is returned, even in
109             list context.
110              
111             =cut
112              
113             sub process {
114 46     46 1 2458 my ($self, $text) = @_;
115 46 100       143 return unless @_ > 1;
116 44 100       138 return undef unless defined $text;
117 42         111 utf8::upgrade($text);
118 42         50 foreach my $f (@{$self->{'filters'}}) {
  42         245  
119 47         127 $text = $f->{'code'}->($text);
120             }
121 42         5954 return $text;
122             }
123              
124             =head2 add
125              
126             $lossy->add( 'lower', 'whitespace' );
127              
128             This method takes a list of filter names and adds them to the filter list
129             of the filter object, in the order given. This allows a programmatic
130             selection of filters, for example via command line. Returns the object
131             for method chaining.
132              
133             If the filter is unknown, an exception is thrown. This may happen when you
134             misspell the name, or forgot to use a module which registers the filter,
135             or forgot to register it yourself.
136              
137             =cut
138              
139             sub add {
140 23     23 1 145 my ($self, @filters) = @_;
141 23         54 foreach my $name (@filters) {
142 31         70 my $code = $filtermap{$name};
143 31 100       94 if (not $code) {
144 2         57 croak "Unknown filter $name (did you forget to use the right module?)";
145             }
146 29         41 push @{$self->{'filters'}}, { code => $code, name => $name };
  29         181  
147             }
148 21         68 return $self;
149             }
150              
151             =head2 clear
152              
153             $lossy->clear();
154              
155             Remove the filters from the filter object. The object will behave as
156             if newly constructed. Returns the object for method chaining.
157              
158             =cut
159              
160             sub clear {
161 3     3 1 9 my ($self) = @_;
162 3         7 @{$self->{'filters'}} = ();
  3         15  
163 3         9 return $self;
164             }
165              
166             =head2 list
167              
168             my @names = $lossy->list();
169              
170             List the filters added to this object, in order. The names (not the
171             code) are returned in a list.
172              
173             =cut
174              
175             sub list {
176 5     5 1 12 my ($self) = @_;
177 5         7 return map $_->{'name'}, @{$self->{'filters'}};
  5         51  
178             }
179              
180             =head2 as_coderef
181              
182             my $code = $lossy->as_coderef();
183             $new_text = $code->( $old_text );
184              
185             Returns a code reference that closes over the object. This code reference
186             acts like a bound L method on the constructed object. It
187             can be used in places like L that expect a code reference that
188             filters text.
189              
190             The code reference is bound to the object, not a particular object state.
191             Adding filters to the object after calling C will also change
192             the behaviour of the code reference.
193              
194             =cut
195              
196             sub as_coderef {
197 0     0 1 0 my ($self) = @_;
198             return sub {
199 0     0   0 return $self->process(@_);
200             }
201 0         0 }
202              
203             =head1 FILTERS
204              
205             The following filters are defined by this module. Other modules may define
206             more filters.
207             Each of these filters can be added to the set via the L method.
208              
209             =head2 lower
210              
211             Corresponds exactly to the L builtin in Perl, up
212             to and including its Unicode handling.
213              
214             =cut
215              
216             sub lower {
217 14     14 1 18 my ($text) = @_;
218 14         157 return lc($text);
219             }
220              
221             =head2 whitespace
222              
223             Collapses any whitespace (C<\s> in regular expressions) to a single space, C.
224             Whitespace at the beginning of the text is stripped completely. Whitespace at the end
225             is also collapsed to a single space, to help separate lines. Text consisting only
226             of whitespace results in an empty string.
227              
228             =cut
229              
230             sub whitespace {
231 8     8 1 12 my ($text) = @_;
232 8         48 $text =~ s{ \s+ }{ }xmsg;
233             # the above line also works for the end of the text
234 8         26 $text =~ s{ \A \s+ }{}xms;
235 8         20 return $text;
236             }
237              
238             =head2 whitespace_nl
239              
240             A variant of the L filter that leaves newlines on the end of the text
241             alone. Other whitespace at the end will get collapsed into a single newline.
242             If the text ends in whitespace that does not contain a new line, it is replaced
243             by a space, as before.
244              
245             This filter is most useful if you are creating a Unix-style text filter, and do not
246             want to buffer the entire input before writing the (only) line to C. The
247             newline at the end will allow downstream processes to work on new lines, too.
248             Otherwise, this filter is not quite as efficient as the L filter.
249              
250             Any newlines in the middle of text are collapsed to a space, too. This is especially
251             useful if you are reading in "paragraph mode", e.g. C<$/ = ''>, as you will get
252             one long line per former paragraph.
253              
254             =cut
255              
256             sub whitespace_nl {
257 7     7 1 14 my ($text) = @_;
258             # Remember whether a newline was present
259 7 100       35 my $has_nl = ($text =~ m{ \n \s* \z }xms) ? 1 : 0;
260 7         55 $text =~ s{ \s+ }{ }xmsg;
261 7         24 $text =~ s{ \A \s+ }{}xms;
262             # whitespace-at-end is now a space
263 7 100       17 if ($has_nl) {
264             # replace this space with a newline
265 2         11 $text =~ s{ \s+ \z }{\n}xms;
266             }
267 7         26 return $text;
268             }
269              
270             =head2 punctuation
271              
272             Strips punctuation, that is anything matching C<\p{Punctuation}>. It is replaced by
273             nothing, removing it completely.
274              
275             =cut
276              
277             sub punctuation {
278 6     6 1 12 my ($text) = @_;
279             # Turns out '\p{Punctuation}' fails on Perl 5.6, use the abbreviation '\pP' instead
280 13     13   11223 $text =~ s{ \pP }{}xmsg;
  13         35  
  13         211  
  6         77  
281 6         18 return $text;
282             }
283              
284             =head2 punctuation_sp
285              
286             A variant of L that replaces punctuation with a space character, C,
287             instead of removing it completely. This is usually less efficient for compression, but
288             retains more readability, for example in the presence of URLs or email addresses.
289              
290             =cut
291              
292             sub punctuation_sp {
293 3     3 1 5 my ($text) = @_;
294             # Turns out '\p{Punctuation}' fails on Perl 5.6, use the abbreviation '\pP' instead
295 3         25 $text =~ s{ \pP }{ }xmsg;
296 3         10 return $text;
297             }
298              
299             =head2 alphabetize
300              
301             Leaves the first and last letters of a word alone, but replaces the interior letters with
302             the same set, sorted by the L function. This is done on the observation
303             (source uncertain at the time) that words can still be made out if the letters are present, but
304             in a different order, as long as the outer ones remain the same.
305              
306             This filter may not work as proposed with every language or writing system. Specifically, it
307             uses end-of-word matches C<\b> to determine which letters to leave alone.
308              
309             =cut
310              
311             sub alphabetize {
312 6     6 1 7 my ($text) = @_;
313 6         81 $text =~ s{ \b (\p{Alpha}) (\p{Alpha}+) (\p{Alpha}) \b }{ $1 . join('', sort split(//,$2)) . $3 }xmseg;
  6         52  
314 6         23 return $text;
315             }
316              
317             # TODO:
318             # - unidecode (separate module)
319             # - normalize (separate module)
320              
321             =head1 CLASS METHODS
322              
323             These methods are not called on a filter object, but on the class C
324             itself. They are typically concerned with the filters that can be added to filter
325             objects.
326              
327             =head2 register_filters
328              
329             Text::Lossy->register_filters(
330             change_stuff => \&Other::Module::change_text,
331             remove_ps => sub { my ($text) = @_; $text =~ s{[Pp]}{}; return $text; },
332             );
333              
334             Adds one or more named filters to the set of available filters. Filters are
335             passed in an anonymous hash.
336             Previously defined mappings may be overwritten by this function.
337             Specifically, passing C as the code reference removes the filter.
338              
339             =cut
340              
341             %filtermap = (
342             'lower' => \&lower,
343             'whitespace' => \&whitespace,
344             'whitespace_nl' => \&whitespace_nl,
345             'punctuation' => \&punctuation,
346             'punctuation_sp' => \&punctuation_sp,
347             'alphabetize' => \&alphabetize,
348             );
349              
350             sub register_filters {
351 7     7 1 1507 my ($class, %mapping) = @_;
352 7         101 foreach my $name (keys %mapping) {
353 11 100       121 if (defined $mapping{$name}) {
354 3         11 $filtermap{$name} = $mapping{$name};
355             } else {
356 8         30 delete $filtermap{$name};
357             }
358             }
359 7         29 return;
360             }
361              
362             =head2 available_filters
363              
364             my @filters = Text::Lossy->available_filters();
365              
366             Lists the available filters at this point in time, specifically their names
367             as used by L and L. The list is sorted alphabetically.
368              
369             =cut
370              
371             sub available_filters {
372 4     4 1 1410 my ($class) = @_;
373 4         62 return sort keys %filtermap;
374             }
375              
376             =head1 CREATING FILTERS
377              
378             A filter is a subroutine which takes a single parameter (the text to be converted) and
379             returns the filtered text. The text may also be changed in-place, as long as it is
380             returned again.
381              
382             These filters are then made available to the rest of the system via the
383             L function.
384              
385             =head1 USAGE WITH Text::Filter
386              
387             The L module provides an infrastructure for filtering text, but no actual filters.
388             It can be used with C by passing the result of L as the C
389             parameter.
390              
391             It is recommended to set L to leave line endings alone when using the L
392             filter, i.e. the L and
393             L should be C<0>. This is the default
394             for L. It will allow L to perform its assigned task on line endings.
395              
396             One thing to note is that the C filters do not follow the L's convention
397             that lines "to be skipped" should result in an C.
398             This means you need to expect completely empty lines (C, not even a newline character) in
399             your output.
400             This should be no problem if you print to a file handle or append to a string, but may be surprising
401             if you are filtering an array of lines.
402              
403             =head1 EXPORT
404              
405             Nothing exported or exportable; use the OO interface instead.
406              
407             =head1 UNICODE
408              
409             This code strives to be completely Unicode compatible. All filters aim to "do the right thing" on non-ASCII strings.
410             Any failure to handle Unicode should be considered a bug; please report it.
411              
412             =head1 AUTHOR
413              
414             Ben Deutsch, C<< >>
415              
416             =head1 BUGS
417              
418             None known so far.
419              
420             Please report any bugs or feature requests to C, or through
421             the web interface at L. I will be notified, and then you'll
422             automatically be notified of progress on your bug as I make changes.
423              
424             =head1 SUPPORT
425              
426             You can find documentation for this module with the perldoc command.
427              
428             perldoc Text::Lossy
429              
430              
431             You can also look for information at:
432              
433             =over 4
434              
435             =item * RT: CPAN's request tracker (report bugs here)
436              
437             L
438              
439             =item * AnnoCPAN: Annotated CPAN documentation
440              
441             L
442              
443             =item * CPAN Ratings
444              
445             L
446              
447             =item * Search CPAN
448              
449             L
450              
451             =back
452              
453              
454             =head1 ACKNOWLEDGEMENTS
455              
456              
457             =head1 LICENSE AND COPYRIGHT
458              
459             Copyright 2012 Ben Deutsch.
460              
461             This program is free software; you can redistribute it and/or modify it
462             under the terms of either: the GNU General Public License as published
463             by the Free Software Foundation; or the Artistic License.
464              
465             See http://dev.perl.org/licenses/ for more information.
466              
467              
468             =cut
469              
470             1; # End of Text::Lossy