File Coverage

blib/lib/Text/Lossy.pm

Criterion	Covered	Total	%
statement	78	81	96.3
branch	12	12	100.0
condition			n/a
subroutine	20	22	90.9
pod	14	14	100.0
total	124	129	96.1

line	stmt	bran	sub	pod	time	code
1						package Text::Lossy;
2
3	13		13		396351	use 5.008;
	13				53
	13				570
4	13		13		74	use strict;
	13				23
	13				445
5	13		13		65	use warnings;
	13				27
	13				395
6	13		13		3241	use utf8;
	13				49
	13				90
7
8	13		13		466	use Carp;
	13				26
	13				1369
9
10						=head1 NAME
11
12						Text::Lossy - Lossy text compression
13
14						=head1 VERSION
15
16						Version 0.40.2
17
18						=cut
19
20	13		13		15467	use version 0.77; our $VERSION = version->declare('v0.40.2');
	13				37181
	13				132
21
22
23						=head1 SYNOPSIS
24
25						use Text::Lossy;
26
27						my $lossy = Text::Lossy->new;
28						$lossy->add('whitespace');
29						my $short = $lossy->process($long);
30
31						my $lossy = Text::Lossy->new->add('lower', 'punctuation'); # Chaining usage
32
33						$lossy->process($long); # In place
34						$lossy->process(); # Filters $_ in place
35
36						=head1 DESCRIPTION
37
38						C is a collection of text filters for lossy compression.
39						"Lossy compression" changes the data in a way which is irreversible,
40						but results in a smaller file size after compression. One of the best
41						known lossy compression uses is the JPEG image format.
42
43						Note that this module does not perform the actual compression itself,
44						it merely changes the text so that it may be compressed better.
45
46						=head2 Alpha software
47
48						This code is currently B. Anything can and will change,
49						most likely in a backwards-incompatible manner. You have been warned.
50
51						=head2 Usage
52
53						C uses an object oriented interface. You create a new
54						C object, set the filters you wish to use (described below),
55						and call the L method on the object. You can call this
56						method as often as you like. In addition, there is a method which produces
57						a closure, an anonymous subroutine, that acts like the process method on
58						the given object.
59
60						=head2 Adding new filters
61
62						New filters can be added with the L class method.
63						Each filter is a subroutine which takes a single string and returns this
64						string filtered.
65
66						=cut
67
68						our %filtermap;
69
70						=head1 CONSTRUCTORS
71
72						=head2 new
73
74						my $lossy = Text::Lossy->new();
75
76						The constructor for a new lossy text compressor. The constructor is quite
77						light-weight; the only purpose of a compressor object is to accept and remember
78						a sequence of filters to apply to text.
79
80						The constructor takes no arguments.
81
82						=cut
83
84						sub new {
85	17		17	1	692	my $class = shift;
86	17				71	my $self = {
87						filters => [],
88						};
89	17				113	return bless $self, $class;
90						}
91
92						=head1 METHODS
93
94						=head2 process
95
96						my $new_text = $lossy->process( $old_text );
97
98						This method takes a single text string, applies all the selected filters
99						to it, and returns the filtered string. Filters are selected via
100						L; see L.
101
102						The text is upgraded to character semantics via a call to
103						C, see L. This will not change the text you passed
104						in, nor should it have too surprising an effect on the output.
105
106						If no text is passed in, nothing is returned (the empty list or C,
107						depending on context).
108						If an explicit C is passed in, an explicit C is returned, even in
109						list context.
110
111						=cut
112
113						sub process {
114	46		46	1	2458	my ($self, $text) = @_;
115	46	100			143	return unless @_ > 1;
116	44	100			138	return undef unless defined $text;
117	42				111	utf8::upgrade($text);
118	42				50	foreach my $f (@{$self->{'filters'}}) {
	42				245
119	47				127	$text = $f->{'code'}->($text);
120						}
121	42				5954	return $text;
122						}
123
124						=head2 add
125
126						$lossy->add( 'lower', 'whitespace' );
127
128						This method takes a list of filter names and adds them to the filter list
129						of the filter object, in the order given. This allows a programmatic
130						selection of filters, for example via command line. Returns the object
131						for method chaining.
132
133						If the filter is unknown, an exception is thrown. This may happen when you
134						misspell the name, or forgot to use a module which registers the filter,
135						or forgot to register it yourself.
136
137						=cut
138
139						sub add {
140	23		23	1	145	my ($self, @filters) = @_;
141	23				54	foreach my $name (@filters) {
142	31				70	my $code = $filtermap{$name};
143	31	100			94	if (not $code) {
144	2				57	croak "Unknown filter $name (did you forget to use the right module?)";
145						}
146	29				41	push @{$self->{'filters'}}, { code => $code, name => $name };
	29				181
147						}
148	21				68	return $self;
149						}
150
151						=head2 clear
152
153						$lossy->clear();
154
155						Remove the filters from the filter object. The object will behave as
156						if newly constructed. Returns the object for method chaining.
157
158						=cut
159
160						sub clear {
161	3		3	1	9	my ($self) = @_;
162	3				7	@{$self->{'filters'}} = ();
	3				15
163	3				9	return $self;
164						}
165
166						=head2 list
167
168						my @names = $lossy->list();
169
170						List the filters added to this object, in order. The names (not the
171						code) are returned in a list.
172
173						=cut
174
175						sub list {
176	5		5	1	12	my ($self) = @_;
177	5				7	return map $_->{'name'}, @{$self->{'filters'}};
	5				51
178						}
179
180						=head2 as_coderef
181
182						my $code = $lossy->as_coderef();
183						$new_text = $code->( $old_text );
184
185						Returns a code reference that closes over the object. This code reference
186						acts like a bound L method on the constructed object. It
187						can be used in places like L that expect a code reference that
188						filters text.
189
190						The code reference is bound to the object, not a particular object state.
191						Adding filters to the object after calling C will also change
192						the behaviour of the code reference.
193
194						=cut
195
196						sub as_coderef {
197	0		0	1	0	my ($self) = @_;
198						return sub {
199	0		0		0	return $self->process(@_);
200						}
201	0				0	}
202
203						=head1 FILTERS
204
205						The following filters are defined by this module. Other modules may define
206						more filters.
207						Each of these filters can be added to the set via the L method.
208
209						=head2 lower
210
211						Corresponds exactly to the L builtin in Perl, up
212						to and including its Unicode handling.
213
214						=cut
215
216						sub lower {
217	14		14	1	18	my ($text) = @_;
218	14				157	return lc($text);
219						}
220
221						=head2 whitespace
222
223						Collapses any whitespace (C<\s> in regular expressions) to a single space, C.
224						Whitespace at the beginning of the text is stripped completely. Whitespace at the end
225						is also collapsed to a single space, to help separate lines. Text consisting only
226						of whitespace results in an empty string.
227
228						=cut
229
230						sub whitespace {
231	8		8	1	12	my ($text) = @_;
232	8				48	$text =~ s{ \s+ }{ }xmsg;
233						# the above line also works for the end of the text
234	8				26	$text =~ s{ \A \s+ }{}xms;
235	8				20	return $text;
236						}
237
238						=head2 whitespace_nl
239
240						A variant of the L filter that leaves newlines on the end of the text
241						alone. Other whitespace at the end will get collapsed into a single newline.
242						If the text ends in whitespace that does not contain a new line, it is replaced
243						by a space, as before.
244
245						This filter is most useful if you are creating a Unix-style text filter, and do not
246						want to buffer the entire input before writing the (only) line to C. The
247						newline at the end will allow downstream processes to work on new lines, too.
248						Otherwise, this filter is not quite as efficient as the L filter.
249
250						Any newlines in the middle of text are collapsed to a space, too. This is especially
251						useful if you are reading in "paragraph mode", e.g. C<$/ = ''>, as you will get
252						one long line per former paragraph.
253
254						=cut
255
256						sub whitespace_nl {
257	7		7	1	14	my ($text) = @_;
258						# Remember whether a newline was present
259	7	100			35	my $has_nl = ($text =~ m{ \n \s* \z }xms) ? 1 : 0;
260	7				55	$text =~ s{ \s+ }{ }xmsg;
261	7				24	$text =~ s{ \A \s+ }{}xms;
262						# whitespace-at-end is now a space
263	7	100			17	if ($has_nl) {
264						# replace this space with a newline
265	2				11	$text =~ s{ \s+ \z }{\n}xms;
266						}
267	7				26	return $text;
268						}
269
270						=head2 punctuation
271
272						Strips punctuation, that is anything matching C<\p{Punctuation}>. It is replaced by
273						nothing, removing it completely.
274
275						=cut
276
277						sub punctuation {
278	6		6	1	12	my ($text) = @_;
279						# Turns out '\p{Punctuation}' fails on Perl 5.6, use the abbreviation '\pP' instead
280	13		13		11223	$text =~ s{ \pP }{}xmsg;
	13				35
	13				211
	6				77
281	6				18	return $text;
282						}
283
284						=head2 punctuation_sp
285
286						A variant of L that replaces punctuation with a space character, C,
287						instead of removing it completely. This is usually less efficient for compression, but
288						retains more readability, for example in the presence of URLs or email addresses.
289
290						=cut
291
292						sub punctuation_sp {
293	3		3	1	5	my ($text) = @_;
294						# Turns out '\p{Punctuation}' fails on Perl 5.6, use the abbreviation '\pP' instead
295	3				25	$text =~ s{ \pP }{ }xmsg;
296	3				10	return $text;
297						}
298
299						=head2 alphabetize
300
301						Leaves the first and last letters of a word alone, but replaces the interior letters with
302						the same set, sorted by the L function. This is done on the observation
303						(source uncertain at the time) that words can still be made out if the letters are present, but
304						in a different order, as long as the outer ones remain the same.
305
306						This filter may not work as proposed with every language or writing system. Specifically, it
307						uses end-of-word matches C<\b> to determine which letters to leave alone.
308
309						=cut
310
311						sub alphabetize {
312	6		6	1	7	my ($text) = @_;
313	6				81	$text =~ s{ \b (\p{Alpha}) (\p{Alpha}+) (\p{Alpha}) \b }{ $1 . join('', sort split(//,$2)) . $3 }xmseg;
	6				52
314	6				23	return $text;
315						}
316
317						# TODO:
318						# - unidecode (separate module)
319						# - normalize (separate module)
320
321						=head1 CLASS METHODS
322
323						These methods are not called on a filter object, but on the class C
324						itself. They are typically concerned with the filters that can be added to filter
325						objects.
326
327						=head2 register_filters
328
329						Text::Lossy->register_filters(
330						change_stuff => \&Other::Module::change_text,
331						remove_ps => sub { my ($text) = @_; $text =~ s{[Pp]}{}; return $text; },
332						);
333
334						Adds one or more named filters to the set of available filters. Filters are
335						passed in an anonymous hash.
336						Previously defined mappings may be overwritten by this function.
337						Specifically, passing C as the code reference removes the filter.
338
339						=cut
340
341						%filtermap = (
342						'lower' => \&lower,
343						'whitespace' => \&whitespace,
344						'whitespace_nl' => \&whitespace_nl,
345						'punctuation' => \&punctuation,
346						'punctuation_sp' => \&punctuation_sp,
347						'alphabetize' => \&alphabetize,
348						);
349
350						sub register_filters {
351	7		7	1	1507	my ($class, %mapping) = @_;
352	7				101	foreach my $name (keys %mapping) {
353	11	100			121	if (defined $mapping{$name}) {
354	3				11	$filtermap{$name} = $mapping{$name};
355						} else {
356	8				30	delete $filtermap{$name};
357						}
358						}
359	7				29	return;
360						}
361
362						=head2 available_filters
363
364						my @filters = Text::Lossy->available_filters();
365
366						Lists the available filters at this point in time, specifically their names
367						as used by L and L. The list is sorted alphabetically.
368
369						=cut
370
371						sub available_filters {
372	4		4	1	1410	my ($class) = @_;
373	4				62	return sort keys %filtermap;
374						}
375
376						=head1 CREATING FILTERS
377
378						A filter is a subroutine which takes a single parameter (the text to be converted) and
379						returns the filtered text. The text may also be changed in-place, as long as it is
380						returned again.
381
382						These filters are then made available to the rest of the system via the
383						L function.
384
385						=head1 USAGE WITH Text::Filter
386
387						The L module provides an infrastructure for filtering text, but no actual filters.
388						It can be used with C by passing the result of L as the C
389						parameter.
390
391						It is recommended to set L to leave line endings alone when using the L
392						filter, i.e. the L and
393						L should be C<0>. This is the default
394						for L. It will allow L to perform its assigned task on line endings.
395
396						One thing to note is that the C filters do not follow the L's convention
397						that lines "to be skipped" should result in an C.
398						This means you need to expect completely empty lines (C, not even a newline character) in
399						your output.
400						This should be no problem if you print to a file handle or append to a string, but may be surprising
401						if you are filtering an array of lines.
402
403						=head1 EXPORT
404
405						Nothing exported or exportable; use the OO interface instead.
406
407						=head1 UNICODE
408
409						This code strives to be completely Unicode compatible. All filters aim to "do the right thing" on non-ASCII strings.
410						Any failure to handle Unicode should be considered a bug; please report it.
411
412						=head1 AUTHOR
413
414						Ben Deutsch, C<< >>
415
416						=head1 BUGS
417
418						None known so far.
419
420						Please report any bugs or feature requests to C, or through
421						the web interface at L. I will be notified, and then you'll
422						automatically be notified of progress on your bug as I make changes.
423
424						=head1 SUPPORT
425
426						You can find documentation for this module with the perldoc command.
427
428						perldoc Text::Lossy
429
430
431						You can also look for information at:
432
433						=over 4
434
435						=item * RT: CPAN's request tracker (report bugs here)
436
437						L
438
439						=item * AnnoCPAN: Annotated CPAN documentation
440
441						L
442
443						=item * CPAN Ratings
444
445						L
446
447						=item * Search CPAN
448
449						L
450
451						=back
452
453
454						=head1 ACKNOWLEDGEMENTS
455
456
457						=head1 LICENSE AND COPYRIGHT
458
459						Copyright 2012 Ben Deutsch.
460
461						This program is free software; you can redistribute it and/or modify it
462						under the terms of either: the GNU General Public License as published
463						by the Free Software Foundation; or the Artistic License.
464
465						See http://dev.perl.org/licenses/ for more information.
466
467
468						=cut
469
470						1; # End of Text::Lossy