File Coverage

blib/lib/Text/CleanFragment.pm

Criterion	Covered	Total	%
statement	22	22	100.0
branch	2	2	100.0
condition			n/a
subroutine	5	5	100.0
pod	1	1	100.0
total	30	30	100.0

line	stmt	bran	sub	pod	time	code
1						package Text::CleanFragment;
2	2		2		76526	use strict;
	2				14
	2				66
3	2		2		11	use Exporter qw'import';
	2				6
	2				56
4	2		2		1052	use Text::Unidecode;
	2				4871
	2				261
5
6						our $VERSION = '0.06';
7						our @EXPORT = (qw(clean_fragment));
8
9						=head1 NAME
10
11						=encoding utf8
12
13						Text::CleanFragment - clean up text to use as URL fragment or filename
14
15						=head1 SYNOPSIS
16
17						my $title = "Do p\x{00FC}t into/URL's?";
18						my $id = 42;
19						my $url = join "/",
20						$id,
21						clean_fragment( $title );
22						# 42/Do_put_this_into_URLs
23
24						=head1 DESCRIPTION
25
26						This module downgrades strings of text to match
27
28						/^[-._A-Za-z0-9]*$/
29
30						or, to be more exact
31
32						/^([-.A-Za-z0-9]([-._A-Za-z0-9]*[-.A-Za-z0-9])?)?$/
33
34						This makes the return values safe to be used as URL fragments
35						or as file names on many file systems where whitespace
36						and characters outside of the Latin alphabet are undesired
37						or problematic.
38
39						=head1 FUNCTIONS
40
41						=head2 C<< clean_fragment( @fragments ) >>
42
43						my $url_title = join("_", clean_fragment("Ümloud vs. ß",'by',"Grégory"));
44						# Umloud_vs._ss_by_Gregory
45
46						Returns a cleaned up list of elements. The input elements
47						are expected to be encoded as Unicode strings. Decode them using
48						L if you read the fragments as file names from the filesystem.
49
50						The operations performed are:
51
52						=over 4
53
54						=item *
55
56						Use L to downgrade the text from Unicode to 7-bit ASCII.
57
58						=item *
59
60						Eliminate single and double quotes, apostrophes.
61
62						=item *
63
64						Replace all non-letters, non-digits by underscores, including whitespace
65						and control characters.
66
67						=item *
68
69						Squash dashes to a single dash
70
71						=item *
72
73						Squash C<_-_> and C<_-_(-_)+> to -
74
75						=item *
76
77						Eliminate leading underscores
78
79						=item *
80
81						Eliminate trailing underscores
82
83						=item *
84
85						Eliminate underscores before - or .
86
87						=back
88
89						In scalar context, returns the first element of the cleaned up list.
90
91						=cut
92
93						sub clean_fragment {
94						# make uri-sane filenames
95						# We assume Unicode on input.
96
97						# First, downgrade to ASCII chars (or transliterate if possible)
98	67		67	1	70234	@_ = unidecode(@_);
99
100	67				6829	for( @_ ) {
101	2		2		13	tr/['"\x{2019}]//d; # Eliminate apostrophes
	2				4
	2				26
	68				267
102	68				422	s/[^a-zA-Z0-9.-]+/_/g; # Replace all non-ascii by underscores, including whitespace
103	68				191	s/-+/-/g; # Squash dashes
104	68				160	s/_(?:-_)+/-/g; # Squash _-_ and _-_-_ to -
105	68				163	s/^[-_]+//; # Eliminate leading underscores
106	68				223	s/[-_]+$//; # Eliminate trailing underscores
107	68				257	s/_(\W)/$1/; # No underscore before - or .
108						};
109	67	100			313	wantarray ? @_ : $_[0];
110						};
111
112						1;
113
114						__END__