File Coverage

blib/lib/Text/CleanFragment.pm

Criterion	Covered	Total	%
statement	30	30	100.0
branch	3	4	75.0
condition			n/a
subroutine	5	5	100.0
pod	2	2	100.0
total	40	41	97.5

line	stmt	bran	sub	pod	time	code
1						package Text::CleanFragment;
2	3		3		421444	use strict;
	3				6
	3				133
3	3		3		18	use Exporter qw'import';
	3				8
	3				113
4	3		3		1880	use Text::Unidecode;
	3				7604
	3				2296
5
6						our $VERSION = '0.08';
7						our @EXPORT = (qw(clean_fragment clean_fragment_filename));
8
9						=head1 NAME
10
11						=encoding utf8
12
13						Text::CleanFragment - clean up text to use as URL fragment or filename
14
15						=head1 SYNOPSIS
16
17						my $title = "Do p\x{00FC}t into/URL's?";
18						my $id = 42;
19						my $url = join "/",
20						$id,
21						clean_fragment( $title );
22						# 42/Do_put_this_into_URLs
23
24						=head1 DESCRIPTION
25
26						This module downgrades strings of text to match
27
28						/^[-._A-Za-z0-9]*$/
29
30						or, to be more exact
31
32						/^([-.A-Za-z0-9]([-._A-Za-z0-9]*[-.A-Za-z0-9])?)?$/
33
34						This makes the return values safe to be used as URL fragments
35						or as file names on many file systems where whitespace
36						and characters outside of the Latin alphabet are undesired
37						or problematic.
38
39						=head1 FUNCTIONS
40
41						=head2 C<< clean_fragment( @fragments ) >>
42
43						my $url_title = join("_", clean_fragment("Ümloud vs. ß",'by',"Grégory"));
44						# Umloud_vs._ss_by_Gregory
45
46						Returns a cleaned up list of elements. The input elements
47						are expected to be encoded as Unicode strings. Decode them using
48						L if you read the fragments as file names from the filesystem.
49
50						The operations performed are:
51
52						=over 4
53
54						=item *
55
56						Use L to downgrade the text from Unicode to 7-bit ASCII.
57
58						=item *
59
60						Eliminate single and double quotes, backquotes and apostrophes.
61
62						=item *
63
64						Replace all non-letters, non-digits by underscores, including whitespace
65						and control characters.
66
67						=item *
68
69						Squash dashes to a single dash
70
71						=item *
72
73						Squash C<_-_> and C<_-_(-_)+> to -
74
75						=item *
76
77						Eliminate leading underscores
78
79						=item *
80
81						Eliminate trailing underscores
82
83						=item *
84
85						Eliminate underscores before - or .
86
87						=back
88
89						In scalar context, returns the first element of the cleaned up list.
90
91						=cut
92
93						sub clean_fragment {
94						# make uri-sane filenames
95						# We assume Unicode on input.
96
97						# First, downgrade to ASCII chars (or transliterate if possible)
98	70		70	1	333805	@_ = unidecode(@_);
99
100	70				13154	for( @_ ) {
101	72				258	tr/'"\x{2019}`´//d; # Eliminate apostrophes and backquotes
102	72				537	s/[^a-zA-Z0-9.-]+/_/g; # Replace all non-ascii by underscores, including whitespace
103	72				233	s/-+/-/g; # Squash dashes
104	72				389	s/_+/_/g; # Squash underscores
105	72				217	s/_(?:-_)+/-/g; # Squash _-_ and _-_-_ to -
106	72				183	s/^[-_]+//; # Eliminate leading underscores
107	72				294	s/[-_]+$//; # Eliminate trailing underscores
108	72				296	s/_(\W)/$1/; # No underscore before - or .
109						};
110	70	100			405	wantarray ? @_ : $_[0];
111						};
112
113						=head2 C<< clean_fragment_filename( @fragments ) >>
114
115						my @parts = clean_fragment_filename( @fragments );
116
117						Does the same as C but only removes the following characters,
118						making the output safe for Unicode-capable filesystems:
119
120						\x{00}-\x{1f}
121						/ \ * < > : \| ?
122						' " ` ´
123						\x{2019}
124
125						This does not necessarily make the filename safe for blind use in shell
126						commands, as for example C<;> and C remain in the filenames.
127
128						=cut
129
130						sub clean_fragment_filename {
131	68		68	1	283381	for( @_ ) {
132	68				251	tr/'"\x{2019}`´//d; # Eliminate apostrophes and backquotes
133	68				477	s/[\s\x00-\x1f\/\\\*<>:\|\?]+/_/g; # Replace all non-ascii by underscores, including whitespace
134	68				245	s/-+/-/g; # Squash dashes
135	68				537	s/_+/_/g; # Squash underscores
136	68				461	s/_(?:(\p{IsPunctuation})_)+/$1/g; # Squash _-_ and _-_-_ to -
137	68				222	s/^[-_]+//; # Eliminate leading underscores
138	68				405	s/[-_]+$//; # Eliminate trailing underscores
139	68				258	s/_(\W)/$1/; # No underscore before - or .
140						};
141	68	50			386	wantarray ? @_ : $_[0];
142						}
143
144						1;
145
146						__END__