File Coverage

blib/lib/Text/CleanFragment.pm
Criterion Covered Total %
statement 30 30 100.0
branch 3 4 75.0
condition n/a
subroutine 5 5 100.0
pod 2 2 100.0
total 40 41 97.5


line stmt bran cond sub pod time code
1             package Text::CleanFragment;
2 3     3   421444 use strict;
  3         6  
  3         133  
3 3     3   18 use Exporter qw'import';
  3         8  
  3         113  
4 3     3   1880 use Text::Unidecode;
  3         7604  
  3         2296  
5              
6             our $VERSION = '0.08';
7             our @EXPORT = (qw(clean_fragment clean_fragment_filename));
8              
9             =head1 NAME
10              
11             =encoding utf8
12              
13             Text::CleanFragment - clean up text to use as URL fragment or filename
14              
15             =head1 SYNOPSIS
16              
17             my $title = "Do p\x{00FC}t into/URL's?";
18             my $id = 42;
19             my $url = join "/",
20             $id,
21             clean_fragment( $title );
22             # 42/Do_put_this_into_URLs
23              
24             =head1 DESCRIPTION
25              
26             This module downgrades strings of text to match
27              
28             /^[-._A-Za-z0-9]*$/
29              
30             or, to be more exact
31              
32             /^([-.A-Za-z0-9]([-._A-Za-z0-9]*[-.A-Za-z0-9])?)?$/
33              
34             This makes the return values safe to be used as URL fragments
35             or as file names on many file systems where whitespace
36             and characters outside of the Latin alphabet are undesired
37             or problematic.
38              
39             =head1 FUNCTIONS
40              
41             =head2 C<< clean_fragment( @fragments ) >>
42              
43             my $url_title = join("_", clean_fragment("Ümloud vs. ß",'by',"Grégory"));
44             # Umloud_vs._ss_by_Gregory
45              
46             Returns a cleaned up list of elements. The input elements
47             are expected to be encoded as Unicode strings. Decode them using
48             L if you read the fragments as file names from the filesystem.
49              
50             The operations performed are:
51              
52             =over 4
53              
54             =item *
55              
56             Use L to downgrade the text from Unicode to 7-bit ASCII.
57              
58             =item *
59              
60             Eliminate single and double quotes, backquotes and apostrophes.
61              
62             =item *
63              
64             Replace all non-letters, non-digits by underscores, including whitespace
65             and control characters.
66              
67             =item *
68              
69             Squash dashes to a single dash
70              
71             =item *
72              
73             Squash C<_-_> and C<_-_(-_)+> to -
74              
75             =item *
76              
77             Eliminate leading underscores
78              
79             =item *
80              
81             Eliminate trailing underscores
82              
83             =item *
84              
85             Eliminate underscores before - or .
86              
87             =back
88              
89             In scalar context, returns the first element of the cleaned up list.
90              
91             =cut
92              
93             sub clean_fragment {
94             # make uri-sane filenames
95             # We assume Unicode on input.
96              
97             # First, downgrade to ASCII chars (or transliterate if possible)
98 70     70 1 333805 @_ = unidecode(@_);
99              
100 70         13154 for( @_ ) {
101 72         258 tr/'"\x{2019}`´//d; # Eliminate apostrophes and backquotes
102 72         537 s/[^a-zA-Z0-9.-]+/_/g; # Replace all non-ascii by underscores, including whitespace
103 72         233 s/-+/-/g; # Squash dashes
104 72         389 s/_+/_/g; # Squash underscores
105 72         217 s/_(?:-_)+/-/g; # Squash _-_ and _-_-_ to -
106 72         183 s/^[-_]+//; # Eliminate leading underscores
107 72         294 s/[-_]+$//; # Eliminate trailing underscores
108 72         296 s/_(\W)/$1/; # No underscore before - or .
109             };
110 70 100       405 wantarray ? @_ : $_[0];
111             };
112              
113             =head2 C<< clean_fragment_filename( @fragments ) >>
114              
115             my @parts = clean_fragment_filename( @fragments );
116              
117             Does the same as C but only removes the following characters,
118             making the output safe for Unicode-capable filesystems:
119              
120             \x{00}-\x{1f}
121             / \ * < > : | ?
122             ' " ` ´
123             \x{2019}
124              
125             This does not necessarily make the filename safe for blind use in shell
126             commands, as for example C<;> and C remain in the filenames.
127              
128             =cut
129              
130             sub clean_fragment_filename {
131 68     68 1 283381 for( @_ ) {
132 68         251 tr/'"\x{2019}`´//d; # Eliminate apostrophes and backquotes
133 68         477 s/[\s\x00-\x1f\/\\\*<>:|\?]+/_/g; # Replace all non-ascii by underscores, including whitespace
134 68         245 s/-+/-/g; # Squash dashes
135 68         537 s/_+/_/g; # Squash underscores
136 68         461 s/_(?:(\p{IsPunctuation})_)+/$1/g; # Squash _-_ and _-_-_ to -
137 68         222 s/^[-_]+//; # Eliminate leading underscores
138 68         405 s/[-_]+$//; # Eliminate trailing underscores
139 68         258 s/_(\W)/$1/; # No underscore before - or .
140             };
141 68 50       386 wantarray ? @_ : $_[0];
142             }
143              
144             1;
145              
146             __END__