line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Unicode::Diacritic::Strip; |
2
|
5
|
|
|
5
|
|
347704
|
use warnings; |
|
5
|
|
|
|
|
54
|
|
|
5
|
|
|
|
|
215
|
|
3
|
5
|
|
|
5
|
|
33
|
use strict; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
104
|
|
4
|
5
|
|
|
5
|
|
1904
|
use utf8; |
|
5
|
|
|
|
|
50
|
|
|
5
|
|
|
|
|
37
|
|
5
|
|
|
|
|
|
|
require Exporter; |
6
|
5
|
|
|
5
|
|
213
|
use base qw(Exporter); |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
1049
|
|
7
|
|
|
|
|
|
|
our @EXPORT_OK = qw/strip_diacritics strip_alphabet fast_strip/; |
8
|
|
|
|
|
|
|
our %EXPORT_TAGS = (all => \@EXPORT_OK); |
9
|
|
|
|
|
|
|
our $VERSION = '0.11'; |
10
|
5
|
|
|
5
|
|
5068
|
use Unicode::UCD 'charinfo'; |
|
5
|
|
|
|
|
264536
|
|
|
5
|
|
|
|
|
430
|
|
11
|
5
|
|
|
5
|
|
2742
|
use Encode 'decode_utf8'; |
|
5
|
|
|
|
|
49012
|
|
|
5
|
|
|
|
|
4451
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub strip_diacritics |
14
|
|
|
|
|
|
|
{ |
15
|
3
|
|
|
3
|
1
|
2421
|
my ($diacritics_text) = @_; |
16
|
3
|
100
|
|
|
|
26
|
if ($diacritics_text !~ /[^\x{01}-\x{80}]/) { |
17
|
|
|
|
|
|
|
# All the characters in this text are ASCII, and so there are |
18
|
|
|
|
|
|
|
# no diacritics. |
19
|
1
|
|
|
|
|
3
|
return $diacritics_text; |
20
|
|
|
|
|
|
|
} |
21
|
2
|
|
|
|
|
22
|
my @characters = split //, $diacritics_text; |
22
|
2
|
|
|
|
|
8
|
for my $character (@characters) { |
23
|
|
|
|
|
|
|
# Leave non-word characters unaltered. |
24
|
34
|
100
|
|
|
|
161
|
if ($character =~ /\W/) { |
25
|
1
|
|
|
|
|
3
|
next; |
26
|
|
|
|
|
|
|
} |
27
|
33
|
|
|
|
|
84
|
my $decomposed = decompose ($character); |
28
|
33
|
100
|
|
|
|
139
|
if ($character ne $decomposed) { |
29
|
28
|
|
|
|
|
88
|
$character = $decomposed; |
30
|
|
|
|
|
|
|
} |
31
|
|
|
|
|
|
|
} |
32
|
2
|
|
|
|
|
16
|
my $stripped_text = join '', @characters; |
33
|
2
|
|
|
|
|
17
|
return $stripped_text; |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub decompose |
37
|
|
|
|
|
|
|
{ |
38
|
109
|
|
|
109
|
0
|
223
|
my ($character) = @_; |
39
|
|
|
|
|
|
|
# Get the Unicode::UCD decomposition. |
40
|
109
|
|
|
|
|
305
|
my $charinfo = charinfo (ord $character); |
41
|
109
|
|
|
|
|
855156
|
my $decomposition = $charinfo->{decomposition}; |
42
|
|
|
|
|
|
|
# Give up if there is no decomposition for $character |
43
|
109
|
100
|
|
|
|
294
|
if (! $decomposition) { |
44
|
71
|
|
|
|
|
413
|
return $character; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
# Get the first character of the decomposition |
47
|
38
|
|
|
|
|
215
|
my @decomposition_chars = split /\s+/, $decomposition; |
48
|
38
|
|
|
|
|
100
|
$character = chr hex $decomposition_chars[0]; |
49
|
|
|
|
|
|
|
# A character may have multiple decompositions, so repeat this |
50
|
|
|
|
|
|
|
# process until there are none left. |
51
|
38
|
|
|
|
|
130
|
return decompose ($character); |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
sub strip_alphabet |
55
|
|
|
|
|
|
|
{ |
56
|
1
|
|
|
1
|
1
|
1874
|
my ($diacritics_text, %options) = @_; |
57
|
1
|
|
|
|
|
4
|
my %swaps; |
58
|
1
|
50
|
33
|
|
|
13
|
if (! defined $diacritics_text || length ($diacritics_text) == 0) { |
59
|
0
|
|
|
|
|
0
|
return ($diacritics_text, {}); |
60
|
|
|
|
|
|
|
} |
61
|
1
|
|
|
|
|
59
|
my @characters = split //, $diacritics_text; |
62
|
1
|
|
|
|
|
5
|
my %alphabet; |
63
|
1
|
|
|
|
|
4
|
for my $c (@characters) { |
64
|
295
|
|
|
|
|
515
|
$alphabet{$c} = 1; |
65
|
|
|
|
|
|
|
} |
66
|
1
|
|
|
|
|
18
|
my @c = keys %alphabet; |
67
|
|
|
|
|
|
|
|
68
|
1
|
|
|
|
|
5
|
for my $character (@c) { |
69
|
|
|
|
|
|
|
# Reject non-word characters |
70
|
44
|
100
|
|
|
|
173
|
if ($character !~ /\w/) { |
71
|
6
|
50
|
|
|
|
14
|
if ($options{verbose}) { |
72
|
0
|
|
|
|
|
0
|
print "Not altering non-word character '$character'.\n"; |
73
|
|
|
|
|
|
|
} |
74
|
6
|
|
|
|
|
14
|
next; |
75
|
|
|
|
|
|
|
} |
76
|
38
|
|
|
|
|
113
|
my $decomposed = decompose ($character, %options); |
77
|
38
|
100
|
|
|
|
118
|
if ($character ne $decomposed) { |
78
|
8
|
|
|
|
|
20
|
my $boo = "$decomposed baba"; |
79
|
8
|
|
|
|
|
25
|
$swaps{$character} = $boo; |
80
|
8
|
|
|
|
|
45
|
$swaps{$character} =~ s/ baba$//; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# Make the version of the text with all the diacritics removed. |
85
|
|
|
|
|
|
|
|
86
|
1
|
|
|
|
|
3
|
my $stripped_text = $diacritics_text; |
87
|
1
|
|
|
|
|
7
|
for my $k (keys %swaps) { |
88
|
8
|
50
|
|
|
|
23
|
if ($options{verbose}) { |
89
|
0
|
|
|
|
|
0
|
printf "Swapping $k for $swaps{$k} (%X).\n", ord ($swaps{$k}); |
90
|
|
|
|
|
|
|
} |
91
|
8
|
|
|
|
|
103
|
$stripped_text =~ s/$k/$swaps{$k}/g; |
92
|
|
|
|
|
|
|
} |
93
|
1
|
|
|
|
|
59
|
return ($stripped_text, \%swaps); |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub fast_strip |
97
|
|
|
|
|
|
|
{ |
98
|
250
|
|
|
250
|
1
|
144192
|
my ($word) = @_; |
99
|
|
|
|
|
|
|
# Expand ligatures. |
100
|
250
|
|
|
|
|
734
|
$word =~ s/œ/oe/g; |
101
|
|
|
|
|
|
|
# Thorn is "th". |
102
|
250
|
|
|
|
|
834
|
$word =~ s/Þ|þ/th/g; |
103
|
|
|
|
|
|
|
# Remove all diacritics |
104
|
250
|
|
|
|
|
1157
|
$word =~ tr/ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĨĩĪīĬĭĮįİĴĵĶķĹĺĻļĽľŁłŃńŅņŇňŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžƠơƯưǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǞǟǠǡǦǧǨǩǪǫǬǭǰǴǵǸǹǺǻȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȞȟȦȧȨȩȪȫȬȭȮȯȰȱȲȳøØḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗṘṙṚṛṜṝṞṟṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕẖẗẘẙẚẛẜẝẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ/AAAAAACEEEEIIIINOOOOOUUUUYaaaaaaceeeeiiiinooooouuuuyyAaAaAaCcCcCcCcDdEeEeEeEeEeGgGgGgGgHhIiIiIiIiIJjKkLlLlLlLlNnNnNnOoOoOoRrRrRrSsSsSsSsTtTtUuUuUuUuUuUuWwYyYZzZzZzOoUuAaIiOoUuUuUuUuUuAaAaGgKkOoOojGgNnAaAaAaEeEeIiIiOoOoRrRrUuUuSsTtHhAaEeOoOoOoOoYyoOAaBbBbBbCcDdDdDdDdDdEeEeEeEeEeFfGgHhHhHhHhHhIiIiKkKkKkLlLlLlLlMmMmMmNnNnNnNnOoOoOoOoPpPpRrRrRrRrSsSsSsSsSsTtTtTtTtUuUuUuUuUuVvVvWwWwWwWwWwXxXxYyZzZzZzhtwyafffAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy/; |
105
|
250
|
|
|
|
|
611
|
return $word; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
1; |