File Coverage

blib/lib/Text/Transliterator/Unaccent.pm

Criterion	Covered	Total	%
statement	38	59	64.4
branch	6	18	33.3
condition	6	12	50.0
subroutine	7	14	50.0
pod	3	3	100.0
total	60	106	56.6

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Transliterator::Unaccent;
2
3	2			2		70981	use warnings;
	2					11
	2					66
4	2			2		11	use strict;
	2					4
	2					68
5
6							our $VERSION = "1.04";
7
8	2			2		411	use Text::Transliterator;
	2					5
	2					68
9	2			2		2215	use Unicode::UCD qw(charinfo charscript charblock);
	2					109791
	2					182
10	2			2		22	use Unicode::Normalize qw();
	2					4
	2					543
11
12							sub char_map {
13	1			1	1	2	my $class = shift;
14
15	1					3	my @all_ranges;
16	1					2	my $ignore_wide = 0;
17	1					2	my $ignore_upper = 0;
18	1					2	my $ignore_lower = 0;
19
20							# decode arguments to get character ranges and boolean flags
21	1					7	while (my ($kind, $arg) = splice(@_, 0, 2)) {
22	0					0	my $ranges;
23
24							my $todo = {
25	0	0		0		0	script => sub { $ranges = charscript($arg)
26							or die "$arg is not a valid Unicode script" },
27	0	0		0		0	block => sub { $ranges = charblock($arg)
28							or die "$arg is not a valid Unicode block" },
29	0			0		0	ranges => sub { $ranges = $arg },
30	0			0		0	wide => sub { $ignore_wide = !$arg },
31	0			0		0	upper => sub { $ignore_upper = !$arg },
32	0			0		0	lower => sub { $ignore_lower = !$arg },
33	0					0	};
34	0	0				0	my $coderef = $todo->{$kind}
35							or die "invalid argument: $kind";
36	0					0	$coderef->();
37	0	0				0	push @all_ranges, @$ranges if $ranges;
38							}
39
40							# default
41	1	50				3	@all_ranges = @{charscript('Latin')} if !@all_ranges;
	1					6
42
43							# build the map
44	1					17813	my %map;
45	1					4	foreach my $range (@all_ranges) {
46	31					62	my ($start, $end) = @$range;
47
48							# iterate over characters in range
49							CHAR:
50	31					54	for my $c ($start .. $end) {
51
52							# maybe drop that char under some conditions
53	1350	50	33			2332	last CHAR if $ignore_wide and $c > 255;
54	1350	50	33			2297	next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/;
55	1350	50	33			2275	next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/;
56
57							# get canonical decomposition (if any)
58	1350					2225	my $canon = Unicode::Normalize::getCanon($c);
59
60							# store into map
61	1350	100	100			3058	if ($canon && length($canon) > 1) {
62							# the unaccented char is the the base (first char) of the decomposition
63	498					858	my $base = substr $canon, 0, 1;
64	498					1431	$map{chr($c)} = $base,
65							}
66							}
67							}
68
69	1					12	return \%map;
70							}
71
72							sub char_map_descr {
73	0			0	1	0	my $class = shift;
74
75	0					0	my $map = $class->char_map(@_);
76
77	0					0	my $txt = "";
78							# while (my ($k, $v) = each %$map) {
79	0					0	foreach my $k (sort {$a cmp $b} keys %$map) {
	0					0
80	0					0	my $v = $map->{$k};
81	0					0	my $accented = ord($k);
82	0					0	my $base = ord($v);
83							$txt .= sprintf "U+%04x %-55s => U+%04x %s\n",
84							$accented,
85							charinfo($accented)->{name},
86							$base,
87	0					0	charinfo($base)->{name};
88							}
89	0					0	return $txt;
90							}
91
92							sub new {
93	1			1	1	888	my $class = shift;
94	1					3	my $map = $class->char_map(@_);
95	1					12	return Text::Transliterator->new($map)
96							}
97
98							1; # End of Text::Transliterator::Unaccent
99
100
101							__END__