File Coverage

blib/lib/App/Greple/xlate/Text.pm

Criterion	Covered	Total	%
statement	83	83	100.0
branch	23	28	82.1
condition	1	3	33.3
subroutine	16	16	100.0
pod	4	8	50.0
total	127	138	92.0

line	stmt	bran	cond	sub	pod	time	code
1							package App::Greple::xlate::Text;
2
3							=encoding utf-8
4
5							=head1 NAME
6
7							App::Greple::xlate::Text - text normalization interface
8
9							=head1 SYNOPSIS
10
11							my $obj = App::Greple::xlate::Text->new($text, paragraph => 1);
12							my $normalized = $obj->normalized;
13
14							$result = process($normalized);
15
16							$obj->unstrip($result);
17
18							=head1 DESCRIPTION
19
20							This is an interface used within L to normalize
21							text.
22
23							To get the normalized text, use the C method.
24
25							During normalization process, any whitespace at the beginning and the
26							end of the line is removed. Therefore, the result of processing the
27							normalized text does not preserve the whitespace in the original
28							string; the C method can be used to restore the removed
29							whitespace.
30
31							=head1 METHODS
32
33							=over 7
34
35							=item B
36
37							Creates an object. The first parameter is the original string; the
38							second and subsequent parameters are pairs of attribute name and values.
39
40							=over 4
41
42							=item B
43
44							Specifies whether or not the text should be treated as a paragraph.
45
46							If true, multiple lines are concatenated into a single line.
47
48							If false, multiple strings are processed as they are.
49
50							In both cases, leading and trailing whitespace is stripped from each
51							line.
52
53							=back
54
55							=item B()
56
57							Returns a normalized string.
58
59							=item B(I<$text>)
60
61							Recover removed white spaces from normalized text or corresponding
62							cooked text.
63
64							If not in paragraph mode, the string to be processed must have the
65							same number of lines as the original string.
66
67							=item B
68
69							Retrieve original text.
70
71							=back
72
73							=cut
74
75	12			12		127686	use v5.14;
	12					95
76	12			12		74	use warnings;
	12					25
	12					785
77	12			12		72	use utf8;
	12					36
	12					93
78
79	12			12		460	use Data::Dumper;
	12					31
	12					921
80	12			12		670	use Unicode::EastAsianWidth;
	12					1602
	12					1256
81	12			12		600	use Hash::Util qw(lock_keys);
	12					3789
	12					87
82
83							sub new {
84	68			68	1	226699	my $class = shift;
85	68					280	my $obj = bless {
86							ATTR => {},
87							TEXT => undef,
88							STRIPPED => undef,
89							NORMALIZED => undef,
90							UNSTRIP => undef,
91							}, $class;
92	68					128	lock_keys %{$obj};
	68					274
93	68					746	$obj->text = shift;
94	68					91	%{$obj->{ATTR}} = (%{$obj->{ATTR}}, @_);
	68					183
	68					145
95	68					239	$obj;
96							}
97
98	212			212	1	497	sub text :lvalue { +shift->{TEXT} }
99	106			106	0	233	sub attr :lvalue { +shift->{ATTR} }
100
101							sub normalize {
102	68			68	0	103	my $obj = shift;
103	68					145	my $paragraph = $obj->attr->{paragraph};
104	68					167	local $_ = $obj->text;
105	68					78	my $normalized = do {
106	68	100				162	if (not $paragraph) {
107	8					61	s{^.+}{
108	14					38	local $_ = ${^MATCH};
109	14					84	s/\A(\h)(.?)(\h*?)\z/$2/;
110	14					75	$_;
111							}pmger;
112							} else {
113	60					277	s{^.+(?:\n.+)*}{
114	60					139	local $_ = ${^MATCH};
115							# remove leading/trailing spaces
116	60					434	s/\A(\h)(.?)(\h*?)\z/$2/;
117							# remove newline after Japanese Punct char
118	60					112	s/(?<=\p{InFullwidth})(?<=\pP)\n//g;
119							# join Japanese lines without space
120	60					82	s/(?<=\p{InFullwidth})\n(?=\p{InFullwidth})//g;
121							# join ASCII lines with single space
122	60					2038	s/\s+/ /g;
123	60					1920	$_;
124							}pmger;
125							}
126							};
127	68					2149	return $normalized;
128							}
129
130							sub normalized {
131	68			68	1	118	my $obj = shift;
132	68		33			250	$obj->{NORMALIZED} //= $obj->normalize;
133							}
134
135							sub strip {
136	38			38	0	53	my $obj = shift;
137	38					87	my $text = $obj->text;
138	38	100				75	if ($obj->attr->{paragraph}) {
139	30					85	return $obj->paragraph_strip;
140							}
141	8					37	my $line_re = qr/.*\n\|.+\z/;
142	8					91	my @text = $text =~ /$line_re/g;
143							my @space = map {
144	8	100				23	[ s/\A(\s+)// ? $1 : '', s/(\h+)$// ? $1 : '' ]
	14	100				142
145							} @text;
146	8					28	$obj->{STRIPPED} = join '', @text;
147							$obj->{UNSTRIP} = sub {
148	8			8		19	for (@_) {
149	8					45	my @text = /.*\n\|.+\z/g;
150	8	100				28	if (@space == @text + 1) {
151	2					4	push @text, '';
152							}
153	8	50				24	die "UNMATCH:\n".Dumper(\@text, \@space) if @text != @space;
154	8					26	for my $i (keys @text) {
155	14					23	my($head, $tail) = @{$space[$i]};
	14					44
156	14	100				58	$text[$i] =~ s/\A/$head/ if length $head > 0;
157	14	100				51	$text[$i] =~ s/\Z/$tail/ if length $tail > 0;
158							}
159	8					38	$_ = join '', @text;
160							}
161	8					57	};
162	8					29	$obj;
163							}
164
165							sub paragraph_strip {
166	30			30	0	38	my $obj = shift;
167	30					46	local *_ = \($obj->{STRIPPED} = $obj->text);
168	30	100				103	my $head = s/\A(\s+)// ? $1 : '' ;
169	30	50				95	my $tail = s/(\h+)$// ? $1 : '' ;
170							$obj->{UNSTRIP} = sub {
171	30			30		66	for (@_) {
172	30	100				122	s/\A/$head/ if length $head;
173	30	50				68	s/\Z/$tail/ if length $tail;
174							}
175	30					118	};
176	30					65	$obj;
177							}
178
179							sub unstrip {
180	38			38	1	8087	my $obj = shift;
181	38	50				143	$obj->strip if not $obj->{UNSTRIP};
182	38	50				94	if (my $unstrip = $obj->{UNSTRIP}) {
183	38					96	$unstrip->(@_);
184							}
185	38					89	$obj;
186							}
187
188							1;