File Coverage

blib/lib/App/Greple/xlate/Text.pm

Criterion	Covered	Total	%
statement	83	83	100.0
branch	23	28	82.1
condition	1	3	33.3
subroutine	16	16	100.0
pod	4	8	50.0
total	127	138	92.0

line	stmt	bran	cond	sub	pod	time	code
1							package App::Greple::xlate::Text;
2
3							=encoding utf-8
4
5							=head1 NAME
6
7							App::Greple::xlate::Text - text normalization interface
8
9							=head1 SYNOPSIS
10
11							my $obj = App::Greple::xlate::Text->new($text, paragraph => 1);
12							my $normalized = $obj->normalized;
13
14							$result = process($normalized);
15
16							$obj->unstrip($result);
17
18							=head1 DESCRIPTION
19
20							This is an interface used within L to normalize
21							text.
22
23							To get the normalized text, use the C method.
24
25							During normalization process, any whitespace at the beginning and the
26							end of the line is removed. Therefore, the result of processing the
27							normalized text does not preserve the whitespace in the original
28							string; the C method can be used to restore the removed
29							whitespace.
30
31							=head1 METHODS
32
33							=over 7
34
35							=item B
36
37							Creates an object. The first parameter is the original string; the
38							second and subsequent parameters are pairs of attribute name and values.
39
40							=over 4
41
42							=item B
43
44							Specifies whether or not the text should be treated as a paragraph.
45
46							If true, multiple lines are concatenated into a single line.
47
48							If false, multiple strings are processed as they are.
49
50							In both cases, leading and trailing whitespace is stripped from each
51							line.
52
53							=back
54
55							=item B()
56
57							Returns a normalized string.
58
59							=item B(I<$text>)
60
61							Recover removed white spaces from normalized text or corresponding
62							cooked text.
63
64							If not in paragraph mode, the string to be processed must have the
65							same number of lines as the original string.
66
67							=item B
68
69							Retrieve original text.
70
71							=back
72
73							=cut
74
75	12			12		141248	use v5.14;
	12					60
76	12			12		89	use warnings;
	12					21
	12					787
77	12			12		64	use utf8;
	12					21
	12					84
78
79	12			12		459	use Data::Dumper;
	12					21
	12					864
80	12			12		486	use Unicode::EastAsianWidth;
	12					1232
	12					1029
81	12			12		450	use Hash::Util qw(lock_keys);
	12					2926
	12					104
82
83							sub new {
84	66			66	1	162890	my $class = shift;
85	66					345	my $obj = bless {
86							ATTR => {},
87							TEXT => undef,
88							STRIPPED => undef,
89							NORMALIZED => undef,
90							UNSTRIP => undef,
91							}, $class;
92	66					126	lock_keys %{$obj};
	66					337
93	66					968	$obj->text = shift;
94	66					108	%{$obj->{ATTR}} = (%{$obj->{ATTR}}, @_);
	66					224
	66					186
95	66					192	$obj;
96							}
97
98	206			206	1	535	sub text :lvalue { +shift->{TEXT} }
99	103			103	0	247	sub attr :lvalue { +shift->{ATTR} }
100
101							sub normalize {
102	66			66	0	103	my $obj = shift;
103	66					172	my $paragraph = $obj->attr->{paragraph};
104	66					169	local $_ = $obj->text;
105	66					114	my $normalized = do {
106	66	100				173	if (not $paragraph) {
107	8					35	s{^.+}{
108	14					21	local $_ = ${^MATCH};
109	14					50	s/\A(\h)(.?)(\h*?)\z/$2/;
110	14					64	$_;
111							}pmger;
112							} else {
113	58					434	s{^.+(?:\n.+)*}{
114	58					188	local $_ = ${^MATCH};
115							# remove leading/trailing spaces
116	58					660	s/\A(\h)(.?)(\h*?)\z/$2/;
117							# remove newline after Japanese Punct char
118	58					152	s/(?<=\p{InFullwidth})(?<=\pP)\n//g;
119							# join Japanese lines without space
120	58					120	s/(?<=\p{InFullwidth})\n(?=\p{InFullwidth})//g;
121							# join ASCII lines with single space
122	58					2269	s/\s+/ /g;
123	58					1908	$_;
124							}pmger;
125							}
126							};
127	66					413	return $normalized;
128							}
129
130							sub normalized {
131	66			66	1	166	my $obj = shift;
132	66		33			315	$obj->{NORMALIZED} //= $obj->normalize;
133							}
134
135							sub strip {
136	37			37	0	56	my $obj = shift;
137	37					103	my $text = $obj->text;
138	37	100				106	if ($obj->attr->{paragraph}) {
139	29					94	return $obj->paragraph_strip;
140							}
141	8					23	my $line_re = qr/.*\n\|.+\z/;
142	8					53	my @text = $text =~ /$line_re/g;
143							my @space = map {
144	8	100				20	[ s/\A(\s+)// ? $1 : '', s/(\h+)$// ? $1 : '' ]
	14	100				80
145							} @text;
146	8					16	$obj->{STRIPPED} = join '', @text;
147							$obj->{UNSTRIP} = sub {
148	8			8		13	for (@_) {
149	8					21	my @text = /.*\n\|.+\z/g;
150	8	100				15	if (@space == @text + 1) {
151	2					3	push @text, '';
152							}
153	8	50				12	die "UNMATCH:\n".Dumper(\@text, \@space) if @text != @space;
154	8					17	for my $i (keys @text) {
155	14					14	my($head, $tail) = @{$space[$i]};
	14					25
156	14	100				53	$text[$i] =~ s/\A/$head/ if length $head > 0;
157	14	100				31	$text[$i] =~ s/\Z/$tail/ if length $tail > 0;
158							}
159	8					25	$_ = join '', @text;
160							}
161	8					34	};
162	8					17	$obj;
163							}
164
165							sub paragraph_strip {
166	29			29	0	52	my $obj = shift;
167	29					91	local *_ = \($obj->{STRIPPED} = $obj->text);
168	29	100				167	my $head = s/\A(\s+)// ? $1 : '' ;
169	29	50				136	my $tail = s/(\h+)$// ? $1 : '' ;
170							$obj->{UNSTRIP} = sub {
171	29			29		100	for (@_) {
172	29	100				107	s/\A/$head/ if length $head;
173	29	50				89	s/\Z/$tail/ if length $tail;
174							}
175	29					165	};
176	29					90	$obj;
177							}
178
179							sub unstrip {
180	37			37	1	4551	my $obj = shift;
181	37	50				181	$obj->strip if not $obj->{UNSTRIP};
182	37	50				112	if (my $unstrip = $obj->{UNSTRIP}) {
183	37					87	$unstrip->(@_);
184							}
185	37					83	$obj;
186							}
187
188							1;