| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package App::Greple::xlate::Text; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=encoding utf-8 |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
App::Greple::xlate::Text - text normalization interface |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
my $obj = App::Greple::xlate::Text->new($text, paragraph => 1); |
|
12
|
|
|
|
|
|
|
my $normalized = $obj->normalized; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
$result = process($normalized); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
$obj->unstrip($result); |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
This is an interface used within L to normalize |
|
21
|
|
|
|
|
|
|
text. |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
To get the normalized text, use the C method. |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
During normalization process, any whitespace at the beginning and the |
|
26
|
|
|
|
|
|
|
end of the line is removed. Therefore, the result of processing the |
|
27
|
|
|
|
|
|
|
normalized text does not preserve the whitespace in the original |
|
28
|
|
|
|
|
|
|
string; the C method can be used to restore the removed |
|
29
|
|
|
|
|
|
|
whitespace. |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 METHODS |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=over 7 |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=item B |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
Creates an object. The first parameter is the original string; the |
|
38
|
|
|
|
|
|
|
second and subsequent parameters are pairs of attribute name and values. |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=over 4 |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item B |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Specifies whether or not the text should be treated as a paragraph. |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
If true, multiple lines are concatenated into a single line. |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
If false, multiple strings are processed as they are. |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
In both cases, leading and trailing whitespace is stripped from each |
|
51
|
|
|
|
|
|
|
line. |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=back |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item B() |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Returns a normalized string. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item B(I<$text>) |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Recover removed white spaces from normalized text or corresponding |
|
62
|
|
|
|
|
|
|
cooked text. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
If not in paragraph mode, the string to be processed must have the |
|
65
|
|
|
|
|
|
|
same number of lines as the original string. |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item B |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Retrieve original text. |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=back |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=cut |
|
74
|
|
|
|
|
|
|
|
|
75
|
12
|
|
|
12
|
|
127686
|
use v5.14; |
|
|
12
|
|
|
|
|
95
|
|
|
76
|
12
|
|
|
12
|
|
74
|
use warnings; |
|
|
12
|
|
|
|
|
25
|
|
|
|
12
|
|
|
|
|
785
|
|
|
77
|
12
|
|
|
12
|
|
72
|
use utf8; |
|
|
12
|
|
|
|
|
36
|
|
|
|
12
|
|
|
|
|
93
|
|
|
78
|
|
|
|
|
|
|
|
|
79
|
12
|
|
|
12
|
|
460
|
use Data::Dumper; |
|
|
12
|
|
|
|
|
31
|
|
|
|
12
|
|
|
|
|
921
|
|
|
80
|
12
|
|
|
12
|
|
670
|
use Unicode::EastAsianWidth; |
|
|
12
|
|
|
|
|
1602
|
|
|
|
12
|
|
|
|
|
1256
|
|
|
81
|
12
|
|
|
12
|
|
600
|
use Hash::Util qw(lock_keys); |
|
|
12
|
|
|
|
|
3789
|
|
|
|
12
|
|
|
|
|
87
|
|
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub new { |
|
84
|
68
|
|
|
68
|
1
|
226699
|
my $class = shift; |
|
85
|
68
|
|
|
|
|
280
|
my $obj = bless { |
|
86
|
|
|
|
|
|
|
ATTR => {}, |
|
87
|
|
|
|
|
|
|
TEXT => undef, |
|
88
|
|
|
|
|
|
|
STRIPPED => undef, |
|
89
|
|
|
|
|
|
|
NORMALIZED => undef, |
|
90
|
|
|
|
|
|
|
UNSTRIP => undef, |
|
91
|
|
|
|
|
|
|
}, $class; |
|
92
|
68
|
|
|
|
|
128
|
lock_keys %{$obj}; |
|
|
68
|
|
|
|
|
274
|
|
|
93
|
68
|
|
|
|
|
746
|
$obj->text = shift; |
|
94
|
68
|
|
|
|
|
91
|
%{$obj->{ATTR}} = (%{$obj->{ATTR}}, @_); |
|
|
68
|
|
|
|
|
183
|
|
|
|
68
|
|
|
|
|
145
|
|
|
95
|
68
|
|
|
|
|
239
|
$obj; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
212
|
|
|
212
|
1
|
497
|
sub text :lvalue { +shift->{TEXT} } |
|
99
|
106
|
|
|
106
|
0
|
233
|
sub attr :lvalue { +shift->{ATTR} } |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
sub normalize { |
|
102
|
68
|
|
|
68
|
0
|
103
|
my $obj = shift; |
|
103
|
68
|
|
|
|
|
145
|
my $paragraph = $obj->attr->{paragraph}; |
|
104
|
68
|
|
|
|
|
167
|
local $_ = $obj->text; |
|
105
|
68
|
|
|
|
|
78
|
my $normalized = do { |
|
106
|
68
|
100
|
|
|
|
162
|
if (not $paragraph) { |
|
107
|
8
|
|
|
|
|
61
|
s{^.+}{ |
|
108
|
14
|
|
|
|
|
38
|
local $_ = ${^MATCH}; |
|
109
|
14
|
|
|
|
|
84
|
s/\A(\h*)(.*?)(\h*?)\z/$2/; |
|
110
|
14
|
|
|
|
|
75
|
$_; |
|
111
|
|
|
|
|
|
|
}pmger; |
|
112
|
|
|
|
|
|
|
} else { |
|
113
|
60
|
|
|
|
|
277
|
s{^.+(?:\n.+)*}{ |
|
114
|
60
|
|
|
|
|
139
|
local $_ = ${^MATCH}; |
|
115
|
|
|
|
|
|
|
# remove leading/trailing spaces |
|
116
|
60
|
|
|
|
|
434
|
s/\A(\h*)(.*?)(\h*?)\z/$2/; |
|
117
|
|
|
|
|
|
|
# remove newline after Japanese Punct char |
|
118
|
60
|
|
|
|
|
112
|
s/(?<=\p{InFullwidth})(?<=\pP)\n//g; |
|
119
|
|
|
|
|
|
|
# join Japanese lines without space |
|
120
|
60
|
|
|
|
|
82
|
s/(?<=\p{InFullwidth})\n(?=\p{InFullwidth})//g; |
|
121
|
|
|
|
|
|
|
# join ASCII lines with single space |
|
122
|
60
|
|
|
|
|
2038
|
s/\s+/ /g; |
|
123
|
60
|
|
|
|
|
1920
|
$_; |
|
124
|
|
|
|
|
|
|
}pmger; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
}; |
|
127
|
68
|
|
|
|
|
2149
|
return $normalized; |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub normalized { |
|
131
|
68
|
|
|
68
|
1
|
118
|
my $obj = shift; |
|
132
|
68
|
|
33
|
|
|
250
|
$obj->{NORMALIZED} //= $obj->normalize; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub strip { |
|
136
|
38
|
|
|
38
|
0
|
53
|
my $obj = shift; |
|
137
|
38
|
|
|
|
|
87
|
my $text = $obj->text; |
|
138
|
38
|
100
|
|
|
|
75
|
if ($obj->attr->{paragraph}) { |
|
139
|
30
|
|
|
|
|
85
|
return $obj->paragraph_strip; |
|
140
|
|
|
|
|
|
|
} |
|
141
|
8
|
|
|
|
|
37
|
my $line_re = qr/.*\n|.+\z/; |
|
142
|
8
|
|
|
|
|
91
|
my @text = $text =~ /$line_re/g; |
|
143
|
|
|
|
|
|
|
my @space = map { |
|
144
|
8
|
100
|
|
|
|
23
|
[ s/\A(\s+)// ? $1 : '', s/(\h+)$// ? $1 : '' ] |
|
|
14
|
100
|
|
|
|
142
|
|
|
145
|
|
|
|
|
|
|
} @text; |
|
146
|
8
|
|
|
|
|
28
|
$obj->{STRIPPED} = join '', @text; |
|
147
|
|
|
|
|
|
|
$obj->{UNSTRIP} = sub { |
|
148
|
8
|
|
|
8
|
|
19
|
for (@_) { |
|
149
|
8
|
|
|
|
|
45
|
my @text = /.*\n|.+\z/g; |
|
150
|
8
|
100
|
|
|
|
28
|
if (@space == @text + 1) { |
|
151
|
2
|
|
|
|
|
4
|
push @text, ''; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
8
|
50
|
|
|
|
24
|
die "UNMATCH:\n".Dumper(\@text, \@space) if @text != @space; |
|
154
|
8
|
|
|
|
|
26
|
for my $i (keys @text) { |
|
155
|
14
|
|
|
|
|
23
|
my($head, $tail) = @{$space[$i]}; |
|
|
14
|
|
|
|
|
44
|
|
|
156
|
14
|
100
|
|
|
|
58
|
$text[$i] =~ s/\A/$head/ if length $head > 0; |
|
157
|
14
|
100
|
|
|
|
51
|
$text[$i] =~ s/\Z/$tail/ if length $tail > 0; |
|
158
|
|
|
|
|
|
|
} |
|
159
|
8
|
|
|
|
|
38
|
$_ = join '', @text; |
|
160
|
|
|
|
|
|
|
} |
|
161
|
8
|
|
|
|
|
57
|
}; |
|
162
|
8
|
|
|
|
|
29
|
$obj; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub paragraph_strip { |
|
166
|
30
|
|
|
30
|
0
|
38
|
my $obj = shift; |
|
167
|
30
|
|
|
|
|
46
|
local *_ = \($obj->{STRIPPED} = $obj->text); |
|
168
|
30
|
100
|
|
|
|
103
|
my $head = s/\A(\s+)// ? $1 : '' ; |
|
169
|
30
|
50
|
|
|
|
95
|
my $tail = s/(\h+)$// ? $1 : '' ; |
|
170
|
|
|
|
|
|
|
$obj->{UNSTRIP} = sub { |
|
171
|
30
|
|
|
30
|
|
66
|
for (@_) { |
|
172
|
30
|
100
|
|
|
|
122
|
s/\A/$head/ if length $head; |
|
173
|
30
|
50
|
|
|
|
68
|
s/\Z/$tail/ if length $tail; |
|
174
|
|
|
|
|
|
|
} |
|
175
|
30
|
|
|
|
|
118
|
}; |
|
176
|
30
|
|
|
|
|
65
|
$obj; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub unstrip { |
|
180
|
38
|
|
|
38
|
1
|
8087
|
my $obj = shift; |
|
181
|
38
|
50
|
|
|
|
143
|
$obj->strip if not $obj->{UNSTRIP}; |
|
182
|
38
|
50
|
|
|
|
94
|
if (my $unstrip = $obj->{UNSTRIP}) { |
|
183
|
38
|
|
|
|
|
96
|
$unstrip->(@_); |
|
184
|
|
|
|
|
|
|
} |
|
185
|
38
|
|
|
|
|
89
|
$obj; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
1; |