line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
2
|
|
|
2
|
|
46310
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
67
|
|
2
|
2
|
|
|
2
|
|
8
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
106
|
|
3
|
|
|
|
|
|
|
package Lingua::EN::Tokenizer::Offsets; |
4
|
|
|
|
|
|
|
{ |
5
|
|
|
|
|
|
|
$Lingua::EN::Tokenizer::Offsets::VERSION = '0.03'; |
6
|
|
|
|
|
|
|
} |
7
|
2
|
|
|
2
|
|
1668
|
use utf8::all; |
|
2
|
|
|
|
|
135433
|
|
|
2
|
|
|
|
|
16
|
|
8
|
2
|
|
|
2
|
|
45215
|
use Data::Dump qw/dump/; |
|
2
|
|
|
|
|
17767
|
|
|
2
|
|
|
|
|
181
|
|
9
|
2
|
|
|
2
|
|
20
|
use feature qw/say/; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
87
|
|
10
|
|
|
|
|
|
|
|
11
|
2
|
|
|
2
|
|
10
|
use base 'Exporter'; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
1983
|
|
12
|
|
|
|
|
|
|
our @EXPORT_OK = qw/ |
13
|
|
|
|
|
|
|
initial_offsets |
14
|
|
|
|
|
|
|
token_offsets |
15
|
|
|
|
|
|
|
adjust_offsets |
16
|
|
|
|
|
|
|
get_tokens |
17
|
|
|
|
|
|
|
tokenize |
18
|
|
|
|
|
|
|
offsets2tokens |
19
|
|
|
|
|
|
|
/; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# ABSTRACT: Finds word (token) boundaries, and returns their offsets. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
sub tokenize { |
26
|
0
|
|
|
0
|
1
|
0
|
my ($text) = @_; |
27
|
0
|
|
|
|
|
0
|
my $tokens = get_tokens($text); |
28
|
0
|
|
|
|
|
0
|
return join ' ',@$tokens; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub token_offsets { |
34
|
2
|
|
|
2
|
0
|
4
|
my ($text) = @_; |
35
|
2
|
50
|
|
|
|
11
|
return [] unless defined $text; |
36
|
2
|
|
|
|
|
9
|
my $offsets = initial_offsets($text); |
37
|
2
|
|
|
|
|
24
|
$offsets = adjust_offsets($text,$offsets); |
38
|
2
|
|
|
|
|
16
|
return $offsets; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub get_tokens { |
44
|
2
|
|
|
2
|
1
|
26
|
my ($text) = @_; |
45
|
2
|
|
|
|
|
11
|
my $offsets = token_offsets($text); |
46
|
2
|
|
|
|
|
13
|
my $tokens = offsets2tokens($text,$offsets); |
47
|
2
|
|
|
|
|
61
|
return $tokens; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub adjust_offsets { |
54
|
4
|
|
|
4
|
1
|
10
|
my ($text,$offsets) = @_; |
55
|
4
|
50
|
|
|
|
11
|
$text = $$text if ref($text); |
56
|
4
|
|
|
|
|
8
|
my $size = @$offsets; |
57
|
4
|
|
|
|
|
13
|
for(my $i=0; $i<$size; $i++){ |
58
|
947
|
|
|
|
|
1088
|
my $start = $offsets->[$i][0]; |
59
|
947
|
|
|
|
|
937
|
my $end = $offsets->[$i][1]; |
60
|
947
|
|
|
|
|
917
|
my $length = $end - $start; |
61
|
947
|
100
|
|
|
|
1450
|
if ($length <= 0){ |
62
|
31
|
|
|
|
|
41
|
delete $offsets->[$i]; |
63
|
31
|
|
|
|
|
64
|
next; |
64
|
|
|
|
|
|
|
} |
65
|
916
|
|
|
|
|
1524
|
my $s = substr($text,$start,$length); |
66
|
916
|
100
|
|
|
|
1999
|
if ($s =~ /^\s*$/){ |
67
|
67
|
|
|
|
|
133
|
delete $offsets->[$i]; |
68
|
67
|
|
|
|
|
149
|
next; |
69
|
|
|
|
|
|
|
} |
70
|
849
|
|
|
|
|
3741
|
$s =~ /^(\s*).*?(\s*)$/s; |
71
|
849
|
50
|
|
|
|
1547
|
if(defined($1)){ $start += length($1); } |
|
849
|
|
|
|
|
1071
|
|
72
|
849
|
50
|
|
|
|
1466
|
if(defined($2)){ $end -= length($2); } |
|
849
|
|
|
|
|
1004
|
|
73
|
849
|
|
|
|
|
2535
|
$offsets->[$i] = [$start, $end]; |
74
|
|
|
|
|
|
|
} |
75
|
4
|
|
|
|
|
15
|
my $new_offsets = [ grep { defined } @$offsets ]; |
|
945
|
|
|
|
|
1134
|
|
76
|
4
|
|
|
|
|
15
|
return $new_offsets; |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub initial_offsets { |
81
|
2
|
|
|
2
|
1
|
4
|
my ($text) = @_; |
82
|
2
|
50
|
|
|
|
8
|
$text = $$text if ref($text); |
83
|
2
|
|
|
|
|
4
|
my $end; |
84
|
2
|
|
|
|
|
15
|
my $text_end = length($text); |
85
|
2
|
|
|
|
|
6
|
my $offsets = [[0,$text_end]]; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# token patterns |
88
|
2
|
|
|
|
|
44
|
my @patterns = ( |
89
|
|
|
|
|
|
|
qr{([^\p{IsAlnum}\s\.\'\`\,\-’])}, |
90
|
|
|
|
|
|
|
qr{(?
|
91
|
|
|
|
|
|
|
qr{(?<=\p{IsN})(,)(?!\d)}, |
92
|
|
|
|
|
|
|
qr{(?
|
93
|
|
|
|
|
|
|
qr{(?
|
94
|
|
|
|
|
|
|
qr{(?
|
95
|
|
|
|
|
|
|
qr{(?<=\p{isAlpha})(['`’])(?!\p{isAlpha})}, |
96
|
|
|
|
|
|
|
qr{(?<=\p{isAlpha})()['`’](?=\p{isAlpha})}, |
97
|
|
|
|
|
|
|
qr{(?:^|\s)(\S+)(?:$|\s)}, |
98
|
|
|
|
|
|
|
qr{(?:^|[^\.])(\.\.+)(?:$|[^\.])}, |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
qr{(?<=\p{isAlpha})['`]()(?=\p{isAlpha})}, |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
); |
103
|
|
|
|
|
|
|
|
104
|
2
|
|
|
|
|
7
|
for my $pat (@patterns){ |
105
|
22
|
|
|
|
|
30
|
my $size = @$offsets; |
106
|
22
|
|
|
|
|
54
|
for(my $i=0; $i<$size; $i++){ |
107
|
2096
|
|
|
|
|
2895
|
my $start = $offsets->[$i][0]; |
108
|
2096
|
|
|
|
|
2454
|
my $length = $offsets->[$i][1]-$start; |
109
|
2096
|
|
|
|
|
8141
|
my $s = substr($text,$start,$length); |
110
|
|
|
|
|
|
|
|
111
|
2096
|
|
|
|
|
2500
|
my $split_points = []; |
112
|
|
|
|
|
|
|
|
113
|
2096
|
100
|
|
|
|
8785
|
if($s =~ /^$pat(?!$)/g){ |
114
|
37
|
|
|
|
|
81
|
my $first = $-[1]; |
115
|
37
|
|
|
|
|
110
|
push @$split_points,[$start+$first,$start+$first]; |
116
|
37
|
|
|
|
|
70
|
my $second = $+[1]; |
117
|
37
|
50
|
|
|
|
132
|
push @$split_points,[$start+$second,$start+$second] if $first != $second; |
118
|
|
|
|
|
|
|
} |
119
|
2096
|
|
|
|
|
14910
|
while($s =~ /(?
|
120
|
182
|
|
|
|
|
2377
|
my $first = $-[1]; |
121
|
182
|
|
|
|
|
478
|
push @$split_points,[$start+$first,$start+$first]; |
122
|
182
|
|
|
|
|
1169
|
my $second = $+[1]; |
123
|
182
|
100
|
|
|
|
1721
|
push @$split_points,[$start+$second,$start+$second] if $first != $second; |
124
|
|
|
|
|
|
|
} |
125
|
2096
|
100
|
|
|
|
9705
|
if($s =~ /(?
|
126
|
39
|
|
|
|
|
417
|
my $first = $-[1]; |
127
|
39
|
|
|
|
|
103
|
push @$split_points,[$start+$first,$start+$first]; |
128
|
39
|
|
|
|
|
92
|
my $second = $+[1]; |
129
|
39
|
50
|
|
|
|
171
|
push @$split_points,[$start+$second,$start+$second] if $first != $second; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
2096
|
100
|
|
|
|
10441
|
_split_tokens($offsets,$i,[ sort { $a->[0] <=> $b->[0] } @$split_points ]) if @$split_points; |
|
619
|
|
|
|
|
987
|
|
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
} |
135
|
2
|
|
|
|
|
18
|
return _nonbp($text,$offsets); |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub _split_tokens { |
139
|
50
|
|
|
50
|
|
74
|
my ($offsets,$i,$split_points) = @_; |
140
|
50
|
|
|
|
|
56
|
my ($end,$start) = @{shift @$split_points}; |
|
50
|
|
|
|
|
93
|
|
141
|
50
|
|
|
|
|
69
|
my $last = $offsets->[$i][1]; |
142
|
50
|
|
|
|
|
69
|
$offsets->[$i][1] = $end; |
143
|
50
|
|
|
|
|
129
|
while(my $p = shift @$split_points){ |
144
|
464
|
50
|
|
|
|
1273
|
push @$offsets, [$start,$p->[0]] unless $start == $p->[0]; |
145
|
464
|
|
|
|
|
1140
|
$start = $p->[1]; |
146
|
|
|
|
|
|
|
} |
147
|
50
|
|
|
|
|
351
|
push @$offsets, [$start, $last]; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub offsets2tokens { |
153
|
2
|
|
|
2
|
1
|
5
|
my ($text, $offsets) = @_; |
154
|
2
|
50
|
|
|
|
10
|
$text = $$text if ref($text); |
155
|
2
|
|
|
|
|
5
|
my $tokens = []; |
156
|
2
|
|
|
|
|
14
|
foreach my $o ( sort {$a->[0] <=> $b->[0]} @$offsets) { |
|
439
|
|
|
|
|
483
|
|
157
|
431
|
|
|
|
|
405
|
my $start = $o->[0]; |
158
|
431
|
|
|
|
|
449
|
my $length = $o->[1]-$o->[0]; |
159
|
431
|
|
|
|
|
820
|
push @$tokens, substr($text,$start,$length); |
160
|
|
|
|
|
|
|
} |
161
|
2
|
|
|
|
|
8
|
return $tokens; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub _load_prefixes { |
166
|
2
|
|
|
2
|
|
5
|
my ($prefixref) = @_; |
167
|
2
|
|
|
|
|
15
|
$INC{'Lingua/EN/Tokenizer/Offsets.pm'} =~ m{\.pm$}; |
168
|
2
|
|
|
|
|
13
|
my $prefixfile = "$`/nonbreaking_prefix.en"; |
169
|
|
|
|
|
|
|
|
170
|
2
|
50
|
|
|
|
253
|
open my $prefix, '<', $prefixfile or die "Could not open file '$prefixfile'!"; |
171
|
2
|
|
|
|
|
290
|
while (<$prefix>) { |
172
|
214
|
100
|
100
|
|
|
1103
|
next if /^#/ or /^\s*$/; |
173
|
186
|
|
|
|
|
215
|
my $item = $_; |
174
|
186
|
|
|
|
|
195
|
chomp($item); |
175
|
186
|
100
|
|
|
|
279
|
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { $prefixref->{$1} = 2; } |
|
6
|
|
|
|
|
39
|
|
176
|
180
|
|
|
|
|
660
|
else { $prefixref->{$item} = 1; } |
177
|
|
|
|
|
|
|
} |
178
|
2
|
|
|
|
|
76
|
close($prefix); |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub _nonbp { |
182
|
2
|
|
|
2
|
|
7
|
my ($text,$offsets) = @_; |
183
|
2
|
50
|
|
|
|
12
|
$text = $$text if ref($text); |
184
|
2
|
|
|
|
|
6
|
my $nonbpref = {}; |
185
|
2
|
|
|
|
|
39
|
_load_prefixes($nonbpref); |
186
|
2
|
|
|
|
|
8
|
my $new_offsets = adjust_offsets($text,$offsets); |
187
|
2
|
|
|
|
|
16
|
$new_offsets = [ sort { $a->[0] <=> $b->[0] } @$new_offsets ]; |
|
969
|
|
|
|
|
1023
|
|
188
|
2
|
|
|
|
|
14
|
my $size = @$new_offsets; |
189
|
2
|
|
|
|
|
5
|
my $extra = []; |
190
|
2
|
|
|
|
|
14
|
for(my $i=0; $i<$size-1; $i++){ |
191
|
416
|
|
|
|
|
456
|
my $start = $new_offsets->[$i][0]; |
192
|
416
|
|
|
|
|
404
|
my $end = $new_offsets->[$i][1]; |
193
|
416
|
|
|
|
|
392
|
my $length = $end-$start; |
194
|
416
|
|
|
|
|
631
|
my $s = substr($text,$start,$length); |
195
|
416
|
|
|
|
|
370
|
my $j=$i+1; |
196
|
416
|
|
|
|
|
743
|
my $t = substr($text,$new_offsets->[$j][0], $new_offsets->[$j][1]-$new_offsets->[$j][0]); |
197
|
|
|
|
|
|
|
|
198
|
416
|
100
|
|
|
|
1246
|
if($s =~ /^(\S+)\.\s?$/){ |
199
|
24
|
|
|
|
|
39
|
my $pre = $1; |
200
|
24
|
0
|
100
|
|
|
259
|
unless ( |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
201
|
|
|
|
|
|
|
($pre =~ /\./ and $pre =~ /\p{IsAlpha}/) |
202
|
|
|
|
|
|
|
or ($nonbpref->{$pre} and $nonbpref->{$pre}==1) |
203
|
|
|
|
|
|
|
or ($t =~ /^[\p{IsLower}]/) |
204
|
|
|
|
|
|
|
or ( |
205
|
|
|
|
|
|
|
$nonbpref->{$pre} |
206
|
|
|
|
|
|
|
and $nonbpref->{$pre}==2 |
207
|
|
|
|
|
|
|
and $t =~ /^\d+/) |
208
|
|
|
|
|
|
|
){ |
209
|
15
|
|
|
|
|
50
|
$s =~ /^(.*[^\s\.])\.\s*?$/; |
210
|
15
|
100
|
|
|
|
53
|
next unless defined($+[1]); |
211
|
13
|
|
|
|
|
38
|
push @$extra, [$start+$+[1],$end]; |
212
|
13
|
|
|
|
|
59
|
$new_offsets->[$i][1] = $start+$+[1]; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
} |
216
|
2
|
|
|
|
|
13
|
return [ sort { $a->[0] <=> $b->[0] } (@$new_offsets,@$extra) ]; |
|
639
|
|
|
|
|
764
|
|
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
1; |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
__END__ |