line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::EN::Segment; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
9571
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
90
|
|
4
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
80
|
|
5
|
3
|
|
|
3
|
|
15
|
no warnings 'uninitialized'; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
194
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.004'; |
8
|
|
|
|
|
|
|
$VERSION = eval $VERSION; |
9
|
|
|
|
|
|
|
|
10
|
3
|
|
|
3
|
|
79
|
use Carp; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
232
|
|
11
|
3
|
|
|
3
|
|
1661
|
use English qw(-no_match_vars); |
|
3
|
|
|
|
|
3392
|
|
|
3
|
|
|
|
|
16
|
|
12
|
3
|
|
|
3
|
|
1579
|
use File::ShareDir; |
|
3
|
|
|
|
|
24759
|
|
|
3
|
|
|
|
|
152
|
|
13
|
3
|
|
|
3
|
|
21
|
use List::Util qw(min); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
261
|
|
14
|
3
|
|
|
3
|
|
1974
|
use Memoize; |
|
3
|
|
|
|
|
7510
|
|
|
3
|
|
|
|
|
2953
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Lingua::EN::Segment - Split English-language domain names etc. into words |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my $segmenter = Lingua::EN::Segment->new; |
23
|
|
|
|
|
|
|
for my $domain (<>) { |
24
|
|
|
|
|
|
|
chomp $domain; |
25
|
|
|
|
|
|
|
my @words = $segmenter->segment($domain); |
26
|
|
|
|
|
|
|
print "$domain: ", join(', ', @words), "\n"; |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 DESCRIPTION |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
Sometimes you have a string that to a human eye is clearly made up of many |
32
|
|
|
|
|
|
|
words glommed together without spaces or hyphens. This module uses some mild |
33
|
|
|
|
|
|
|
cunning and a large list of known words from Google to try and work out how |
34
|
|
|
|
|
|
|
the string should be split into words. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head2 new |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
Out: $segmenter |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Returns a Lingua::EN::Segment object. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=cut |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub new { |
45
|
2
|
|
|
2
|
1
|
1115
|
my ($package, %args) = @_; |
46
|
|
|
|
|
|
|
|
47
|
2
|
|
33
|
|
|
17
|
return bless \%args => ref($package) || $package; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 dist_dir |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Out: $dist_dir |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Returns the name of the directory where distribution-specific files are |
55
|
|
|
|
|
|
|
installed. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=cut |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub dist_dir { |
60
|
3
|
|
|
3
|
1
|
16
|
my ($self) = @_; |
61
|
|
|
|
|
|
|
|
62
|
3
|
|
66
|
|
|
29
|
$self->{dist_dir} ||= File::ShareDir::dist_dir('Lingua-EN-Segment'); |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head2 segment |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
In: $unsegmented_string |
68
|
|
|
|
|
|
|
Out: @words |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Supplied with an unsegmented string - e.g. a domain name - returns a list of |
71
|
|
|
|
|
|
|
words that are most statistically likely to be the words that make up this |
72
|
|
|
|
|
|
|
string. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=cut |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub segment { |
77
|
19
|
|
|
19
|
1
|
14467
|
my ($self, $unsegmented_string) = @_; |
78
|
|
|
|
|
|
|
|
79
|
19
|
100
|
|
|
|
63
|
return if !length($unsegmented_string); |
80
|
18
|
|
|
|
|
395
|
my $combination = $self->_best_combination($unsegmented_string, ''); |
81
|
18
|
|
|
|
|
187
|
return @{ $combination->{words} }; |
|
18
|
|
|
|
|
179
|
|
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# Supplied with an unsegmented string and the previous word (or '' |
85
|
|
|
|
|
|
|
# if this is the beginning of the input string), splits up the unsegmented |
86
|
|
|
|
|
|
|
# string into a word and a remainder, segments the remainder in turn, |
87
|
|
|
|
|
|
|
# and returns the most likely match. |
88
|
|
|
|
|
|
|
memoize('_best_combination', NORMALIZER => sub { "$_[1] $_[2]" }); |
89
|
|
|
|
|
|
|
sub _best_combination { |
90
|
|
|
|
|
|
|
my ($self, $unsegmented_string, $previous_word) = @_; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Work out all the possible words at the beginning of this string. |
93
|
|
|
|
|
|
|
# (31 characters is the longest word in our corpus that is genuinely |
94
|
|
|
|
|
|
|
# a real word, and not other words glommed together.) |
95
|
|
|
|
|
|
|
# Then run this whole algorithm on the remainder, thus effectively |
96
|
|
|
|
|
|
|
# working on the string from both the front and the back. |
97
|
|
|
|
|
|
|
my @possible_combinations; |
98
|
|
|
|
|
|
|
for my $prefix_length (1..min(length($unsegmented_string), 31)) { |
99
|
|
|
|
|
|
|
my $current_word = substr($unsegmented_string, 0, $prefix_length); |
100
|
|
|
|
|
|
|
my $current_probability |
101
|
|
|
|
|
|
|
= $self->_probability($current_word, $previous_word); |
102
|
|
|
|
|
|
|
my $remainder_word = substr($unsegmented_string, $prefix_length); |
103
|
|
|
|
|
|
|
if ($remainder_word |
104
|
|
|
|
|
|
|
and my $remainder |
105
|
|
|
|
|
|
|
= $self->_best_combination($remainder_word, $current_word)) |
106
|
|
|
|
|
|
|
{ |
107
|
|
|
|
|
|
|
my $combination = { |
108
|
|
|
|
|
|
|
current => { |
109
|
|
|
|
|
|
|
words => [$current_word], |
110
|
|
|
|
|
|
|
probability => $current_probability, |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
}, |
113
|
|
|
|
|
|
|
remainder => $remainder |
114
|
|
|
|
|
|
|
}; |
115
|
|
|
|
|
|
|
$combination->{words} = [map { @{ $combination->{$_}{words} } } |
116
|
|
|
|
|
|
|
qw(current remainder)]; |
117
|
|
|
|
|
|
|
$combination->{probability} = $combination->{current}{probability} |
118
|
|
|
|
|
|
|
* $combination->{remainder}{probability}; |
119
|
|
|
|
|
|
|
push @possible_combinations, $combination; |
120
|
|
|
|
|
|
|
} else { |
121
|
|
|
|
|
|
|
push @possible_combinations, |
122
|
|
|
|
|
|
|
{ |
123
|
|
|
|
|
|
|
probability => $current_probability, |
124
|
|
|
|
|
|
|
words => [$current_word], |
125
|
|
|
|
|
|
|
}; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
return (sort { $b->{probability} <=> $a->{probability} } |
129
|
|
|
|
|
|
|
@possible_combinations)[0]; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Supplied with a word and the previous word, returns the probability of it |
133
|
|
|
|
|
|
|
# matching something legitimate, either from the bigram corpus, or falling back |
134
|
|
|
|
|
|
|
# to the unigram corpus. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
memoize('_probability', NORMALIZER => sub { "$_[1] $_[2]" }); |
137
|
|
|
|
|
|
|
sub _probability { |
138
|
|
|
|
|
|
|
my ($self, $word, $previous_word) = @_; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my $biword = $previous_word . ' ' . $word; |
141
|
|
|
|
|
|
|
if ( exists $self->bigrams->{$biword} |
142
|
|
|
|
|
|
|
&& exists $self->unigrams->{$previous_word}) |
143
|
|
|
|
|
|
|
{ |
144
|
|
|
|
|
|
|
return $self->bigrams->{$biword} |
145
|
|
|
|
|
|
|
/ $self->_unigram_probability($previous_word); |
146
|
|
|
|
|
|
|
} else { |
147
|
|
|
|
|
|
|
return $self->_unigram_probability($word); |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
sub _unigram_probability { |
152
|
360632
|
|
|
360632
|
|
615384
|
my ($self, $word) = @_; |
153
|
|
|
|
|
|
|
|
154
|
360632
|
|
66
|
|
|
654863
|
return $self->unigrams->{$word} || $self->unigrams->{__unknown__}->($word); |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 unigrams |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Out: \%unigrams |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Returns a hashref of word => likelihood to appear in Google's huge list of |
162
|
|
|
|
|
|
|
words that they got off the Internet. The higher the likelihood, the more |
163
|
|
|
|
|
|
|
likely that this is a genuine regularly-used word, rather than an obscure |
164
|
|
|
|
|
|
|
word or a typo. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=cut |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub unigrams { |
169
|
663179
|
|
|
663179
|
1
|
967932
|
my ($self) = @_; |
170
|
|
|
|
|
|
|
|
171
|
663179
|
|
66
|
|
|
2352135
|
return $self->{unigrams} ||= $self->_read_file('count_1w.txt'); |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head2 bigrams |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Out: \%bigrams |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
As L, but returns a lookup table of "word1 word2" => likelihood |
179
|
|
|
|
|
|
|
for combinations of words. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=cut |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub bigrams { |
184
|
360959
|
|
|
360959
|
1
|
523331
|
my ($self) = @_; |
185
|
|
|
|
|
|
|
|
186
|
360959
|
|
66
|
|
|
1554091
|
return $self->{bigrams} ||= $self->_read_file('count_2w.txt'); |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub _read_file { |
190
|
3
|
|
|
3
|
|
11
|
my ($self, $filename) = @_; |
191
|
|
|
|
|
|
|
|
192
|
3
|
|
|
|
|
21
|
my $full_filename = $self->dist_dir . '/' . $filename; |
193
|
3
|
50
|
|
|
|
436
|
open(my $fh, '<', $full_filename) |
194
|
|
|
|
|
|
|
or croak "Couldn't read unigrams from $full_filename: $OS_ERROR"; |
195
|
3
|
|
|
|
|
10
|
my (%count, $total_count); |
196
|
3
|
|
|
|
|
41304
|
while (<$fh>) { |
197
|
953024
|
|
|
|
|
1329870
|
chomp; |
198
|
953024
|
|
|
|
|
2889644
|
my ($word, $count) = split(/\t+/, $_); |
199
|
953024
|
|
|
|
|
3171815
|
$count{$word} = $count; |
200
|
953024
|
|
|
|
|
2315544
|
$total_count += $count; |
201
|
|
|
|
|
|
|
} |
202
|
3
|
|
|
|
|
446089
|
my %likelihood = map { $_ => $count{$_} / $total_count } %count; |
|
1906048
|
|
|
|
|
6354721
|
|
203
|
|
|
|
|
|
|
$likelihood{__unknown__} = sub { |
204
|
302178
|
|
|
302178
|
|
489076
|
my $word = shift; |
205
|
302178
|
|
|
|
|
1425005
|
return 10 / ($total_count * 10**length($word)); |
206
|
3
|
|
|
|
|
407404
|
}; |
207
|
3
|
|
|
|
|
510488
|
return \%likelihood; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
This code is based on |
214
|
|
|
|
|
|
|
L. |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
1; |