File Coverage

blib/lib/Lingua/EN/Segment.pm

Criterion	Covered	Total	%
statement	54	54	100.0
branch	3	4	75.0
condition	9	15	60.0
subroutine	16	16	100.0
pod	5	5	100.0
total	87	94	92.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Segment;
2
3	3			3		9571	use strict;
	3					6
	3					90
4	3			3		16	use warnings;
	3					6
	3					80
5	3			3		15	no warnings 'uninitialized';
	3					5
	3					194
6
7							our $VERSION = '0.004';
8							$VERSION = eval $VERSION;
9
10	3			3		79	use Carp;
	3					7
	3					232
11	3			3		1661	use English qw(-no_match_vars);
	3					3392
	3					16
12	3			3		1579	use File::ShareDir;
	3					24759
	3					152
13	3			3		21	use List::Util qw(min);
	3					7
	3					261
14	3			3		1974	use Memoize;
	3					7510
	3					2953
15
16							=head1 NAME
17
18							Lingua::EN::Segment - Split English-language domain names etc. into words
19
20							=head1 SYNOPSIS
21
22							my $segmenter = Lingua::EN::Segment->new;
23							for my $domain (<>) {
24							chomp $domain;
25							my @words = $segmenter->segment($domain);
26							print "$domain: ", join(', ', @words), "\n";
27							}
28
29							=head1 DESCRIPTION
30
31							Sometimes you have a string that to a human eye is clearly made up of many
32							words glommed together without spaces or hyphens. This module uses some mild
33							cunning and a large list of known words from Google to try and work out how
34							the string should be split into words.
35
36							=head2 new
37
38							Out: $segmenter
39
40							Returns a Lingua::EN::Segment object.
41
42							=cut
43
44							sub new {
45	2			2	1	1115	my ($package, %args) = @_;
46
47	2		33			17	return bless \%args => ref($package) \|\| $package;
48							}
49
50							=head2 dist_dir
51
52							Out: $dist_dir
53
54							Returns the name of the directory where distribution-specific files are
55							installed.
56
57							=cut
58
59							sub dist_dir {
60	3			3	1	16	my ($self) = @_;
61
62	3		66			29	$self->{dist_dir} \|\|= File::ShareDir::dist_dir('Lingua-EN-Segment');
63							}
64
65							=head2 segment
66
67							In: $unsegmented_string
68							Out: @words
69
70							Supplied with an unsegmented string - e.g. a domain name - returns a list of
71							words that are most statistically likely to be the words that make up this
72							string.
73
74							=cut
75
76							sub segment {
77	19			19	1	14467	my ($self, $unsegmented_string) = @_;
78
79	19	100				63	return if !length($unsegmented_string);
80	18					395	my $combination = $self->_best_combination($unsegmented_string, '~~');~~
81	18					187	return @{ $combination->{words} };
	18					179
82							}
83
84							# Supplied with an unsegmented string and the previous word (or ''
85							# if this is the beginning of the input string), splits up the unsegmented
86							# string into a word and a remainder, segments the remainder in turn,
87							# and returns the most likely match.
88							memoize('_best_combination', NORMALIZER => sub { "$_[1] $_[2]" });
89							sub _best_combination {
90							my ($self, $unsegmented_string, $previous_word) = @_;
91
92							# Work out all the possible words at the beginning of this string.
93							# (31 characters is the longest word in our corpus that is genuinely
94							# a real word, and not other words glommed together.)
95							# Then run this whole algorithm on the remainder, thus effectively
96							# working on the string from both the front and the back.
97							my @possible_combinations;
98							for my $prefix_length (1..min(length($unsegmented_string), 31)) {
99							my $current_word = substr($unsegmented_string, 0, $prefix_length);
100							my $current_probability
101							= $self->_probability($current_word, $previous_word);
102							my $remainder_word = substr($unsegmented_string, $prefix_length);
103							if ($remainder_word
104							and my $remainder
105							= $self->_best_combination($remainder_word, $current_word))
106							{
107							my $combination = {
108							current => {
109							words => [$current_word],
110							probability => $current_probability,
111
112							},
113							remainder => $remainder
114							};
115							$combination->{words} = [map { @{ $combination->{$_}{words} } }
116							qw(current remainder)];
117							$combination->{probability} = $combination->{current}{probability}
118							* $combination->{remainder}{probability};
119							push @possible_combinations, $combination;
120							} else {
121							push @possible_combinations,
122							{
123							probability => $current_probability,
124							words => [$current_word],
125							};
126							}
127							}
128							return (sort { $b->{probability} <=> $a->{probability} }
129							@possible_combinations)[0];
130							}
131
132							# Supplied with a word and the previous word, returns the probability of it
133							# matching something legitimate, either from the bigram corpus, or falling back
134							# to the unigram corpus.
135
136							memoize('_probability', NORMALIZER => sub { "$_[1] $_[2]" });
137							sub _probability {
138							my ($self, $word, $previous_word) = @_;
139
140							my $biword = $previous_word . ' ' . $word;
141							if ( exists $self->bigrams->{$biword}
142							&& exists $self->unigrams->{$previous_word})
143							{
144							return $self->bigrams->{$biword}
145							/ $self->_unigram_probability($previous_word);
146							} else {
147							return $self->_unigram_probability($word);
148							}
149							}
150
151							sub _unigram_probability {
152	360632			360632		615384	my ($self, $word) = @_;
153
154	360632		66			654863	return $self->unigrams->{$word} \|\| $self->unigrams->{__unknown__}->($word);
155							}
156
157							=head2 unigrams
158
159							Out: \%unigrams
160
161							Returns a hashref of word => likelihood to appear in Google's huge list of
162							words that they got off the Internet. The higher the likelihood, the more
163							likely that this is a genuine regularly-used word, rather than an obscure
164							word or a typo.
165
166							=cut
167
168							sub unigrams {
169	663179			663179	1	967932	my ($self) = @_;
170
171	663179		66			2352135	return $self->{unigrams} \|\|= $self->_read_file('count_1w.txt');
172							}
173
174							=head2 bigrams
175
176							Out: \%bigrams
177
178							As L, but returns a lookup table of "word1 word2" => likelihood
179							for combinations of words.
180
181							=cut
182
183							sub bigrams {
184	360959			360959	1	523331	my ($self) = @_;
185
186	360959		66			1554091	return $self->{bigrams} \|\|= $self->_read_file('count_2w.txt');
187							}
188
189							sub _read_file {
190	3			3		11	my ($self, $filename) = @_;
191
192	3					21	my $full_filename = $self->dist_dir . '/' . $filename;
193	3	50				436	open(my $fh, '<', $full_filename)
194							or croak "Couldn't read unigrams from $full_filename: $OS_ERROR";
195	3					10	my (%count, $total_count);
196	3					41304	while (<$fh>) {
197	953024					1329870	chomp;
198	953024					2889644	my ($word, $count) = split(/\t+/, $_);
199	953024					3171815	$count{$word} = $count;
200	953024					2315544	$total_count += $count;
201							}
202	3					446089	my %likelihood = map { $_ => $count{$_} / $total_count } %count;
	1906048					6354721
203							$likelihood{__unknown__} = sub {
204	302178			302178		489076	my $word = shift;
205	302178					1425005	return 10 / ($total_count * 10**length($word));
206	3					407404	};
207	3					510488	return \%likelihood;
208							}
209
210
211							=head1 ACKNOWLEDGEMENTS
212
213							This code is based on
214							L.
215
216							=cut
217
218							1;