File Coverage

blib/lib/Lingua/Word/Parser.pm

Criterion	Covered	Total	%
statement	62	75	82.6
branch	8	20	40.0
condition	5	9	55.5
subroutine	12	13	92.3
pod	2	2	100.0
total	89	119	74.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Word::Parser;
2							our $AUTHORITY = 'cpan:GENE';
3
4							# ABSTRACT: Parse a word into scored known and unknown parts
5
6	1			1		641	use strict;
	1					2
	1					26
7	1			1		4	use warnings;
	1					1
	1					34
8
9							our $VERSION = '0.0805';
10
11	1			1		388	use Bit::Vector;
	1					827
	1					37
12	1			1		1253	use DBI;
	1					15260
	1					66
13	1			1		526	use List::PowerSet qw( powerset_lazy );
	1					415
	1					53
14	1			1		409	use IO::File;
	1					7701
	1					109
15
16	1			1		651	use Memoize;
	1					2167
	1					1999
17							memoize('_does_not_overlap');
18							memoize('power');
19							memoize('_reconstruct');
20							memoize('_grouping');
21							memoize('score');
22							memoize('score_parts');
23							memoize('_rle');
24							memoize('_or_together');
25
26
27
28							sub new {
29	2			2	1	1012	my $class = shift;
30	2					6	my %args = @_;
31							my $self = {
32							file => $args{file},
33							dbhost => $args{dbhost} \|\| 'localhost',
34							dbtype => $args{dbtype} \|\| 'mysql',
35							dbname => $args{dbname},
36							dbuser => $args{dbuser},
37							dbpass => $args{dbpass},
38							lex => $args{lex},
39							word => $args{word},
40	2		50			21	known => {},
			50
41							masks => {},
42							combos => [],
43							score => {},
44							};
45	2					4	bless $self, $class;
46	2					7	$self->_init(%args);
47	2					11	return $self;
48							}
49							sub _init {
50	2			2		18	my ($self, %args) = @_;
51
52							# Set the length of our word.
53	2					7	$self->{wlen} = length $self->{word};
54
55							# Set lex if given data.
56	2	100	66			46	if ( $self->{file} && -e $self->{file} ) {
		50
57	1					6	$self->_fetch_lex;
58							}
59							elsif( $self->{dbname} )
60							{
61	0					0	$self->_db_fetch;
62							}
63							}
64
65							sub _fetch_lex {
66	1			1		2	my $self = shift;
67
68							# Open the given file for reading...
69	1					9	my $fh = IO::File->new();
70	1	50				48	$fh->open( "< $self->{file}" ) or die "Can't read file: '$self->{file}'";
71	1					95	for ( <$fh> ) {
72							# Split space-separated entries.
73	9					15	chomp;
74	9					37	my ($re, $defn) = split /\s+/, $_, 2;
75							# Add the entry to the lexicon.
76	9					103	$self->{lex}{$re} = { defn => $defn, re => qr/$re/ };
77							}
78	1					19	$fh->close;
79
80	1					21	return $self->{lex};
81							}
82
83							sub _db_fetch {
84	0			0		0	my $self = shift;
85
86	0					0	my $dsn = "DBI:$self->{dbtype}:$self->{dbname};$self->{dbhost}";
87
88	0	0				0	my $dbh = DBI->connect( $dsn, $self->{dbuser}, $self->{dbpass}, { RaiseError => 1, AutoCommit => 1 } )
89							or die "Unable to connect to $self->{dbname}: $DBI::errstr\n";
90
91	0					0	my $sql = 'SELECT affix, definition FROM fragment';
92
93	0					0	my $sth = $dbh->prepare($sql);
94	0	0				0	$sth->execute or die "Unable to execute '$sql': $DBI::errstr\n";
95
96	0					0	while( my @row = $sth->fetchrow_array ) {
97	0					0	my $part = $row[0];
98	0					0	$self->{lex}{$part} = { re => qr/$part/, defn => $row[1] };
99							}
100	0	0				0	die "Fetch terminated early: $DBI::errstr\n" if $DBI::errstr;
101
102	0	0				0	$sth->finish or die "Unable to finish '$sql': $DBI::errstr\n";
103
104	0	0				0	$dbh->disconnect or die "Unable to disconnect from $self->{dbname}: $DBI::errstr\n";
105							}
106
107
108							sub knowns {
109	1			1	1	560	my $self = shift;
110
111							# The identifier for the known and masks lists.
112	1					2	my $id = 0;
113
114	1					2	for my $i (values %{ $self->{lex} }) {
	1					3
115	9					37	while ($self->{word} =~ /$i->{re}/g) {
116							# Match positions.
117	10					27	my ($m, $n) = ($-[0], $+[0]);
118							# Get matched word-part.
119	10					30	my $part = substr $self->{word}, $m, $n - $m;
120
121							# Create the part-of-word bitmask.
122	10					18	my $mask = 0 x $m; # Before known
123	10		50			20	$mask .= 1 x (($n - $m) \|\| 1); # Known part
124	10					17	$mask .= 0 x ($self->{wlen} - $n); # After known
125
126							# Output our progress.
127							# warn sprintf "%s %s - %s, %s (%d %d), %s\n",
128							# $mask,
129							# $i->{re},
130							# substr($self->{word}, 0, $m),
131							# $part,
132							# $m,
133							# $n - 1,
134							# substr($self->{word}, $n),
135							# ;
136
137							# Save the known as a member of a list keyed by starting position.
138							$self->{known}{$id} = {
139							part => $part,
140							span => [$m, $n - 1],
141							defn => $i->{defn},
142	10					44	mask => $mask,
143							};
144
145							# Save the relationship between mask and id.
146	10					45	$self->{masks}{$mask} = $id++;
147							}
148							}
149
150	1					4	return $self->{known};
151							}
152
153
154							sub power {
155							my $self = shift;
156
157							# Get a new powerset generator.
158							my $power = powerset_lazy(sort keys %{ $self->{masks} });
159
160							# Consider each member of the powerset.. to save or skip?
161							while (my $collection = $power->()) {
162							# warn "C: @$collection\n";
163
164							# Save this collection if it has only one item.
165							if (1 == @$collection) {
166							# warn "\t\tE: only 1 mask\n";
167							push @{ $self->{combos} }, $collection;
168							next;
169							}
170
171							# Compare each mask against the others.
172							LOOP: for my $i (0 .. @$collection - 1) {
173
174							# Set the comparison mask.
175							my $compare = $collection->[$i];
176
177							for my $j ($i + 1 .. @$collection - 1) {
178
179							# Set the current mask.
180							my $mask = $collection->[$j];
181							# warn "\tP:$compare v $mask\n";
182
183							# Skip this collection if an overlap is found.
184							if (not $self->_does_not_overlap($compare, $mask)) {
185							# warn "\t\tO:$compare v $mask\n";
186							last LOOP;
187							}
188
189							# Save this collection if we made it to the last pair.
190							if ($i == @$collection - 2 && $j == @$collection - 1) {
191							# warn "\t\tE:$compare v $mask\n";
192							push @{ $self->{combos} }, $collection;
193							}
194							}
195							}
196							}
197
198							# Hand back the "non-overlapping powerset."
199							return $self->{combos};
200							}
201
202
203							sub score {
204							my $self = shift;
205							my ( $open_separator, $close_separator ) = @_;
206
207							my $parts = $self->score_parts( $open_separator, $close_separator );
208
209							for my $mask ( keys %$parts ) {
210							my $familiarity = sprintf "%.2f chunks / %.2f chars", @{ $self->_familiarity($mask) };
211
212							for my $element ( @{ $parts->{$mask} } ) {
213							my $score = sprintf "%d:%d chunks / %d:%d chars",
214							$element->{score}{knowns}, $element->{score}{unknowns},
215							$element->{score}{knownc}, $element->{score}{unknownc};
216
217							my $part = join ', ', @{ $element->{partition} };
218
219							my $defn = join ', ', @{ $element->{definition} };
220
221							push @{ $self->{score}{$mask} }, {
222							score => $score,
223							familiarity => $familiarity,
224							partition => $part,
225							definition => $defn,
226							};
227							}
228							}
229
230							return $self->{score};
231							}
232
233							sub _familiarity {
234	609			609		881	my ( $self, $mask ) = @_;
235
236	609					2589	my @chunks = grep { $_ ne "" } split /(0+)/, $mask;
	2826					4892
237
238							# Figure out how many chars are only 1s and
239							# Figure out how many chunks are made up of 1s:
240	609					973	my $char_1s = 0;
241	609					688	my $chunk_1s = 0;
242	609					787	for my $chunk (@chunks) {
243	2577	100				3916	$char_1s += $chunk =~ /0/ ? 0 : length($chunk);
244	2577	100				4026	$chunk_1s += $chunk =~ /0/ ? 0 : 1;
245							}
246
247	609					3383	return [ $chunk_1s / @chunks, $char_1s / length($mask) ];
248							}
249
250
251							sub score_parts {
252							my $self = shift;
253							my ( $open_separator, $close_separator, $line_terminator ) = @_;
254
255							$line_terminator = '' unless defined $line_terminator;
256
257							# Visit each combination...
258							my $i = 0;
259							for my $c (@{ $self->{combos} }) {
260							$i++;
261							my $together = $self->_or_together(@$c);
262
263							# Breakdown knowns vs unknowns and knowncharacters vs unknowncharacters.
264							my %count = (
265							knowns => 0,
266							unknowns => 0,
267							knownc => 0,
268							unknownc => 0,
269							);
270
271							for my $x ( reverse sort @$c ) {
272							# Run-length encode an "un-digitized" string.
273							my $y = _rle($x);
274							my ( $knowns, $unknowns, $knownc, $unknownc ) = _grouping($y);
275							# Accumulate the counters!
276							$count{knowns} += $knowns;
277							$count{unknowns} += $unknowns;
278							$count{knownc} += $knownc;
279							$count{unknownc} += $unknownc;
280							}
281
282							my ( $s, $m ) = _reconstruct( $self->{word}, $c, $open_separator, $close_separator );
283
284							my $defn = [];
285							for my $i ( @$m )
286							{
287							for my $j ( keys %{ $self->{known} } )
288							{
289							push @$defn, $self->{known}{$j}{defn} if $self->{known}{$j}{mask} eq $i;
290							}
291							}
292
293							push @{ $self->{score_parts}{$together} }, {
294							score => \%count,
295							partition => $s,
296							definition => $defn,
297							familiarity => $self->_familiarity($together),
298							};
299							}
300
301							return $self->{score_parts};
302							}
303
304							sub _grouping {
305							my $scored = shift;
306							my @groups = $scored =~ /([ku]\d+)/g;
307							my ( $knowns, $unknowns ) = ( 0, 0 );
308							my ( $knownc, $unknownc ) = ( 0, 0 );
309							for ( @groups ) {
310							if ( /k(\d+)/ ) {
311							$knowns++;
312							$knownc += $1;
313							}
314							if ( /u(\d+)/ ) {
315							$unknowns++;
316							$unknownc += $1;
317							}
318							}
319							return $knowns, $unknowns, $knownc, $unknownc;
320							}
321
322							sub _rle {
323							my $scored = shift;
324							# Run-length encode an "un-digitized" string.
325							$scored =~ s/1/k/g; # Undigitize
326							$scored =~ s/0/u/g; # "
327							# Count contiguous chars.
328							$scored =~ s/(.)\1*/$1 . length(substr($scored, $-[0], $+[0]-$-[0]))/ge;
329							return $scored;
330							}
331
332							sub _does_not_overlap {
333							my $self = shift;
334
335							# Get our masks to check.
336							my ($mask, $check) = @_;
337
338							# Create the bitstrings to compare.
339							my $bitmask = Bit::Vector->new_Bin($self->{wlen}, $mask);
340							my $orclone = Bit::Vector->new_Bin($self->{wlen}, $check);
341							my $xorclone = Bit::Vector->new_Bin($self->{wlen}, $check);
342
343							# Compute or and xor for the strings.
344							$orclone->Or($bitmask, $orclone);
345							$xorclone->Xor($bitmask, $xorclone);
346
347							# Return the "or & xor equivalent sibling."
348							return $xorclone->equal($orclone) ? $orclone->to_Bin : 0;
349							}
350
351							sub _or_together {
352							my $self = shift;
353
354							# Get our masks to score.
355							my @masks = @_;
356
357							# Initialize the bitmask to return, to zero.
358							my $result = Bit::Vector->new_Bin($self->{wlen}, (0 x $self->{wlen}));
359
360							for my $mask (@masks) {
361							# Create the bitstrings to compare.
362							my $bitmask = Bit::Vector->new_Bin($self->{wlen}, $mask);
363
364							# Get the union of the bit strings.
365							$result->Or($result, $bitmask);
366							}
367
368							# Return the "or sum."
369							return $result->to_Bin;
370							}
371
372							sub _reconstruct {
373							my ( $word, $masks, $open_separator, $close_separator ) = @_;
374
375							$open_separator = '<' unless defined $open_separator;
376							$close_separator = '>' unless defined $close_separator;
377
378							my $strings = [];
379							my $my_masks = [];
380
381							for my $mask (reverse sort @$masks) {
382							my $i = 0;
383							my $last = 0;
384							my $string = '';
385							for my $m ( split //, $mask ) {
386							if ( $m ) {
387							$string .= $open_separator unless $last;
388							$string .= substr( $word, $i, 1 );
389							$last = 1;
390							}
391							else {
392							$string .= $close_separator if $last;
393							$string .= substr( $word, $i, 1 );
394							$last = 0;
395							}
396							$i++;
397							}
398							$string .= $close_separator if $last;
399							push @$strings, $string;
400							push @$my_masks, $mask;
401							}
402
403							return $strings, $my_masks;
404							}
405
406							1;
407
408							__END__