File Coverage

blib/lib/Lingua/Word/Parser.pm

Criterion	Covered	Total	%
statement	64	77	83.1
branch	8	20	40.0
condition	5	9	55.5
subroutine	12	13	92.3
pod	2	2	100.0
total	91	121	75.2

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Word::Parser;
2							our $AUTHORITY = 'cpan:GENE';
3
4							# ABSTRACT: Parse a word into scored known and unknown parts
5
6	1			1		898	use strict;
	1					3
	1					36
7	1			1		6	use warnings;
	1					2
	1					50
8
9							our $VERSION = '0.0806';
10
11	1			1		498	use Bit::Vector;
	1					1085
	1					63
12	1			1		1672	use DBI;
	1					18943
	1					106
13	1			1		751	use List::PowerSet qw( powerset_lazy );
	1					540
	1					68
14	1			1		535	use IO::File;
	1					9663
	1					138
15
16	1			1		878	use Memoize;
	1					2617
	1					2491
17							memoize('_does_not_overlap');
18							memoize('power');
19							memoize('_reconstruct');
20							memoize('_grouping');
21							memoize('score');
22							memoize('score_parts');
23							memoize('_rle');
24							memoize('_or_together');
25
26
27
28							sub new {
29	2			2	1	1480	my $class = shift;
30	2					7	my %args = @_;
31							my $self = {
32							file => $args{file},
33							dbhost => $args{dbhost} \|\| 'localhost',
34							dbtype => $args{dbtype} \|\| 'mysql',
35							dbname => $args{dbname},
36							dbuser => $args{dbuser},
37							dbpass => $args{dbpass},
38							lex => $args{lex},
39							word => $args{word},
40	2		50			34	known => {},
			50
41							masks => {},
42							combos => [],
43							score => {},
44							};
45	2					5	bless $self, $class;
46	2					9	$self->_init(%args);
47	2					7	return $self;
48							}
49							sub _init {
50	2			2		22	my ($self, %args) = @_;
51
52							# Set the length of our word.
53	2					8	$self->{wlen} = length $self->{word};
54
55							# Set lex if given data.
56	2	100	66			55	if ( $self->{file} && -e $self->{file} ) {
		50
57	1					6	$self->_fetch_lex;
58							}
59							elsif( $self->{dbname} )
60							{
61	0					0	$self->_db_fetch;
62							}
63							}
64
65							sub _fetch_lex {
66	1			1		2	my $self = shift;
67
68	1					3	my $i = 0;
69
70							# Open the given file for reading...
71	1					11	my $fh = IO::File->new;
72	1	50				72	$fh->open( "< $self->{file}" ) or die "Can't read file: '$self->{file}'";
73	1					116	for ( <$fh> ) {
74	9					21	$i++;
75							# Split space-separated entries.
76	9					13	chomp;
77	9					39	my ($re, $defn) = split /\s+/, $_, 2;
78							# Add the entry to the lexicon.
79	9					135	$self->{lex}{$i} = { defn => $defn, re => qr/$re/ };
80							}
81	1					16	$fh->close;
82
83	1					28	return $self->{lex};
84							}
85
86							sub _db_fetch {
87	0			0		0	my $self = shift;
88
89	0					0	my $dsn = "DBI:$self->{dbtype}:$self->{dbname};$self->{dbhost}";
90
91	0	0				0	my $dbh = DBI->connect( $dsn, $self->{dbuser}, $self->{dbpass}, { RaiseError => 1, AutoCommit => 1 } )
92							or die "Unable to connect to $self->{dbname}: $DBI::errstr\n";
93
94	0					0	my $sql = 'SELECT id, affix, definition FROM fragment';
95
96	0					0	my $sth = $dbh->prepare($sql);
97	0	0				0	$sth->execute or die "Unable to execute '$sql': $DBI::errstr\n";
98
99	0					0	while( my @row = $sth->fetchrow_array ) {
100	0					0	my ($id, $part, $defn) = @row;
101	0					0	$self->{lex}{$id} = { re => qr/$part/, defn => $defn };
102							}
103	0	0				0	die "Fetch terminated early: $DBI::errstr\n" if $DBI::errstr;
104
105	0	0				0	$sth->finish or die "Unable to finish '$sql': $DBI::errstr\n";
106
107	0	0				0	$dbh->disconnect or die "Unable to disconnect from $self->{dbname}: $DBI::errstr\n";
108							}
109
110
111							sub knowns {
112	1			1	1	1113	my $self = shift;
113
114							# The identifier for the known and masks lists.
115	1					3	my $id = 0;
116
117	1					3	for my $i (values %{ $self->{lex} }) {
	1					6
118	9					46	while ($self->{word} =~ /$i->{re}/g) {
119							# Match positions.
120	10					34	my ($m, $n) = ($-[0], $+[0]);
121							# Get matched word-part.
122	10					27	my $part = substr $self->{word}, $m, $n - $m;
123
124							# Create the part-of-word bitmask.
125	10					20	my $mask = 0 x $m; # Before known
126	10		50			26	$mask .= 1 x (($n - $m) \|\| 1); # Known part
127	10					17	$mask .= 0 x ($self->{wlen} - $n); # After known
128
129							# Output our progress.
130							# warn sprintf "%s %s - %s, %s (%d %d), %s\n",
131							# $mask,
132							# $i->{re},
133							# substr($self->{word}, 0, $m),
134							# $part,
135							# $m,
136							# $n - 1,
137							# substr($self->{word}, $n),
138							# ;
139
140							# Save the known as a member of a list keyed by starting position.
141							$self->{known}{$id} = {
142							part => $part,
143							span => [$m, $n - 1],
144							defn => $i->{defn},
145	10					56	mask => $mask,
146							};
147
148							# Save the relationship between mask and id.
149	10					57	$self->{masks}{$mask} = $id++;
150							}
151							}
152
153	1					5	return $self->{known};
154							}
155
156
157							sub power {
158							my $self = shift;
159
160							# Get a new powerset generator.
161							my $power = powerset_lazy(sort keys %{ $self->{masks} });
162
163							# Consider each member of the powerset.. to save or skip?
164							while (my $collection = $power->()) {
165							# warn "C: @$collection\n";
166
167							# Save this collection if it has only one item.
168							if (1 == @$collection) {
169							# warn "\t\tE: only 1 mask\n";
170							push @{ $self->{combos} }, $collection;
171							next;
172							}
173
174							# Compare each mask against the others.
175							LOOP: for my $i (0 .. @$collection - 1) {
176
177							# Set the comparison mask.
178							my $compare = $collection->[$i];
179
180							for my $j ($i + 1 .. @$collection - 1) {
181
182							# Set the current mask.
183							my $mask = $collection->[$j];
184							# warn "\tP:$compare v $mask\n";
185
186							# Skip this collection if an overlap is found.
187							if (not $self->_does_not_overlap($compare, $mask)) {
188							# warn "\t\tO:$compare v $mask\n";
189							last LOOP;
190							}
191
192							# Save this collection if we made it to the last pair.
193							if ($i == @$collection - 2 && $j == @$collection - 1) {
194							# warn "\t\tE:$compare v $mask\n";
195							push @{ $self->{combos} }, $collection;
196							}
197							}
198							}
199							}
200
201							# Hand back the "non-overlapping powerset."
202							return $self->{combos};
203							}
204
205
206							sub score {
207							my $self = shift;
208							my ( $open_separator, $close_separator ) = @_;
209
210							my $parts = $self->score_parts( $open_separator, $close_separator );
211
212							for my $mask ( keys %$parts ) {
213							my $familiarity = sprintf "%.2f chunks / %.2f chars", @{ $self->_familiarity($mask) };
214
215							for my $element ( @{ $parts->{$mask} } ) {
216							my $score = sprintf "%d:%d chunks / %d:%d chars",
217							$element->{score}{knowns}, $element->{score}{unknowns},
218							$element->{score}{knownc}, $element->{score}{unknownc};
219
220							my $part = join ', ', @{ $element->{partition} };
221
222							my $defn = join ', ', @{ $element->{definition} };
223
224							push @{ $self->{score}{$mask} }, {
225							score => $score,
226							familiarity => $familiarity,
227							partition => $part,
228							definition => $defn,
229							};
230							}
231							}
232
233							return $self->{score};
234							}
235
236							sub _familiarity {
237	609			609		1203	my ( $self, $mask ) = @_;
238
239	609					3298	my @chunks = grep { $_ ne "" } split /(0+)/, $mask;
	2826					6580
240
241							# Figure out how many chars are only 1s and
242							# Figure out how many chunks are made up of 1s:
243	609					1169	my $char_1s = 0;
244	609					838	my $chunk_1s = 0;
245	609					993	for my $chunk (@chunks) {
246	2577	100				4846	$char_1s += $chunk =~ /0/ ? 0 : length($chunk);
247	2577	100				4562	$chunk_1s += $chunk =~ /0/ ? 0 : 1;
248							}
249
250	609					4427	return [ $chunk_1s / @chunks, $char_1s / length($mask) ];
251							}
252
253
254							sub score_parts {
255							my $self = shift;
256							my ( $open_separator, $close_separator, $line_terminator ) = @_;
257
258							$line_terminator = '' unless defined $line_terminator;
259
260							# Visit each combination...
261							my $i = 0;
262							for my $c (@{ $self->{combos} }) {
263							$i++;
264							my $together = $self->_or_together(@$c);
265
266							# Breakdown knowns vs unknowns and knowncharacters vs unknowncharacters.
267							my %count = (
268							knowns => 0,
269							unknowns => 0,
270							knownc => 0,
271							unknownc => 0,
272							);
273
274							for my $x ( reverse sort @$c ) {
275							# Run-length encode an "un-digitized" string.
276							my $y = _rle($x);
277							my ( $knowns, $unknowns, $knownc, $unknownc ) = _grouping($y);
278							# Accumulate the counters!
279							$count{knowns} += $knowns;
280							$count{unknowns} += $unknowns;
281							$count{knownc} += $knownc;
282							$count{unknownc} += $unknownc;
283							}
284
285							my ( $s, $m ) = _reconstruct( $self->{word}, $c, $open_separator, $close_separator );
286
287							my $defn = [];
288							for my $i ( @$m )
289							{
290							for my $j ( keys %{ $self->{known} } )
291							{
292							push @$defn, $self->{known}{$j}{defn} if $self->{known}{$j}{mask} eq $i;
293							}
294							}
295
296							push @{ $self->{score_parts}{$together} }, {
297							score => \%count,
298							partition => $s,
299							definition => $defn,
300							familiarity => $self->_familiarity($together),
301							};
302							}
303
304							return $self->{score_parts};
305							}
306
307							sub _grouping {
308							my $scored = shift;
309							my @groups = $scored =~ /([ku]\d+)/g;
310							my ( $knowns, $unknowns ) = ( 0, 0 );
311							my ( $knownc, $unknownc ) = ( 0, 0 );
312							for ( @groups ) {
313							if ( /k(\d+)/ ) {
314							$knowns++;
315							$knownc += $1;
316							}
317							if ( /u(\d+)/ ) {
318							$unknowns++;
319							$unknownc += $1;
320							}
321							}
322							return $knowns, $unknowns, $knownc, $unknownc;
323							}
324
325							sub _rle {
326							my $scored = shift;
327							# Run-length encode an "un-digitized" string.
328							$scored =~ s/1/k/g; # Undigitize
329							$scored =~ s/0/u/g; # "
330							# Count contiguous chars.
331							$scored =~ s/(.)\1*/$1 . length(substr($scored, $-[0], $+[0]-$-[0]))/ge;
332							return $scored;
333							}
334
335							sub _does_not_overlap {
336							my $self = shift;
337
338							# Get our masks to check.
339							my ($mask, $check) = @_;
340
341							# Create the bitstrings to compare.
342							my $bitmask = Bit::Vector->new_Bin($self->{wlen}, $mask);
343							my $orclone = Bit::Vector->new_Bin($self->{wlen}, $check);
344							my $xorclone = Bit::Vector->new_Bin($self->{wlen}, $check);
345
346							# Compute or and xor for the strings.
347							$orclone->Or($bitmask, $orclone);
348							$xorclone->Xor($bitmask, $xorclone);
349
350							# Return the "or & xor equivalent sibling."
351							return $xorclone->equal($orclone) ? $orclone->to_Bin : 0;
352							}
353
354							sub _or_together {
355							my $self = shift;
356
357							# Get our masks to score.
358							my @masks = @_;
359
360							# Initialize the bitmask to return, to zero.
361							my $result = Bit::Vector->new_Bin($self->{wlen}, (0 x $self->{wlen}));
362
363							for my $mask (@masks) {
364							# Create the bitstrings to compare.
365							my $bitmask = Bit::Vector->new_Bin($self->{wlen}, $mask);
366
367							# Get the union of the bit strings.
368							$result->Or($result, $bitmask);
369							}
370
371							# Return the "or sum."
372							return $result->to_Bin;
373							}
374
375							sub _reconstruct {
376							my ( $word, $masks, $open_separator, $close_separator ) = @_;
377
378							$open_separator = '<' unless defined $open_separator;
379							$close_separator = '>' unless defined $close_separator;
380
381							my $strings = [];
382							my $my_masks = [];
383
384							for my $mask (reverse sort @$masks) {
385							my $i = 0;
386							my $last = 0;
387							my $string = '';
388							for my $m ( split //, $mask ) {
389							if ( $m ) {
390							$string .= $open_separator unless $last;
391							$string .= substr( $word, $i, 1 );
392							$last = 1;
393							}
394							else {
395							$string .= $close_separator if $last;
396							$string .= substr( $word, $i, 1 );
397							$last = 0;
398							}
399							$i++;
400							}
401							$string .= $close_separator if $last;
402							push @$strings, $string;
403							push @$my_masks, $mask;
404							}
405
406							return $strings, $my_masks;
407							}
408
409							1;
410
411							__END__