File Coverage

blib/lib/Lingua/Orthon.pm

Criterion	Covered	Total	%
statement	68	78	87.1
branch	22	34	64.7
condition	3	6	50.0
subroutine	13	16	81.2
pod	8	8	100.0
total	114	142	80.2

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Orthon;
2	2			2		48679	use Math::MatrixReal;
	2					74194
	2					145
3	2			2		2096	use List::AllUtils qw(mesh);
	2					6196
	2					197
4	2			2		38	use 5.006;
	2					13
	2					72
5	2			2		12	use strict;
	2					4
	2					60
6	2			2		11	use warnings;
	2					4
	2					2794
7							our $VERSION = '0.02';
8
9							=pod
10
11							=head1 NAME
12
13							Lingua-Orthon - Various measures of orthographic relatedness between two letter strings
14
15							=head1 VERSION
16
17							Version 0.02
18
19							=head1 SYNOPSIS
20
21							use Lingua::Orthon 0.02;
22							my $ortho = Lingua::Orthon->new();
23							my $n = $ortho->index_identical('BANG', 'BARN');
24
25							=head1 DESCRIPTION
26
27							Lingua-Orthon - Various measures of orthographic relatedness between two letter strings
28
29							=head1 METHODS
30
31							=head2 new
32
33							my $ortho = Lingua::Orthon->new();
34
35							Constructs/returns class object for accessing and passing params to other methods.
36
37							=cut
38
39							sub new {
40	1			1	1	11	my ($class, %attribs) = @_;
41	1					2	my $self = {};
42	1					3	bless $self, $class;
43							# $self->{'_proddat'} = _productivity_dat();
44							# $self->{'_frqdat'} = _frequency_dat();
45	1					7	my @lets = ('A'..'Z');
46	1					3	my @nums = (0 .. 25);
47	1					36	$self->{'_alphahash'} = { mesh(@lets, @nums ) };
48	1					19	return $self;
49							}
50
51							=head3 are_orthons
52
53							Returns 0 or 1 if the two strings qualify as 1-mismatch (Coltheart-type) orthons: same size, and only one discrepancy by substitution (no additions, deletions or transpositions).
54
55							=cut
56
57							sub are_orthons {
58	0			0	1	0	my $self = shift;
59	0	0				0	return 0 if length($_[0]) != length($_[1]);
60	0	0				0	return index_identical($self, $_[0], $_[1]) == length($_[0]) - 1 ? 1 : 0;
61							}
62
63							=head3 index_identical
64
65							$val = $orthon->index_identical($w1, $w2);
66
67							Returns the number of letters that are both identical and in the same serial position. So for BANG and BARN, 2 would be returned, for B and A, ignoring the common N as it is the third letter in BANG, the fourth letter in BARN, and so not in the same serial position across these two words.
68
69							=cut
70
71							sub index_identical { # "Coltheart" orthons
72	4			4	1	1317	my $self = shift;
73
74	4					7	my ($w1, $w2, $n, $i, $j) = (shift, shift, 0); # BENCHMARK: ~10%-25% faster than by list and separate decs
75	4					13	for ($i = 0; $i < length($w1); $i++) {
76	18	100	100			90	$n++ if substr($w1, $i, 1) eq (substr($w2, $i, 1) or last); # BENCHMARK: ~10%-20% faster than or by \|\|
77							# run the length of the second word and see how many common letters anyway, and how many index positions apart:
78							#my @cmn = ();
79							#for ($j = 0; $i < length($w2); $j++) {
80							# if ( substr($w1, $i, 1) eq (substr($w2, $j, 1) or last) ) {
81							#
82							# }
83							#}
84							}
85	4					11	return $n;
86							}
87
88							=head3 hdist
89
90							$val = $orthon->hdist('String1', 'String2');
91
92							Return the Hamming Distance between two letter strings.
93
94							=cut
95
96							sub hdist {
97	1			1	1	492	shift(@_);
98							#String length is assumed to be equal
99	1					6	return ($_[0] ^ $_[1]) =~ tr/\001-\255//; # thanks to: http://www.perlmonks.org/?node_id=500235
100							#return length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] );
101							}
102
103							=head3 ldist
104
105							$val = $orthon->ldist('String1', 'String2');
106
107							Return the Levenshtein Distance between two letter strings.
108
109							=cut
110
111							sub ldist {
112	3			3	1	1033	my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
113
114	3					4	my ($identity_cost, $addition_cost, $deletion_cost, $substitution_cost, $permutation_cost) = (0, 1, 1, 1, 1);
115	3					8	my $refc = [$identity_cost, $deletion_cost, $addition_cost, $substitution_cost];
116
117	3	50				7	return 0 if $w1 eq $w2; # Zero for total equality
118
119							# The length of the other string if no length to this one:
120	3					5	my $n1 = length($w1);
121	3					4	my $n2 = length($w2);
122	3	50				7	return $addition_cost * $n2 if !$n1; # $w2 is a complete addition to $w1
123	3	50				7	return $deletion_cost * $n1 if !$n2; # $w2 is a complete deletion of $w1
124
125	3					8	my $d_matrix = _load_matrix_weighted($self, $w1, $w2, $n1, $n2, $refc);
126	3	50				13	return ref $d_matrix ? $d_matrix->element($n1+1, $n2+1) : $d_matrix;
127
128							# add 1 to every element in the matrix: passed the element, the row index and the column index, i
129							#$matrix = $matrix->each ( sub { (shift) + 1 } );
130							#$new_matrix = $matrix->each_diag( \&function );
131
132							}
133
134							=head3 len_maxseq
135
136							$val = $orthon->len_maxseq('String1', 'String2');
137
138							Return the length of the longest common subsequence between two letter strings. This subsequence is found by the C method in L.
139
140							=cut
141
142							sub len_maxseq {
143	1			1	1	325	my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
144	1					922	require String::LCSS_XS;
145	1					652	my @lcs = String::LCSS_XS::lcss($w1, $w2);
146	1		50			6	return length($lcs[0]) \|\| 0;
147							}
148
149							=head3 unique_abbrevs
150
151							$val = $orthon->unique_abbrevs('String1', 'String2');
152
153							Return the number of unique abbreviations that can be made between two letter strings. This subsequence is found by the C method in L.
154
155							=cut
156
157							sub unique_abbrevs {
158	0			0	1	0	my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
159	0					0	require Text::Abbrev;
160	0					0	my $href = Text::Abbrev::abbrev($w1, $w2);
161	0		0			0	return scalar(keys(%{$href})) \|\| 0;
162							}
163
164							=head3 myers_ukkonen
165
166							$val = $orthon->myers_ukkonen('String1', 'String2');
167
168							Return the Myers-Ukkonen distance between two letter strings, as found by the C method in L.
169
170							=cut
171
172							sub myers_ukkonen {
173	0			0	1	0	my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
174	0					0	require String::Similarity;
175	0					0	return String::Similarity::similarity($w1, $w2);
176							}
177
178							# private methods
179
180							sub _load_matrix_weighted { # without early out-clauses:
181	3			3		5	my ($self, $w1, $w2, $n1, $n2, $refc) = @_;
182
183	3					16	my $d_matrix = new Math::MatrixReal($n1+1, $n2+1);
184
185	3					143	$d_matrix->assign(1, 1, 0);
186
187	3					118	foreach (1 .. $n1) {$d_matrix->assign($_+1,1, $_*$refc->[1]);}
	20					256
188	3					35	foreach (1 .. $n2) {$d_matrix->assign(1,$_+1,$_*$refc->[1]);}
	20					203
189
190	3					34	for my $i (1 .. $n1) {
191	20					230	my $w1_i = substr($w1, $i-1, 1);
192							#print "before one inner row:\n", $d_matrix, "\n";
193	20					28	for my $j(1 .. $n2) {
194	134					2124	my $w2_i = substr($w2, $j-1, 1);
195	134					542	$d_matrix->assign($i+1, $j+1, # starts at column 2 row 2
196							&_min(
197							$d_matrix->element($i, $j+1) + _weight($self, $w1_i,'-' . $w2_i, $refc), # deletion
198							$d_matrix->element($i+1, $j) + _weight($self, '-' . $w1_i, $w2_i, $refc), # addition
199							$d_matrix->element($i, $j) + _weight($self, $w1_i, $w2_i, $refc) # substitution
200							)
201							);
202							}
203							#print "after one inner row:\n", $d_matrix, "\n";
204							}
205							#print "final:\n", $d_matrix, "\n";
206	3					39	return $d_matrix;
207							}
208
209							sub _weight {
210							#the cost function
211	402			402		3859	my ($self, $w1_i, $w2_i, $refc) = @_;
212	402					586	my $type = '_frqdat';
213	402					576	my $apply = 0;
214
215	402	100				1655	if ($w1_i eq $w2_i) { #print "identity $w1_i $w2_i\n";
		100
		100
216	17	50				45	return $apply ? ($refc->[0]+1) * $self->{$type}->[$self->{'_alphahash'}->{$w1_i}] : 0; # cost for letter match
217							}
218							elsif ($w2_i =~ s/^\-//) {#print "deletion $w1_i $w2_i\n";
219							# inversion of free-production for the lost letter: loss of the more familiar = less competition; makes the cost less, so "closer" distance
220	134	50				195	my $prod = $apply ? $self->{$type}->[$self->{'_alphahash'}->{$w2_i}] : 1;
221	134					500	return $refc->[1] * 1/$prod; # cost for deletion
222							}
223							elsif ($w1_i =~ s/^\-//) { #print "addition $w1_i $w2_i\n";
224	134	50				179	my $prod = $apply ? $self->{$type}->[$self->{'_alphahash'}->{$w1_i}] : 1;
225	134					3729	return $refc->[2] * 1/$prod; # cost for addition
226							}
227							else { #print "mismatch $w1_i $w2_i\n";# $w1_i ne $w2_i
228	117	50				150	my $prod = $apply ? $self->{$type}->[$self->{'_alphahash'}->{$w2_i}] / $self->{$type}->[$self->{'_alphahash'}->{$w1_i}] : 1;
229	117					514	return $refc->[3] * $prod; # cost for letter mismatch
230							}
231							}
232
233							sub _min {
234	134	100		134		663	return $_[0] < $_[1]
		100
		100
235							? $_[0] < $_[2] ? $_[0] : $_[2]
236							: $_[1] < $_[2] ? $_[1] : $_[2];
237							}
238
239							=head1 AUTHOR
240
241							Roderick Garton, C<< >>
242
243							=head1 BUGS
244
245							Please report any bugs or feature requests to C, or through
246							the web interface at L. I will be notified, and then you'll
247							automatically be notified of progress on your bug as I make changes.
248
249							=head1 SUPPORT
250
251							You can find documentation for this module with the perldoc command.
252
253							perldoc Lingua::Orthon
254
255
256							You can also look for information at:
257
258							=over 4
259
260							=item * RT: CPAN's request tracker (report bugs here)
261
262							L
263
264							=item * AnnoCPAN: Annotated CPAN documentation
265
266							L
267
268							=item * CPAN Ratings
269
270							L
271
272							=item * Search CPAN
273
274							L
275
276							=back
277
278							=head1 LICENSE AND COPYRIGHT
279
280							Copyright 2011-2012 Roderick Garton.
281
282							This program is free software; you can redistribute it and/or modify it
283							under the terms of either: the GNU General Public License as published
284							by the Free Software Foundation; or the Artistic License.
285
286							See http://dev.perl.org/licenses/ for more information.
287
288							=cut
289
290							__DATA__