|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lingua::Orthon;
  | 
| 
2
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
395360
 | 
 use 5.006;
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
    | 
| 
3
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
44
 | 
 use strict;
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
129
 | 
    | 
| 
4
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
25
 | 
 use warnings FATAL => 'all';
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
195
 | 
    | 
| 
5
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
188
 | 
 use Carp qw(croak);
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
335
 | 
    | 
| 
6
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
2728
 | 
 use List::AllUtils qw(any);
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
79667
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
433
 | 
    | 
| 
7
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
2625
 | 
 use Number::Misc qw(is_numeric);
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6833
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
334
 | 
    | 
| 
8
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
2580
 | 
 use Statistics::Lite qw(mean);
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8263
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
377
 | 
    | 
| 
9
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
2915
 | 
 use String::Util qw(hascontent nocontent);
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30037
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
419
 | 
    | 
| 
10
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
4097
 | 
 use Unicode::Collate;
  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46837
 | 
    | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8462
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 $Lingua::Orthon::VERSION = '0.03';
  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =pod
  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =encoding CP-1252
  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME
  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Lingua-Orthon - Orthographic similarity of string to one or more others by Coltheart's N and related measures
  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 VERSION
  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is documentation for B of Lingua::Orthon.
  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS
  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  use Lingua::Orthon 0.03;
  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $orthon = Lingua::Orthon->new();
  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $bool = $orthon->are_orthons('BANG', 'BARN'); # 0
  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $bool = $orthon->are_orthons('BANG', 'BONG'); # 1
  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $idx = $orthon->index_diff('BANK', 'BARK'); # 2
  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $count = $orthon->index_identical('BANG', 'BARN'); # 2
  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my (@diff) = $orthon->char_diff('BANG', 'BONG'); # (qw/A O/)
  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $count = $orthon->onc(
  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     test => 'BANG',
  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     sample => [qw/BAND COCO BING RANG BONG SONG/]); # 4
  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $aref = $orthon->list_orthons(
  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     test => 'BANG',
  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     sample => [qw/BAND COCO BING RANG BONG SONG/]); # BAND, BING, RANG, BONG
  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $count = $orthon->levenshtein('BANG', 'BARN'); # 2
  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $float = $orthon->old(
  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     test => 'BANG',
  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     sample => [qw/BAND COCO BING RANG BONG SONG/]); # ~= 1.67
  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION
  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Lingua-Orthon provides measures of similarity of character strings based on their orthographic identity, as relevant to psycholinguistic research. Case- and mark-sensitivity for determining character equality can be controlled. Wraps to Levenshtein Distance methods, extended to the OLD-20 metric, are provided for convenience of comparison. No methods are explicitly exported; all methods are called in the object-oriented way.
  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SUBROUTINES/METHODS
  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 new
  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $ortho = Lingua::Orthon->new();
  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Constructs/returns class object for accessing other methods.
  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Optionally, set the argument B to an integer value ranging from 0 to 3 to control case- and mark-sensitivity. See L.
  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {
  | 
| 
63
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
  
1
  
 | 
1673
 | 
     my ( $class, %args ) = @_;
  | 
| 
64
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     my $self = {};
  | 
| 
65
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
     bless $self, $class;
  | 
| 
66
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
     $self->set_eq( match_level => $args{'match_level'} );
  | 
| 
67
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1115
 | 
     return $self;
  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 are_orthons
  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $bool = $orthon->are_orthons('String1', 'String2');
  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns 0 or 1 (Coltheart's Boolean) if two given strings are orthographic neighbours by a 1-mismatch I: i.e., the strings are of the same size (are equal in character count) and there is only one discrepancy between them by a single substitution of a character in the same ordinal position (no additions, deletions or transpositions). So I and I are orthons by this measure (they differ only in the final letter), but I and I are not (the letter I is an I to I via I, or a I from I to I).
  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 I: If two identical letter strings are given (I, I), they are defined as I being orthons: the number of index identical characters must be at least one less than the length of the string(s).
  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 I: By default, identity is defined case-insensitively; e.g., I and I, and I and I are orthons. However, if B has been set (in L or L) to a higher level than 1 (or as undef or 0), then case is respected; e.g., I and I are orthons, but I and I are NOT orthons (they involve substituting both the Is and the second letters (I and I) ... but I and I, or I and I, are orthons. (This usefully applies to putting L|Lingua::Orthon/onc, coltheart_n> (the sum of single-substitution orthons a string has within a lexicon) to questions of the featural versus lexical basis of neighbourhood effects).
  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See Coltheart et al. (1977) (in L). The measure is computationally simple and economical, relative to other measures, such as based on a wider array of edit-types (e.g., Levenshtein Distance), that, while having greater explanatory power (Yarkoni et al., 2008), can tax resources on the order of days to effectively compute for a single string relative to a humanly memorable corpus.
  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub are_orthons {
  | 
| 
85
 | 
13
 | 
 
 | 
 
 | 
  
13
  
 | 
  
1
  
 | 
2836
 | 
     my ( $self, $w1, $w2 ) = @_;
  | 
| 
86
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
32
 | 
     return _are_orthons( $w1, $w2, $self->{'_EQ'} );
  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 index_identical
  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $count = $orthon->index_identical('String1', 'String2');
  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns a count: the number of letters that are identical and in the same serial position among two given letter-strings.
  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 For example, given I and I, 2 is returned for the two index-identical letters, I and I; I is in both strings, but it is ignored as it is the third letter in I but the fourth letter in I, and so not in the same serial position across the two words.
  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub index_identical {
  | 
| 
100
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
  
1
  
 | 
1418
 | 
     my ( $self, $w1, $w2 ) = @_;
  | 
| 
101
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     return _index_identical( $w1, $w2, $self->{'_EQ'} );
  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 index_diff
  | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $posint = $orthon->index_diff('String1', 'String2');
  | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Assuming the two strings are single-substitution orthons, returns the single index (anchored at zero) at which their letters differ.  So if the two strings are "bring" and "being", the returned value is 1.
  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub index_diff {
  | 
| 
113
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
425
 | 
     my ( $self, $w1, $w2 ) = @_;
  | 
| 
114
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     my $idx = 0;
  | 
| 
115
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
  | 
| 
116
 | 
7
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
701
 | 
         if ( not $self->{'_EQ'}->( map { substr $_, $i, 1 } ( $w1, $w2 ) ) ) {
  | 
| 
 
 | 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
    | 
| 
117
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
157
 | 
             $idx = $i;
  | 
| 
118
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             last;
  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }
  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
121
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
128
 | 
     return $idx;
  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 char_diff
  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  @ari = $orthon->char_diff('String1', 'String2');
  | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns a list of the first two characters (letters) that, reading from left to right, differ between two given strings. If the strings are single-substitution orthons, these are the characters that make them so. So if the two strings are "bring" and "being", the returned list is ('r', 'e') - the order of these characters in the returned list respecting the order of the given strings. The search across the strings terminates as soon there is a mismatch; otherwise, it continues only for as long as the length of the shortest string.
  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The identity match (or mismatch) is sensitive to the setting of the equality function per case and marks; see L.
  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub char_diff {
  | 
| 
135
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
  
1
  
 | 
926
 | 
     my ( $self, $w1, $w2 ) = @_;
  | 
| 
136
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     my @ds = ();
  | 
| 
137
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
  | 
| 
138
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1492
 | 
         my @tmp = map { substr $_, $i, 1 } ( $w1, $w2 );
  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
    | 
| 
139
 | 
13
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
23
 | 
         if ( not $self->{'_EQ'}->(@tmp) ) {
  | 
| 
140
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
264
 | 
             @ds = @tmp;
  | 
| 
141
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
             last;
  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }
  | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
144
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
282
 | 
     return @ds;
  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 onc, coltheart_n
  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $int = $orthon->onc(test => CHARSTR, sample => AREF); 
  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns the I (ONC), a.k.a. Coltheart's I: the number of single-letter substitution orthons a particular string has with respect to a list of strings (or "lexicon") (Coltheart et al., 1977). So I has two orthons (I and I) in the list (I, I, I and I).
  | 
| 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub onc {
  | 
| 
156
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
12
 | 
     my ( $self, %args ) = @_;
  | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $test_str =
  | 
| 
158
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       hascontent( $args{'test'} )
  | 
| 
159
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
5
 | 
       ? $args{'test'}
  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : croak 'Need a single character string to test for orthons';
  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $sample_aref =
  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ref $args{'sample'}
  | 
| 
163
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
20
 | 
       ? $args{'sample'}
  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : croak
  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       'Need a list (aref) of character-strings to sample for orthon listing';
  | 
| 
166
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     my $count = 0;
  | 
| 
167
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
    | 
| 
168
 | 
6
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
11
 | 
         if ( _are_orthons( $test_str, $sample_aref->[$_], $self->{'_EQ'} ) ) {
  | 
| 
169
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
             $count++;
  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }
  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
172
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     return $count;
  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 *coltheart_n = \&index_indentical;
  | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 list_orthons
  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $aref = $orthon->list_orthons(test => CHARSTR, sample => AREF);
  | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  
  | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns a reference to an array of single-substitution orthographic neighbours of a given B character-string that are among a given list of B character-strings. The referenced is to an empty array if no orthons are found. The order of items in the returned array follows that in which they appear in the B.
  | 
| 
181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub list_orthons {
  | 
| 
185
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
463
 | 
     my ( $self, %args ) = @_;
  | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $test_str =
  | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       hascontent( $args{'test'} )
  | 
| 
188
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4
 | 
       ? $args{'test'}
  | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : croak 'Need a single character string to test for orthons';
  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $sample_aref =
  | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ref $args{'sample'}
  | 
| 
192
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
13
 | 
       ? $args{'sample'}
  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : croak
  | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       'Need a list (aref) of character-strings to sample for orthon listing';
  | 
| 
195
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     my @orthon_list = ();
  | 
| 
196
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
197
 | 
6
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         if ( _are_orthons( $test_str, $sample_aref->[$_], $self->{'_EQ'} ) ) {
  | 
| 
198
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
             push @orthon_list, $sample_aref->[$_];
  | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }
  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
201
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     return \@orthon_list;
  | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
203
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 ldist, levenshtein
  | 
| 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $count = $orthon->ldist('String1', 'String2'); # minimal, strings will be lower-cased
  | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns the Levenshtein Distance between two given letter strings, wrapping to various Perl module's that more or less implement the Levenshtein algorithm for efficiency and case-sensitivity. Specifically, if the match level has been set at 1 (to ignore case and diacritics), the method uses L (which offers "ignoring diacritics"); otherwise, it uses L to ignore case but not marks (given present limitations of this module). The required case- and mark-sensitivity are set in the L or L methods. By default, the match is made case- and mark-Isensitively (by canned Perl L).
  | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub ldist {
  | 
| 
213
 | 
135
 | 
 
 | 
 
 | 
  
135
  
 | 
  
1
  
 | 
2902
 | 
     my ( $self, $w1, $w2 ) = @_;
  | 
| 
214
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
150
 | 
     my $ldist;
  | 
| 
215
 | 
135
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
233
 | 
     if ( $self->{'_MATCH_LEVEL'} == 1 ) {
  | 
| 
216
 | 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1378
 | 
         require Text::Levenshtein;
  | 
| 
217
 | 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1639
 | 
         $ldist =
  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           Text::Levenshtein::distance( $w1, $w2, { ignore_diacritics => 1 } )
  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           ;    # also ignores case
  | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {
  | 
| 
222
 | 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1133
 | 
         require Text::Levenshtein::XS;
  | 
| 
223
 | 
44
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1085
 | 
         if ( $self->{'_MATCH_LEVEL'} == 2 ) {
  | 
| 
224
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
             ( $w1, $w2 ) = map { lc } ( $w1, $w2 );  # ignore case but not marks
  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
225
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }
  | 
| 
226
 | 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
68
 | 
         $ldist = Text::Levenshtein::XS::distance( $w1, $w2 )
  | 
| 
227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           ;    # ignores nothing on its own
  | 
| 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
229
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
969881
 | 
     return $ldist;
  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 old
  | 
| 
233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $mean = $orthon->old(test => CHARSTR, sample => AREF, lim => INT);
  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns the mean orthographic Levenshtein distance (OLD) of the smallest B such edit distances for a given B string to all the strings in a B list. Based on Yarkoni et al. (2008), where, with the value of B is set to 20, the measure substantially contributes to prediction of performance in word recognition tasks. Here, if B is not defined, not numeric, or greater than the size of the B, then it is set by default to the size of the sample.
  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Levenshtein distance is calculated per the method L, wrapping to external modules with respect to the conditions of string equality set in L or L. Different settings lead to different speed of calculation. The slowest calculation (by far) occurs if B => 1 so that case- and mark-insensitive matching occurs; this relies on the pure Perl implementation in Text::Levenshtein with its argument B => 1. The fastest calculation (the default) occurs by setting B => 3, when exact characters are matched, e.g., I in the test-string and I in a sample-string at the same index across them are taken as unequal and so will count as a substitution. This relies on the C-implementation in Text::Levenshtein::XS. Ignore case but not marks with B => 2.
  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub old {
  | 
| 
243
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
1
  
 | 
769
 | 
     my ( $self, %args ) = @_;
  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $test_str =
  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       hascontent( $args{'test'} )
  | 
| 
246
 | 
3
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
15
 | 
       ? $args{'test'}
  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : croak 'Need a single character string to calculate OLD';
  | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $sample_aref =
  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ref $args{'sample'}
  | 
| 
250
 | 
3
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
57
 | 
       ? $args{'sample'}
  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : croak 'Need a list (aref) of character-strings to calculate OLD';
  | 
| 
252
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     my @ldists = ();
  | 
| 
253
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
    | 
| 
254
 | 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
246
 | 
         push @ldists, $self->ldist( $test_str, $sample_aref->[$_] );
  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $lim =
  | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ( is_numeric( $args{'lim'} ) and $args{'lim'} <= scalar @ldists )
  | 
| 
258
 | 
3
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
15
 | 
       ? $args{'lim'}
  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       : scalar @ldists;
  | 
| 
260
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
85
 | 
     return mean( ( sort { $a <=> $b } @ldists )[ 0 .. int $lim - 1 ] )
  | 
| 
 
 | 
380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
359
 | 
    | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ;    # mean of first/smallest $lim-th values
  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 set_eq
  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $orthon->set_eq(match_level => INT); # undef, 0, 1, 2 or 3
  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Sets the string-matching level used in the above methods. This is called implicitly in L when given a B, or with the default value of 0. This is adopted and slightly adapted from how L controls for case/diacritic-sensitive matching.
  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4
  | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item match_level = undef, 0
  | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Match with respect to case and diacritics: same as B<3> but simply by Perl's eq. So, e.g., I<éclair> and I would be taken as non-identical, just as would I and I.
  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is the fastest option. The higher levels, as follow, use the C() function in L.
  | 
| 
277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item match_level = 1
  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Match ignoring case and diacritics: I to I involves 1 edit (from I to I only)
  | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  
  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item match_level = 2
  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Match ignoring case but respect diacritics: "ber" to "BéZ" involves 2 edits (the "er" to "éZ") 
  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item match_level = 3
  | 
| 
287
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
288
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Match with respect to case and diacritics: "ber" to "BéZ" involves 3 edits (of all its letters)
  | 
| 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back
  | 
| 
291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 So, for example, if the test string is "abbé", it could be picked up as having the single-substitution orthographic neighbour "able" if the match level is 1, but not if it is 0, 2 or 3.
  | 
| 
293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub set_eq {
  | 
| 
297
 | 
12
 | 
 
 | 
 
 | 
  
12
  
 | 
  
1
  
 | 
846
 | 
     my ( $self, %args ) = @_;
  | 
| 
298
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
     my $match_level_arg = $args{'match_level'};
  | 
| 
299
 | 
12
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
63
 | 
     if ( nocontent($match_level_arg) or $match_level_arg == 0 ) {
  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
300
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $self->{'_MATCH_LEVEL'} = 0;
  | 
| 
301
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
         $self->{'_EQ'} = sub { return $_[0] eq $_[1] };
  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
302
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
303
 | 
22
 | 
 
 | 
 
 | 
  
22
  
 | 
 
 | 
383
 | 
     elsif ( any { $match_level_arg == $_ } ( 1 .. 3 ) ) {
  | 
| 
304
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
         $self->{'_MATCH_LEVEL'} = $match_level_arg;
  | 
| 
305
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
66
 | 
         my $collator = Unicode::Collate->new(
  | 
| 
306
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             normalization => undef,
  | 
| 
307
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             level         => $match_level_arg
  | 
| 
308
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );
  | 
| 
309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->{'_EQ'} = sub {
  | 
| 
310
 | 
134
 | 
 
 | 
 
 | 
  
134
  
 | 
 
 | 
260
 | 
             return $collator->eq(@_);
  | 
| 
311
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
440947
 | 
         };
  | 
| 
312
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {
  | 
| 
314
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         croak "Invalid value '$match_level_arg' given as a match level";
  | 
| 
315
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
316
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
     return;
  | 
| 
317
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
318
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
319
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # private methods
  | 
| 
320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _smallest_len {
  | 
| 
322
 | 
35
 | 
 
 | 
 
 | 
  
35
  
 | 
 
 | 
57
 | 
     my @strs = @_;
  | 
| 
323
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
62
 | 
     return ( sort { $a <=> $b } map { length } @strs )[0];
  | 
| 
 
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
117
 | 
    | 
| 
 
 | 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
155
 | 
    | 
| 
324
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _are_orthons {
  | 
| 
327
 | 
25
 | 
 
 | 
 
 | 
  
25
  
 | 
 
 | 
39
 | 
     my ( $w1, $w2, $eq_fn ) = @_;
  | 
| 
328
 | 
25
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
48
 | 
     return 0 if length $w1 != length $w2;
  | 
| 
329
 | 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
     return _index_identical( $w1, $w2, $eq_fn ) == ( length $w1 ) - 1;
  | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
331
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _index_identical {
  | 
| 
333
 | 
29
 | 
 
 | 
 
 | 
  
29
  
 | 
 
 | 
41
 | 
     my ( $w1, $w2, $eq_fn ) = @_;
  | 
| 
334
 | 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
32
 | 
     my $count = 0;
  | 
| 
335
 | 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
62
 | 
     for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
  | 
| 
336
 | 
114
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4851
 | 
         if ( $eq_fn->( map { substr $_, $i, 1 } ( $w1, $w2 ) ) ) {
  | 
| 
 
 | 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
410
 | 
    | 
| 
337
 | 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8479
 | 
             $count++;
  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }
  | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }
  | 
| 
340
 | 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2011
 | 
     return $count;
  | 
| 
341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }
  | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DIAGNOSTICS
  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4
  | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item Invalid value '...' given as a match level
  | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
349
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Argument B in new() or set_eq() needs to be an integer in range 0 .. 3, or undefined.
  | 
| 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
351
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item Need a single character string to test for orthons
  | 
| 
352
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Argument B for calculating ONC and OLD, and listing orthons, needs to be defined and not empty.
  | 
| 
354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item Need a single character string to test for orthons
  | 
| 
356
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
357
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Argument B should reference an array of character-strings when calculating ONC and OLD, and listing orthons.
  | 
| 
358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back
  | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 REFERENCES
  | 
| 
362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
363
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Coltheart, M., Davelaar, E., Jonasson, J. T., & Besner, D. (1977). Access to the internal lexicon. In S. Dornic (Ed.), I (Vol. 6, pp. 535-555). London, UK: Academic.
  | 
| 
364
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
365
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Yarkoni, T., Balota, D. A., & Yap, M. (2008). Moving beyond Coltheart's I: A new measure of orthographic similarity. I, I<15>, 971-979. doi: L<10.3758/PBR.15.5.971|http://dx.doi.org/10.3758/PBR.15.5.971>.
  | 
| 
366
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
367
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DEPENDENCIES
  | 
| 
368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
370
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
371
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
377
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
379
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
381
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
382
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
383
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 AUTHOR
  | 
| 
384
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
385
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Roderick Garton, C<<  >>
  | 
| 
386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SEE ALSO
  | 
| 
388
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
391
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
392
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
393
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
395
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 BUGS AND LIMITATIONS
  | 
| 
396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Please report any bugs or feature requests to C, or through
  | 
| 
398
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the web interface at L.  I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
  | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SUPPORT
  | 
| 
401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 You can find documentation for this module with the perldoc command.
  | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     perldoc Lingua::Orthon
  | 
| 
405
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
406
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 You can also look for information at:
  | 
| 
407
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
408
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4
  | 
| 
409
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
410
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * RT: CPAN's request tracker (report bugs here)
  | 
| 
411
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
414
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * AnnoCPAN: Annotated CPAN documentation
  | 
| 
415
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
416
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
417
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
418
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * CPAN Ratings
  | 
| 
419
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
420
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * Search CPAN
  | 
| 
423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
424
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L
  | 
| 
425
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
426
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back
  | 
| 
427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
428
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 LICENSE AND COPYRIGHT
  | 
| 
429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Copyright 2011-2018 Roderick Garton.
  | 
| 
431
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
432
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This program is free software; you can redistribute it and/or modify it
  | 
| 
433
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 under the terms of either: the GNU General Public License as published
  | 
| 
434
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 by the Free Software Foundation; or the Artistic License.
  | 
| 
435
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
436
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L for more information.
  | 
| 
437
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
438
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut
  | 
| 
439
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
  | 
| 
440
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;    # End of Lingua::Orthon
  |