line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Norms::SUBTLEX;
|
2
|
8
|
|
|
8
|
|
216806
|
use 5.006;
|
|
8
|
|
|
|
|
23
|
|
|
8
|
|
|
|
|
277
|
|
3
|
8
|
|
|
8
|
|
36
|
use strict;
|
|
8
|
|
|
|
|
10
|
|
|
8
|
|
|
|
|
274
|
|
4
|
8
|
|
|
8
|
|
43
|
use warnings FATAL => 'all';
|
|
8
|
|
|
|
|
18
|
|
|
8
|
|
|
|
|
329
|
|
5
|
8
|
|
|
8
|
|
32
|
use Config;
|
|
8
|
|
|
|
|
9
|
|
|
8
|
|
|
|
|
297
|
|
6
|
8
|
|
|
8
|
|
32
|
use Carp qw(carp croak);
|
|
8
|
|
|
|
|
13
|
|
|
8
|
|
|
|
|
387
|
|
7
|
8
|
|
|
8
|
|
4010
|
use English '-no_match_vars';
|
|
8
|
|
|
|
|
23463
|
|
|
8
|
|
|
|
|
35
|
|
8
|
8
|
|
|
8
|
|
6311
|
use File::Slurp qw(read_dir);
|
|
8
|
|
|
|
|
76935
|
|
|
8
|
|
|
|
|
474
|
|
9
|
8
|
|
|
8
|
|
55
|
use File::Spec;
|
|
8
|
|
|
|
|
9
|
|
|
8
|
|
|
|
|
137
|
|
10
|
8
|
|
|
8
|
|
4026
|
use List::AllUtils qw(none);
|
|
8
|
|
|
|
|
84164
|
|
|
8
|
|
|
|
|
661
|
|
11
|
8
|
|
|
8
|
|
2663
|
use Statistics::Lite qw(max mean median stddev);
|
|
8
|
|
|
|
|
6883
|
|
|
8
|
|
|
|
|
553
|
|
12
|
8
|
|
|
8
|
|
4006
|
use String::Util qw(hascontent nocontent);
|
|
8
|
|
|
|
|
31749
|
|
|
8
|
|
|
|
|
597
|
|
13
|
8
|
|
|
8
|
|
3885
|
use Text::CSV::Separator qw(get_separator);
|
|
8
|
|
|
|
|
12772
|
|
|
8
|
|
|
|
|
452
|
|
14
|
8
|
|
|
8
|
|
3777
|
use Readonly;
|
|
8
|
|
|
|
|
18589
|
|
|
8
|
|
|
|
|
26064
|
|
15
|
|
|
|
|
|
|
Readonly my $YARKONI_MAX => 20;
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
$Lingua::Norms::SUBTLEX::VERSION = '0.05';
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 NAME
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
Lingua::Norms::SUBTLEX - Retrieve frequency values and frequency-based lists for words from Subtitles Corpora
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 VERSION
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
Version 0.05
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
use feature qw(say);
|
30
|
|
|
|
|
|
|
use Lingua::Norms::SUBTLEX;
|
31
|
|
|
|
|
|
|
my $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US'); # or NL, UK, DE
|
32
|
|
|
|
|
|
|
my $bool = $subtlex->is_normed(string => 'fuip'); # isa_word ?
|
33
|
|
|
|
|
|
|
my $frq = $subtlex->frq_opm(string => 'frog'); # freq. per million, or get log/zipf
|
34
|
|
|
|
|
|
|
my $href = $subtlex->freqhash(words => [qw/frog fish ape/]); # freqs. for a list of words
|
35
|
|
|
|
|
|
|
say "$_ freq per mill = $href->{$_}" for keys %{$href};
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# stats, parts-of-speech, orthographic relations:
|
38
|
|
|
|
|
|
|
say "mean freq per mill = ", $subtlex->mean_frq(words => [qw/frog fish ape/]); # or median, std-dev.
|
39
|
|
|
|
|
|
|
say "frog part-of-speech = ", $subtlex->pos(string => 'frog');
|
40
|
|
|
|
|
|
|
my ($count, $orthons_aref) = $subtlex->on_count(string => 'frog'); # or scalar context for count only; or freq_max/mean
|
41
|
|
|
|
|
|
|
say "orthon of frog = $_" for @{$orthons_aref}; # e.g., from
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# retrieve (list of) words to certain specs:
|
44
|
|
|
|
|
|
|
my $aref = $subtlex->list_words(freq => [2, 400], onc => [1,], length => [4, 4], cv_pattern => 'CCVC', regex => '^f');
|
45
|
|
|
|
|
|
|
my $string = $subltex->random_word();
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The module facilitates access to raw data and descriptive statistics on word-frequency and parts-of-speech, as provided in the SUBTLEX-DE, SUBTLEX-NL, SUBTLEX-UK and SUBTLEX-US databases (see L). For example, the SUBTLEX-US database is based on a study of 74,286 letter-strings, with frequencies of occurrence within a corpus of some 30 million words from the subtitles of 8,388 film and television episodes. The frequency data obtained in this way have been shown to offer more psychologically predictive measures than those derived from books, newsgroup posts, and similar.
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
There are three groups of retrievable stats and sampling rules: (1) frequency; (2)contextual diversity (number of films/episodes appeared in); and (3) parts-of-speech. Depending on the source language, frequency is given as a count (L), occurrences per million (L), logarithm of the opm (L), and/or 7-point scaled (L); contextual diversity is given as a count (L), a percentage (L), or a logarithm (L). For parts-of-speech, L returns a string giving the dominant part. Sampling is given by the same labels, with keys with min/max values (or a whitelist of acceptable parts-of-speech).
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
A small sample from each of the databases is included in the installation distribution for testing purposes. The complete files need to be downloaded via the following URLs. The local directory location or actual pathname of these files can be given in class construction (by the arguments B and B); otherwise the default location--the directory "SUBTLEX" alongside the module itself in the locally configured Perl sitelib--will be used, and the correct file determined by inclusion of B value within its filename. The filenames of the original files downloaded from the following sites are supported in this way, and it does not matter if (as varies between the files) the fields are comma-separated or tab-delimited.
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
The three databases (comprised of one file per language) do not provide values for all methods. All three provide values for only the methods frq_count, cd_count, cd_pct, and pos. Further details of unsupported methods per database/lang are given below.
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=over 4
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item SUBTLEX-US
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
For the B norms, install the file "SUBTLEXusExcel2007.csv" from L. All methods are supported by this database.
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=item SUBTLEX-UK
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
For the B norms, install the file "SUBTLEX-UK.txt" from within the "SUBTLEX-UK.zip" archive via L. This database does not define values for occurrences per million (or log occurrences per million); the methods for these stats will return an empty string.
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item SUBTLEX-NL
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
For the B norms, install the file "SUBTLEX-NL.with-pos.txt" from within the archive "SUBTLEX-NL.with-pos.txt.zip" via L. This database does not define a value for Zipf frequency, so the "zipf" method will return an empty string if called with NL as the "lang".
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item SUBTLEX-DE
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
For the B norms, dowload the file "SUBTLEX-DE_cleaned_with_Google00.txt" via L. There is no CD, POS or Zipf data at this point, so only the "frq_" methods, and the "on_" methods (based on realtime calculation work with this language. The file contains other information, including Google-based frequencies, for which this module does not provide retrieval at this time.
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=back
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
There are several other languages from this project which might be supported by this module in a later version (originally, only SUBTLEX-US was supported).
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
All methods are called via the class object, and with named (hash of) arguments, usually B, where relevant.
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head2 new
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
$subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US'); # or 'UK', 'NL', 'DE' - looking in Perl sitelib
|
86
|
|
|
|
|
|
|
$subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US', dir => 'file_location'); # where to look
|
87
|
|
|
|
|
|
|
$subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US', path => 'actual_file');
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Returns a class object for accessing other methods. The parameter B should be set to specify the particular language database: DE (German), NL (Dutch), UK (British) or US (American); otherwise US (being the first published in the series) is the default. Optional arguments B or B can be given to specify the location or actual file (respectively) of the database. The default location is within the "Lingua/Norms/SUBTLEX" directory within the 'sitelib' configured for the local Perl installation (as per L). The method will C if the given B or default location cannot be found.
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub new {
|
94
|
9
|
|
|
9
|
1
|
3235
|
my ( $class, %args ) = @_;
|
95
|
9
|
50
|
|
|
|
38
|
my $self = bless {}, ref($class) ? ref($class) : $class;
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# determine database location:
|
98
|
9
|
100
|
|
|
|
31
|
my $lang = $args{'lang'} ? delete $args{'lang'} : 'US';
|
99
|
9
|
|
|
|
|
4588
|
my $mod_dir =
|
100
|
|
|
|
|
|
|
File::Spec->catdir( $Config{'sitelib'}, qw/Lingua Norms SUBTLEX/ );
|
101
|
9
|
50
|
|
|
|
16945
|
if ( $args{'path'} ) { # -by specific arg:
|
102
|
9
|
|
|
|
|
47
|
$self->{'path'} = $args{'path'};
|
103
|
|
|
|
|
|
|
}
|
104
|
|
|
|
|
|
|
else { # by dir and lang args:
|
105
|
0
|
|
|
|
|
0
|
my $dir;
|
106
|
0
|
0
|
|
|
|
0
|
if ( $args{'dir'} ) { # check it's a dir:
|
107
|
0
|
0
|
|
|
|
0
|
croak "Value for argument 'dir' ($args{'dir'}) is not a directory"
|
108
|
|
|
|
|
|
|
if !-d $args{'dir'};
|
109
|
0
|
|
|
|
|
0
|
$dir = delete $args{'dir'};
|
110
|
|
|
|
|
|
|
}
|
111
|
|
|
|
|
|
|
else { # use module's dir :
|
112
|
0
|
|
|
|
|
0
|
$dir = $mod_dir;
|
113
|
|
|
|
|
|
|
}
|
114
|
0
|
|
|
|
|
0
|
for ( read_dir($dir) ) {
|
115
|
0
|
0
|
|
|
|
0
|
if (/(?:SUBTLEX\-)?\Q$lang/imsx) {
|
116
|
0
|
|
|
|
|
0
|
$self->{'path'} = File::Spec->catfile( $dir, $_ );
|
117
|
0
|
|
|
|
|
0
|
last;
|
118
|
|
|
|
|
|
|
}
|
119
|
|
|
|
|
|
|
}
|
120
|
|
|
|
|
|
|
}
|
121
|
9
|
50
|
|
|
|
47
|
croak "Cannot find required database for language $lang"
|
122
|
|
|
|
|
|
|
if nocontent( $self->{'path'} ); # or !-T $self->{'path'};
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# determine delimiter:
|
125
|
9
|
|
|
|
|
151
|
$self->{'delim'} = get_separator( path => $self->{'path'}, lucky => 1 );
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# identify needed field indices within this file:
|
128
|
|
|
|
|
|
|
# - arg 'fieldpath' only used for testing, assumed not useful for user:
|
129
|
9
|
|
|
|
|
1694
|
my %fields = ( US => 1, UK => 2, NL => 3, DE => 4 );
|
130
|
9
|
50
|
|
|
|
35
|
croak 'Cannot determine fields for given language'
|
131
|
|
|
|
|
|
|
if nocontent( $fields{$lang} );
|
132
|
9
|
50
|
|
|
|
125
|
my $fieldpath =
|
133
|
|
|
|
|
|
|
$args{'fieldpath'}
|
134
|
|
|
|
|
|
|
? $args{'fieldpath'}
|
135
|
|
|
|
|
|
|
: File::Spec->catfile( $mod_dir, 'fields.csv' );
|
136
|
9
|
50
|
|
|
|
338
|
open( my $fh, q{<}, $fieldpath ) or carp 'Cannot determine field indices';
|
137
|
9
|
|
|
|
|
90
|
while (<$fh>) {
|
138
|
117
|
100
|
|
|
|
850
|
next if $INPUT_LINE_NUMBER == 1;
|
139
|
108
|
|
|
|
|
90
|
chomp;
|
140
|
108
|
|
|
|
|
239
|
my @dat = split m/[,]/msx;
|
141
|
108
|
50
|
|
|
|
209
|
$self->{'IDX'}->{ $dat[0] } =
|
142
|
|
|
|
|
|
|
hascontent( $dat[ $fields{$lang} ] ) ? $dat[ $fields{$lang} ] : q{};
|
143
|
|
|
|
|
|
|
}
|
144
|
9
|
50
|
|
|
|
149
|
close $fh or croak $OS_ERROR;
|
145
|
9
|
|
|
|
|
53
|
return $self;
|
146
|
|
|
|
|
|
|
}
|
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=head2 Frequencies and POS for individual words or word-lists
|
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head3 is_normed
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
$bool = $subtlex->is_normed(string => $word);
|
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
I: isa_word
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Returns a boolean value to specify whether or not the letter-string passed as B is represented in the SUBTLEX corpus. This might be thought of as a lexical decision ("is this string a word?") but note that some very low frequency letter-strings in the corpus would not be considered words in the average context (perhaps, in part, because of misspelt subtitles).
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=cut
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
sub is_normed {
|
161
|
2
|
|
|
2
|
1
|
11
|
my ( $self, %args ) = @_;
|
162
|
2
|
50
|
|
|
|
8
|
croak 'No string to test; pass a string to the function'
|
163
|
|
|
|
|
|
|
if nocontent( $args{'string'} );
|
164
|
2
|
|
|
|
|
15
|
my $str = $args{'string'};
|
165
|
2
|
|
|
|
|
2
|
my $res = 0; # boolean to return from this sub
|
166
|
2
|
50
|
|
|
|
54
|
open my $fh, q{<}, $self->{'path'} or croak $OS_ERROR;
|
167
|
2
|
|
|
|
|
17
|
while (<$fh>) {
|
168
|
35
|
100
|
|
|
|
46
|
next if $INPUT_LINE_NUMBER == 1; # skip headings
|
169
|
33
|
50
|
|
|
|
108
|
/^([^\Q$self->{'delim'}\E]+)/msx
|
170
|
|
|
|
|
|
|
or next; # isolate first token ahead of delimiter as $1 in this file
|
171
|
33
|
100
|
|
|
|
96
|
if ( $str eq $1 ) { # first token equals given string?
|
172
|
1
|
|
|
|
|
2
|
$res = 1; # set result to return as true
|
173
|
1
|
|
|
|
|
2
|
last; # got it, so abort look-up
|
174
|
|
|
|
|
|
|
}
|
175
|
|
|
|
|
|
|
}
|
176
|
2
|
50
|
|
|
|
14
|
close $fh or croak $OS_ERROR;
|
177
|
2
|
|
|
|
|
14
|
return $res; # zero if string not found in file
|
178
|
|
|
|
|
|
|
}
|
179
|
|
|
|
|
|
|
*isa_word = \&is_normed;
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head3 frq_count
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
$val = $subtlex->frq_count(string => 'aword');
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Returns the raw number of occurrences in all the films/TV episodes for the word passed as B, or the empty-string if the string is not represented in the norms.
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=cut
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub frq_count {
|
190
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
191
|
0
|
|
|
|
|
0
|
return _get_fieldvalue( $self, $args{'string'},
|
192
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_count'} );
|
193
|
|
|
|
|
|
|
}
|
194
|
|
|
|
|
|
|
*freq = \&frq_opm;
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head3 frq_opm
|
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
$val = $subtlex->frq_opm(string => 'aword');
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
I: opm
|
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Returns frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=cut
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub frq_opm {
|
207
|
2
|
|
|
2
|
1
|
351
|
my ( $self, %args ) = @_;
|
208
|
2
|
|
|
|
|
11
|
return _get_fieldvalue( $self, $args{'string'},
|
209
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_opm'} );
|
210
|
|
|
|
|
|
|
}
|
211
|
|
|
|
|
|
|
*freq = \&frq_opm; # legacy only
|
212
|
|
|
|
|
|
|
*opm = \&frq_opm;
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head3 frq_log
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
$val = $subtlex->frq_log(string => 'aword');
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Returns log frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=cut
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub frq_log {
|
223
|
2
|
|
|
2
|
1
|
7
|
my ( $self, %args ) = @_;
|
224
|
2
|
|
|
|
|
7
|
return _get_fieldvalue( $self, $args{'string'},
|
225
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_log'} );
|
226
|
|
|
|
|
|
|
}
|
227
|
|
|
|
|
|
|
*lfreq = \&frq_log; # legacy only
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head3 frq_zipf
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
$val = $subtlex->frq_zipf(string => 'aword');
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
Returns zipf frequency for the word passed as B, or the empty-string if the string is not represented in the norms. The Zipf scale ranges from 1 to 7, with values of 1-3 representing low frequency words, and values of 4-7 representing high frequency words. See Van Heuven et al. (2014) and L.
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=cut
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub frq_zipf {
|
238
|
2
|
|
|
2
|
1
|
7
|
my ( $self, %args ) = @_;
|
239
|
2
|
|
|
|
|
7
|
return _get_fieldvalue( $self, $args{'string'},
|
240
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_zipf'} );
|
241
|
|
|
|
|
|
|
}
|
242
|
|
|
|
|
|
|
*zipf = \&frq_zipf; # legacy only
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=head3 cd_count
|
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
$cd = $subtlex->cd_count(string => 'aword');
|
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Corresponds to the column labelled "CDcount" in the datafile.
|
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=cut
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
sub cd_count {
|
253
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
254
|
0
|
|
|
|
|
0
|
return _get_fieldvalue( $self, $args{'string'},
|
255
|
|
|
|
|
|
|
$self->{'IDX'}->{'cd_count'} );
|
256
|
|
|
|
|
|
|
}
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head3 cd_pct
|
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
$cd = $subtlex->cd_pct(string => 'aword');
|
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Returns a percentage measure to two decimal places of the number of films/TV episodes in which the given string was included in its subtitles. This corresponds to the measure "SUBTLCD" described in Brysbaert and New (2009). Note: where "cd" stands for "contextual diversity."
|
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=cut
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub cd_pct {
|
267
|
2
|
|
|
2
|
1
|
7
|
my ( $self, %args ) = @_;
|
268
|
2
|
|
|
|
|
9
|
return _get_fieldvalue( $self, $args{'string'},
|
269
|
|
|
|
|
|
|
$self->{'IDX'}->{'cd_pct'} );
|
270
|
|
|
|
|
|
|
}
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=head3 cd_log
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
Returns log10(L + 1) for the given string, with 4-digit precision. This corresponds to the measure "Lg10CD" described in Brysbaert and New (2009), where it is stated that "this is the best value to use if one wants to match words on word frequency" (p. 988). Note: "cd" stands for "contextual diversity," which is based on the number of films and TV episodes in which the string was represented.
|
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=cut
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
sub cd_log {
|
279
|
2
|
|
|
2
|
1
|
6
|
my ( $self, %args ) = @_;
|
280
|
2
|
|
|
|
|
9
|
return _get_fieldvalue( $self, $args{'string'},
|
281
|
|
|
|
|
|
|
$self->{'IDX'}->{'cd_log'} );
|
282
|
|
|
|
|
|
|
}
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=head3 frq_hash
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$href = $subtlex->frq_hash(strings => [qw/word1 word2/], scale => opm|log|zipf);
|
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Returns frequency as values within a reference to a hash keyed by the words passed as B. By default, the values in the hash are corpus frequency per million. If the optional argument B is defined, and it equals I, then the values are log-frequency; similarly, I yields zipf-frequency. Note, however, that some databases do not support all types of scales; in which case the returned value will be the empty string.
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub frq_hash {
|
293
|
8
|
|
|
8
|
1
|
2204
|
my ( $self, %args ) = @_;
|
294
|
8
|
50
|
|
|
|
23
|
croak
|
295
|
|
|
|
|
|
|
'No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array'
|
296
|
|
|
|
|
|
|
if !$args{'strings'};
|
297
|
8
|
50
|
|
|
|
20
|
my $strs =
|
298
|
|
|
|
|
|
|
ref $args{'strings'}
|
299
|
|
|
|
|
|
|
? $args{'strings'}
|
300
|
|
|
|
|
|
|
: croak 'No reference to an array of letter-strings found';
|
301
|
8
|
100
|
|
|
|
26
|
my $col_i =
|
302
|
|
|
|
|
|
|
hascontent( $args{'scale'} )
|
303
|
|
|
|
|
|
|
? $self->{'IDX'}->{ $args{'scale'} }
|
304
|
|
|
|
|
|
|
: $self->{'IDX'}->{'frq_opm'};
|
305
|
21
|
|
|
|
|
57
|
my %frq = map { lc($_) => [ undef, $_ ] }
|
|
8
|
|
|
|
|
16
|
|
306
|
8
|
|
|
|
|
63
|
@{$strs}; # keep lower-case to search associated with original case
|
307
|
8
|
50
|
|
|
|
282
|
open my $fh, q{<}, $self->{'path'} or croak "$OS_ERROR\n";
|
308
|
8
|
|
|
|
|
74
|
while (<$fh>) {
|
309
|
240
|
100
|
|
|
|
1417
|
next if $INPUT_LINE_NUMBER == 1;
|
310
|
232
|
|
|
|
|
185
|
chomp;
|
311
|
232
|
|
|
|
|
1089
|
my @line = split m/\Q$self->{'delim'}\E/msx;
|
312
|
232
|
100
|
|
|
|
599
|
if ( hascontent( $frq{ lc( $line[0] ) } ) ) {
|
313
|
21
|
|
|
|
|
209
|
$frq{ lc( $line[0] ) }->[0] = $line[$col_i];
|
314
|
|
|
|
|
|
|
}
|
315
|
|
|
|
|
|
|
}
|
316
|
8
|
50
|
|
|
|
153
|
close $fh or croak $OS_ERROR;
|
317
|
8
|
|
|
|
|
17
|
return { map { $_->[1] => _nummify($_->[0]) }
|
|
21
|
|
|
|
|
34
|
|
318
|
|
|
|
|
|
|
values %frq }; # assign values to original case strings
|
319
|
|
|
|
|
|
|
}
|
320
|
|
|
|
|
|
|
*freqhash = \&frq_hash;
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
=head3 pos
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
$pos_str = $subtlex->pos(string => 'aword');
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
Returns part-of-speech string for a given word. The return value is undefined if the word is not found.
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=cut
|
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
sub pos {
|
331
|
1
|
|
|
1
|
1
|
7
|
my ( $self, %args ) = @_;
|
332
|
1
|
50
|
|
|
|
3
|
croak 'No string to test' if !$args{'string'};
|
333
|
1
|
|
|
|
|
1
|
my $word = $args{'string'};
|
334
|
1
|
|
|
|
|
1
|
my $pos;
|
335
|
1
|
50
|
|
|
|
25
|
open my $fh, q{<}, $self->{'path'} or croak "$OS_ERROR\n";
|
336
|
1
|
|
|
|
|
9
|
while (<$fh>) {
|
337
|
3
|
100
|
|
|
|
7
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
338
|
2
|
|
|
|
|
29
|
/^([^\Q$self->{'delim'}\E]+)/msx;
|
339
|
2
|
100
|
|
|
|
8
|
if ( $word eq $1 ) {
|
340
|
1
|
|
|
|
|
5
|
chomp;
|
341
|
1
|
|
|
|
|
12
|
my @line = split m/\Q$self->{'delim'}\E/msx, $_;
|
342
|
1
|
|
|
|
|
5
|
$pos = $line[ $self->{'IDX'}->{'pos'} ];
|
343
|
1
|
|
|
|
|
2
|
last;
|
344
|
|
|
|
|
|
|
}
|
345
|
|
|
|
|
|
|
}
|
346
|
1
|
50
|
|
|
|
9
|
close $fh or croak $OS_ERROR;
|
347
|
1
|
|
|
|
|
10
|
return $pos;
|
348
|
|
|
|
|
|
|
}
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head2 Descriptive frequency statistics for lists
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
These methods return a descriptive statistic (mean, median or standard deviation) for a list of B. Like L, they take an optional argument B to specify if the returned values should be raw frequencies per million, log frequencies, or zip-frequencies.
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head3 frq_mean
|
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
$mean = $subtlex->frq_mean(strings => [qw/word1 word2/], scale => 'raw|log|zipf');
|
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Returns the arithmetic mean of the frequencies for the given B, or mean of the log frequencies if B => 1.
|
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=cut
|
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
sub frq_mean {
|
363
|
3
|
|
|
3
|
1
|
665
|
my ( $self, %args ) = @_;
|
364
|
3
|
|
|
|
|
3
|
return mean( values %{ $self->freqhash(%args) } );
|
|
3
|
|
|
|
|
9
|
|
365
|
|
|
|
|
|
|
}
|
366
|
|
|
|
|
|
|
*mean_freq = \&frq_mean;
|
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head3 frq_median
|
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
$median = $subtlex->frq_median(words => [qw/word1 word2/], scale => 'raw|log|zipf');
|
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
Returns the median of the frequencies for the given B, or median of the log frequencies if B => 1.
|
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
=cut
|
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
sub frq_median {
|
377
|
1
|
|
|
1
|
1
|
268
|
my ( $self, %args ) = @_;
|
378
|
1
|
|
|
|
|
2
|
return median( values %{ $self->freqhash(%args) } );
|
|
1
|
|
|
|
|
4
|
|
379
|
|
|
|
|
|
|
}
|
380
|
|
|
|
|
|
|
*median_freq = \*frq_median;
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=head3 frq_sd
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
$sd = $subtlex->frq_sd(words => [qw/word1 word2/], scale => 'raw|log|zipf');
|
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
Returns the standard deviation of the frequencies for the given B, or standard deviation of the log frequencies if B => 1.
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=cut
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub frq_sd {
|
391
|
1
|
|
|
1
|
1
|
317
|
my ( $self, %args ) = @_;
|
392
|
1
|
|
|
|
|
1
|
return stddev( values %{ $self->freqhash(%args) } );
|
|
1
|
|
|
|
|
4
|
|
393
|
|
|
|
|
|
|
}
|
394
|
|
|
|
|
|
|
*sd_freq = \*frq_sd;
|
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=head2 Orthographic neighbourhood measures
|
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
These methods return stats re the orthographic relatedness of a specified letter-B to words in the SUBTLEX corpus. Unless otherwise stated, an orthographic neighbour here means letter-strings that are identical except for a single-letter substitution while holding string-length constant, i.e., the Coltheart I of a letter-string, as defined in Coltheart et al. (1977). These measures are calculated in realtime; they are not listed in the datafile for look-up, so expect some extra-normal delay in getting a returned value.
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=head3 on_count
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
$n = $subtlex->on_count(string => $letters);
|
403
|
|
|
|
|
|
|
($n, $orthons_aref) = $subtlex->on_count(string => $letters);
|
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Returns orthographic neighbourhood count (Coltheart I) within the SUBTLEX corpus. Called in array context, also returns a reference to an array of the neighbours retrieved, if any.
|
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=cut
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
sub on_count {
|
410
|
14
|
|
|
14
|
1
|
363
|
my ( $self, %args ) = @_;
|
411
|
14
|
50
|
|
|
|
26
|
croak 'No string to test' if !$args{'string'};
|
412
|
14
|
|
|
|
|
23
|
my $word = lc( $args{'string'} );
|
413
|
14
|
|
|
|
|
961
|
require Lingua::Orthon;
|
414
|
14
|
|
|
|
|
47255
|
my $ortho = Lingua::Orthon->new();
|
415
|
14
|
|
|
|
|
466
|
my ( $z, @orthons ) = (0);
|
416
|
14
|
50
|
|
|
|
489
|
open my $fh, q{<}, $self->{'path'} or die "$OS_ERROR\n";
|
417
|
14
|
|
|
|
|
137
|
while (<$fh>) {
|
418
|
420
|
100
|
|
|
|
2928
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
419
|
406
|
50
|
|
|
|
1292
|
/^([^\Q$self->{'delim'}\E]+)/msx or next;
|
420
|
406
|
|
|
|
|
470
|
my $test = lc($1);
|
421
|
406
|
100
|
|
|
|
612
|
if ( $ortho->are_orthons( $word, $test ) ) {
|
422
|
39
|
|
|
|
|
440
|
push @orthons, $test;
|
423
|
39
|
|
|
|
|
96
|
$z++;
|
424
|
|
|
|
|
|
|
}
|
425
|
|
|
|
|
|
|
}
|
426
|
14
|
50
|
|
|
|
223
|
close $fh or croak $OS_ERROR;
|
427
|
14
|
100
|
|
|
|
121
|
return wantarray ? ( $z, \@orthons ) : $z;
|
428
|
|
|
|
|
|
|
}
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head3 on_frq_max
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
$m = $subtlex->on_frq_max(string => $letters);
|
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Returns the maximum SUBTLEX frequency per million among the orthographic neighbours (per Coltheart I) of a particular letter-string. If (unusually) all the frequencies are the same, then that value is returned. If the string has no (Coltheart-type) neighbours, undef is returned.
|
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=cut
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
sub on_frq_max {
|
439
|
1
|
|
|
1
|
1
|
2776
|
my ( $self, %args ) = @_;
|
440
|
1
|
50
|
|
|
|
4
|
croak 'No string to test' if !$args{'string'};
|
441
|
1
|
|
|
|
|
6
|
my $frq_aref = _get_orthon_f(
|
442
|
|
|
|
|
|
|
$args{'string'}, $self->{'path'},
|
443
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_opm'}, $self->{'delim'}
|
444
|
|
|
|
|
|
|
);
|
445
|
1
|
50
|
|
|
|
1
|
return scalar @{$frq_aref} ? max( @{$frq_aref} ) : undef;
|
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
6
|
|
446
|
|
|
|
|
|
|
}
|
447
|
|
|
|
|
|
|
*on_freq_max = \&on_frq_max;
|
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head3 on_frq_opm_mean
|
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
$m = $subtlex->on_frq_mean(string => $letters);
|
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
Returns the mean SUBTLEX frequencies per million of the orthographic neighbours (per Coltheart I) of a particular letter-string. If the string has no (Coltheart-type) neighbours, undef is returned.
|
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=cut
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
sub on_frq_opm_mean {
|
458
|
1
|
|
|
1
|
1
|
270
|
my ( $self, %args ) = @_;
|
459
|
1
|
50
|
|
|
|
4
|
croak 'No string to test' if !$args{'string'};
|
460
|
1
|
|
|
|
|
4
|
my $frq_aref = _get_orthon_f(
|
461
|
|
|
|
|
|
|
$args{'string'}, $self->{'path'},
|
462
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_opm'}, $self->{'delim'}
|
463
|
|
|
|
|
|
|
);
|
464
|
1
|
50
|
|
|
|
2
|
return scalar @{$frq_aref} ? mean( @{$frq_aref} ) : undef;
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
465
|
|
|
|
|
|
|
}
|
466
|
|
|
|
|
|
|
*on_freq_mean = \&on_frq_opm_mean;
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head3 on_frq_log_mean
|
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
$m = $subtlex->on_frq_log_mean(string => $letters);
|
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
Returns the mean log of SUBTLEX frequencies of the orthographic neighbours (per Coltheart I) of a particular letter-string. If the string has no (Coltheart-type) neighbours, undef is returned.
|
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=cut
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub on_frq_log_mean {
|
477
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
478
|
0
|
0
|
|
|
|
0
|
croak 'No string to test' if !$args{'string'};
|
479
|
0
|
|
|
|
|
0
|
my $frq_aref = _get_orthon_f(
|
480
|
|
|
|
|
|
|
$args{'string'}, $self->{'path'},
|
481
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_log'}, $self->{'delim'}
|
482
|
|
|
|
|
|
|
);
|
483
|
0
|
0
|
|
|
|
0
|
return scalar @{$frq_aref} ? mean( @{$frq_aref} ) : undef;
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
484
|
|
|
|
|
|
|
}
|
485
|
|
|
|
|
|
|
*on_lfreq_mean = \&on_frq_log_mean;
|
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=head3 on_frq_zipf_mean
|
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
$m = $subtlex->on_frq_zipf_mean(string => $letters);
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
Returns the mean zipf of SUBTLEX frequencies of the orthographic neighbours (per Coltheart I) of a given letter-string. If the string has no (Coltheart-type) neighbours, undef is returned.
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=cut
|
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
sub on_frq_zipf_mean {
|
496
|
1
|
|
|
1
|
1
|
262
|
my ( $self, %args ) = @_;
|
497
|
1
|
50
|
|
|
|
4
|
croak 'No string to test' if !$args{'string'};
|
498
|
1
|
|
|
|
|
5
|
my $frq_aref = _get_orthon_f(
|
499
|
|
|
|
|
|
|
$args{'string'}, $self->{'path'},
|
500
|
|
|
|
|
|
|
$self->{'IDX'}->{'frq_zipf'}, $self->{'delim'}
|
501
|
|
|
|
|
|
|
);
|
502
|
1
|
50
|
|
|
|
2
|
return scalar @{$frq_aref} ? mean( @{$frq_aref} ) : undef;
|
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
3
|
|
503
|
|
|
|
|
|
|
}
|
504
|
|
|
|
|
|
|
*on_zipf_mean = \&on_frq_zipf_mean;
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=head3 on_ldist
|
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
$m = $subtlex->on_ldist(string => $letters, lim => 20);
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
I: ldist
|
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
Returns the mean L from a letter-string to its B closest orthographic neighbours. The default Bit is 20, as defined in Yarkoni et al. (2008). The module uses the matrix-based calculation of Levenshtein Distance as implemented in L module. No defined value is returned if no Levenshtein Distance is found (whereas zero would connote "identical to everything").
|
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=cut
|
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub on_ldist {
|
517
|
2
|
|
|
2
|
1
|
474
|
my ( $self, %args ) = @_;
|
518
|
2
|
50
|
|
|
|
8
|
croak 'No string to test' if !$args{'string'};
|
519
|
2
|
|
33
|
|
|
18
|
$args{'lim'} ||= $YARKONI_MAX;
|
520
|
2
|
|
|
|
|
23
|
return _get_orthon_ldist( $args{'string'}, $self->{'path'}, $args{'lim'},
|
521
|
|
|
|
|
|
|
$self->{'delim'} );
|
522
|
|
|
|
|
|
|
}
|
523
|
|
|
|
|
|
|
*ldist = \&on_ldist;
|
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=head2 Retrieving letter-strings/words
|
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
=head3 list_strings
|
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
$aref = $subtlex->list_words(freq => [1, 20], onc => [0, 3], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
|
530
|
|
|
|
|
|
|
$aref = $subtlex->list_words(zipf => [0, 2], onc => [0, 3], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
|
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
I: list_words
|
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
Returns a list of words from the SUBTLEX corpus that satisfies certain criteria: minimum and/or maximum letter-length (specified by the named argument B), minimum and/or maximum frequency (argument B) or zip-frequency (argument B), minimum and/or maximum orthographic neighbourhood count (argument B), a consonant-vowel pattern (argument B), or a specific regular expression (argument B).
|
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [4, 4] specifies letter-strings of only 4 letters in length. If only one of these is to be constrained, then the array would be given as, e.g., [3] to specify a minimum of 3 letters without constraining the maximum, or ['',7] for a maximum of 7 letters without constraining the minimum (checking if the element C as per String::Util).
|
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
The consonant-vowel pattern is specified as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant.
|
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
A finer selection of particular letters can be made by giving a regular expression as a string to the B argument. In the example above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified. Alternatively, for example, '[^aeiouy]$' specifies that the letter-strings must not end with a vowel (here including 'y'). The entire example for '^f', including the shown arguments for B, B, B and B, would return only two words: I and I from SUBTLEX-US.
|
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
The selection procedure will be made particularly slow wherever B is specified (as this has to be calculated in real-time) and no arguments are given for B and B (which are tested ahead of any other criteria).
|
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
Syllable-counts might be added in future; existing algorithms in the Lingua family are not sufficiently reliable for the purposes to which the present module might often be put; an alternative is being worked on.
|
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).
|
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=cut
|
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
sub list_strings {
|
551
|
3
|
|
|
3
|
1
|
5714
|
my ( $self, %args ) = @_;
|
552
|
3
|
|
|
|
|
7
|
my %patterns = ();
|
553
|
3
|
100
|
|
|
|
14
|
if ( hascontent( $args{'regex'} ) ) {
|
554
|
1
|
|
|
|
|
25
|
$patterns{'regex'} = qr/$args{'regex'}/msx;
|
555
|
|
|
|
|
|
|
}
|
556
|
3
|
100
|
|
|
|
18
|
if ( hascontent( $args{'cv_pattern'} ) ) {
|
557
|
1
|
|
|
|
|
7
|
my $tmp = q{};
|
558
|
1
|
|
|
|
|
5
|
my @c = split m//msx, uc( $args{'cv_pattern'} );
|
559
|
1
|
|
|
|
|
3
|
foreach (@c) {
|
560
|
4
|
100
|
|
|
|
9
|
$tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
|
561
|
|
|
|
|
|
|
}
|
562
|
1
|
|
|
|
|
74
|
$patterns{'cv_pattern'} = qr/^$tmp$/imsx;
|
563
|
|
|
|
|
|
|
}
|
564
|
|
|
|
|
|
|
|
565
|
3
|
|
|
|
|
13
|
my @list = ();
|
566
|
3
|
50
|
|
|
|
129
|
open my $fh, q{<}, $self->{'path'} or croak $OS_ERROR;
|
567
|
|
|
|
|
|
|
LINES:
|
568
|
3
|
|
|
|
|
41
|
while (<$fh>) {
|
569
|
90
|
100
|
|
|
|
160
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
570
|
87
|
|
|
|
|
90
|
chomp;
|
571
|
87
|
|
|
|
|
472
|
my @line = split m/\Q$self->{'delim'}\E/msx;
|
572
|
87
|
100
|
|
|
|
103
|
next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
|
|
87
|
|
|
|
|
137
|
|
573
|
33
|
|
|
|
|
67
|
for ( keys %patterns ) {
|
574
|
7
|
100
|
|
|
|
36
|
next LINES if $line[0] !~ $patterns{$_};
|
575
|
|
|
|
|
|
|
}
|
576
|
32
|
|
|
|
|
55
|
for (qw/freq opm frq_opm frq_log log frq_zipf zipf cd_pct cd_log/) {
|
577
|
128
|
100
|
|
|
|
198
|
if ( ref $args{$_} ) {
|
578
|
|
|
|
|
|
|
next LINES
|
579
|
32
|
|
|
|
|
46
|
if !_in_range( _nummify($line[ $self->{'IDX'}->{$_} ]),
|
580
|
32
|
100
|
|
|
|
59
|
@{ $args{$_} } );
|
581
|
|
|
|
|
|
|
}
|
582
|
|
|
|
|
|
|
}
|
583
|
12
|
50
|
|
|
|
23
|
if ( ref $args{'pos'} ) {
|
584
|
|
|
|
|
|
|
next LINES
|
585
|
0
|
|
|
0
|
|
0
|
if none { $_ eq $line[ $self->{'IDX'}->{'pos'} ] }
|
586
|
0
|
0
|
|
|
|
0
|
@{ $args{'pos'} };
|
|
0
|
|
|
|
|
0
|
|
587
|
|
|
|
|
|
|
}
|
588
|
|
|
|
|
|
|
next LINES
|
589
|
12
|
|
|
|
|
34
|
if !_in_range( scalar( $self->on_count( string => $line[0] ) ),
|
590
|
12
|
100
|
|
|
|
29
|
@{ $args{'onc'} } );
|
591
|
10
|
|
|
|
|
62
|
push @list, $line[0];
|
592
|
|
|
|
|
|
|
}
|
593
|
3
|
50
|
|
|
|
22
|
close $fh or croak;
|
594
|
|
|
|
|
|
|
|
595
|
3
|
|
|
|
|
21
|
return \@list;
|
596
|
|
|
|
|
|
|
}
|
597
|
|
|
|
|
|
|
*list_words = \&list_strings;
|
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
=head3 all_strings
|
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
$aref = $subtlex->all_strings();
|
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
I: all_words
|
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
Returns a reference to an array of all letter-strings in the corpus, in their given order.
|
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=cut
|
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
sub all_strings {
|
610
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
611
|
0
|
|
|
|
|
0
|
my @list = ();
|
612
|
0
|
0
|
|
|
|
0
|
open my $fh, q{<}, $self->{'path'} or croak $OS_ERROR;
|
613
|
0
|
|
|
|
|
0
|
while (<$fh>) {
|
614
|
0
|
0
|
|
|
|
0
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
615
|
0
|
0
|
|
|
|
0
|
/^([^\Q$self->{'delim'}\E]+)/msx or next;
|
616
|
0
|
|
|
|
|
0
|
push @list, $1;
|
617
|
|
|
|
|
|
|
}
|
618
|
0
|
0
|
|
|
|
0
|
close $fh or croak;
|
619
|
0
|
|
|
|
|
0
|
return \@list;
|
620
|
|
|
|
|
|
|
}
|
621
|
|
|
|
|
|
|
*all_words = \&all_strings;
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=head3 random_string
|
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
$string = $subtlex->random_string();
|
626
|
|
|
|
|
|
|
@data = $subtlex->random_string();
|
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
I: random_word
|
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
Picks a random line from the corpus, using L (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifying a match to specific criteria, self-aborting after trying X lines.)
|
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=cut
|
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
sub random_string {
|
635
|
2
|
|
|
2
|
1
|
372
|
my ( $self, %args ) = @_;
|
636
|
2
|
|
|
|
|
3
|
eval { require File::RandomLine; };
|
|
2
|
|
|
|
|
496
|
|
637
|
2
|
50
|
|
|
|
4964
|
croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
|
638
|
2
|
|
|
|
|
20
|
my $rl =
|
639
|
|
|
|
|
|
|
File::RandomLine->new( $self->{'path'}, { algorithm => 'uniform' } );
|
640
|
2
|
|
|
|
|
240
|
my @ari = ();
|
641
|
2
|
|
66
|
|
|
7
|
while ( not scalar @ari or $ari[0] eq 'Word' ) {
|
642
|
2
|
|
|
|
|
20
|
@ari = split m/\Q$self->{'delim'}\E/msx, $rl->next;
|
643
|
|
|
|
|
|
|
}
|
644
|
2
|
100
|
|
|
|
142
|
return wantarray ? @ari : $ari[0];
|
645
|
|
|
|
|
|
|
}
|
646
|
|
|
|
|
|
|
*random_word = \&random_string;
|
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=head2 Miscellaneous
|
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head3 nlines
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
$num = $subtlex->nlines();
|
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
Returns the number of lines, less the column headings, in the installed language file. Expects/accepts no arguments.
|
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
=cut
|
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
sub nlines {
|
659
|
0
|
|
|
0
|
1
|
0
|
my $self = shift;
|
660
|
0
|
|
|
|
|
0
|
my $z = 0;
|
661
|
0
|
0
|
|
|
|
0
|
open( my $fh, q{<}, $self->{'path'} ) or croak "$OS_ERROR\n";
|
662
|
0
|
|
|
|
|
0
|
while (<$fh>) {
|
663
|
0
|
0
|
|
|
|
0
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
664
|
0
|
|
|
|
|
0
|
$z++;
|
665
|
|
|
|
|
|
|
}
|
666
|
0
|
0
|
|
|
|
0
|
close $fh or croak $OS_ERROR;
|
667
|
0
|
|
|
|
|
0
|
return $z;
|
668
|
|
|
|
|
|
|
}
|
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
### PRIVATMETHODEN:
|
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
sub _get_orthon_f {
|
673
|
3
|
|
|
3
|
|
5
|
my ( $str, $path, $idx, $delim ) = @_;
|
674
|
3
|
|
|
|
|
5
|
my $word = lc($str);
|
675
|
3
|
|
|
|
|
12
|
require Lingua::Orthon;
|
676
|
3
|
|
|
|
|
12
|
my $ortho = Lingua::Orthon->new();
|
677
|
3
|
|
|
|
|
96
|
my @freqs = ();
|
678
|
3
|
50
|
|
|
|
102
|
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
|
679
|
3
|
|
|
|
|
45
|
while (<$fh>) {
|
680
|
90
|
100
|
|
|
|
473
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
681
|
87
|
50
|
|
|
|
297
|
/^([^\Q$delim\E]+)/xsm or next; # capture first token
|
682
|
87
|
|
|
|
|
113
|
my $test = lc($1);
|
683
|
87
|
100
|
|
|
|
138
|
if ( $ortho->are_orthons( $word, $test ) ) { # Lingua::Orthon method
|
684
|
33
|
|
|
|
|
376
|
chomp;
|
685
|
33
|
|
|
|
|
146
|
my @line = split m/\Q$delim\E/xsm;
|
686
|
33
|
|
|
|
|
65
|
push @freqs, _nummify($line[$idx]);
|
687
|
|
|
|
|
|
|
}
|
688
|
|
|
|
|
|
|
}
|
689
|
3
|
50
|
|
|
|
25
|
close $fh or croak $OS_ERROR;
|
690
|
3
|
|
|
|
|
22
|
return \@freqs;
|
691
|
|
|
|
|
|
|
}
|
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
sub _get_orthon_ldist {
|
694
|
2
|
|
|
2
|
|
4
|
my ( $str, $path, $lim, $delim ) = @_;
|
695
|
2
|
|
|
|
|
4
|
my $word = lc($str);
|
696
|
2
|
|
|
|
|
15
|
require Lingua::Orthon;
|
697
|
2
|
|
|
|
|
12
|
my $ortho = Lingua::Orthon->new();
|
698
|
2
|
|
|
|
|
73
|
my @ldists = ();
|
699
|
2
|
50
|
|
|
|
106
|
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
|
700
|
2
|
|
|
|
|
4
|
my @tests = ();
|
701
|
2
|
|
|
|
|
45
|
while (<$fh>) {
|
702
|
86
|
100
|
|
|
|
270633
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
703
|
84
|
50
|
|
|
|
550
|
/^([^\Q$delim\E]+)/msx or next;
|
704
|
84
|
|
|
|
|
176
|
my $test = lc($1);
|
705
|
84
|
100
|
|
|
|
455
|
next if $word eq $test;
|
706
|
82
|
|
|
|
|
188
|
push @ldists, $ortho->ldist( $word, $test );
|
707
|
|
|
|
|
|
|
}
|
708
|
2
|
50
|
|
|
|
6365
|
close $fh or croak $OS_ERROR;
|
709
|
2
|
|
|
|
|
19
|
my @sorted = sort { $a <=> $b } @ldists;
|
|
252
|
|
|
|
|
169
|
|
710
|
2
|
|
|
|
|
21
|
return mean( @sorted[ 0 .. $lim - 1 ] );
|
711
|
|
|
|
|
|
|
}
|
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
sub _get_fieldvalue {
|
714
|
10
|
|
|
10
|
|
14
|
my ( $self, $str, $col_i ) = @_;
|
715
|
10
|
50
|
|
|
|
24
|
croak
|
716
|
|
|
|
|
|
|
'No word to test; pass a letter-string named \'string\' to the function'
|
717
|
|
|
|
|
|
|
if nocontent($str);
|
718
|
10
|
|
|
|
|
102
|
$str = lc($str);
|
719
|
10
|
|
|
|
|
13
|
my $val = q{}; # default value returned is empty string
|
720
|
10
|
50
|
|
|
|
348
|
open( my $fh, q{<}, $self->{'path'} ) or croak $OS_ERROR;
|
721
|
10
|
|
|
|
|
104
|
while (<$fh>) {
|
722
|
135
|
100
|
|
|
|
220
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
723
|
125
|
|
|
|
|
439
|
/^([^\Q$self->{'delim'}\E]+)/msx;
|
724
|
125
|
100
|
|
|
|
453
|
if ( $str eq $1 ) {
|
725
|
10
|
|
|
|
|
14
|
chomp; # or zipf will return with "\n" appended
|
726
|
10
|
|
|
|
|
81
|
my @line = split m/\Q$self->{'delim'}\E/msx, $_;
|
727
|
10
|
|
|
|
|
34
|
$val = _nummify($line[$col_i]);
|
728
|
10
|
|
|
|
|
23
|
last;
|
729
|
|
|
|
|
|
|
}
|
730
|
|
|
|
|
|
|
}
|
731
|
10
|
50
|
|
|
|
90
|
close $fh or croak;
|
732
|
10
|
|
|
|
|
101
|
return $val;
|
733
|
|
|
|
|
|
|
}
|
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
sub _in_range {
|
736
|
131
|
|
|
131
|
|
104
|
my ( $n, $min, $max ) = @_;
|
737
|
131
|
|
|
|
|
100
|
my $res = 1;
|
738
|
131
|
100
|
100
|
|
|
178
|
if ( hascontent($min) and $n < $min ) { # fails min
|
739
|
48
|
|
|
|
|
372
|
$res = 0;
|
740
|
|
|
|
|
|
|
}
|
741
|
131
|
100
|
100
|
|
|
768
|
if ( $res && ( hascontent($max) and $n > $max ) ) { # fails max and min
|
|
|
|
66
|
|
|
|
|
742
|
28
|
|
|
|
|
178
|
$res = 0;
|
743
|
|
|
|
|
|
|
}
|
744
|
131
|
|
|
|
|
679
|
return $res;
|
745
|
|
|
|
|
|
|
}
|
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
sub _nummify {
|
748
|
96
|
|
|
96
|
|
92
|
my $val = shift;
|
749
|
96
|
|
|
|
|
100
|
$val =~ s/,([^,]+)$/.$1/; # replace ultimate , with .
|
750
|
96
|
|
|
|
|
284
|
return $val;
|
751
|
|
|
|
|
|
|
}
|
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=over 4
|
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
=item Cannot determine field indices
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
When constructing the class object with L, the module needs to read in the contents of a file named "fields.csv" which should be housed within the SUBTLEX directory where the module itself is located (alongside the downloaded SUBTLEX files). This is necessary because the field indices for the various stats vary from one language file to the next. This should have been done with installation of the module itself. Check that this file is indeed within the Perl/site/lib/Lingua/Norms/SUBTLEX directory. If it is not, download and install the file to that location via the L package of this module.
|
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=item Value given to argument 'dir' (VALUE) in new() is not a directory
|
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
Croaked from L if called with a value for the argument B, and this value is not actually a directory/folder. This is the directory/folder in which the actual SUBTLEX datafiles should be located.
|
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item Cannot find required database for language $lang
|
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Croaked from L if none of the given values to arguments B, B or B are valid, and even the default site/lib directory and US database are not accessible. Check that your have indeed a file with the given value of B (DE, NL, UK or US) within the Perl/site/lib/Lingua/Norms/SUBTLEX directory, or at least that the SUBTLEX-US file is located within it, and can be read via your script.
|
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=item Cannot determine fields for given language
|
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
Croaked upon construction if no fields are recognized for the given language. The value given to B must be one of DE, NL, UK or US.
|
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=item No string to test; pass a string to the function
|
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
Croaked by several methods that expect a value for the named argument B, and when no such value is given. These methods require the letter-string to be passed to it as a I => I pair, with the key B followed by the value of the string to test.
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=item No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array
|
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Same as above but specifically croaked by L which accepts more than one string in a single call.
|
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
=item Need to install and have access to module File::RandomLine
|
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
Croaked by method L if the module it depends on (File::RandomLine) is not installed or accessible. This should have been installed (if not already) upon installation of the present module. See L to download and install this module manually.
|
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
=back
|
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
L : needed to work L.
|
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
L : handy for directory reading when calling L.
|
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
L : needed to calculate Levenshtein Distance, assessing orthographic neighbourhood.
|
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
L : handy C function.
|
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
L : needed for the various statistical methods.
|
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
L : utilities for determining valid string values.
|
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
L : depended upon to determine the delimiter (comma or tab) within the datafiles.
|
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
=head1 REFERENCES
|
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
B (2011). The word frequency effect: A review of recent developments and implications for the choice of frequency estimates in German. I, I<58>, 412-424. doi: L<10.1027/1618-3169/a000123|http://dx.doi.org/10.1027/1618-3169/a000123>
|
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
B (2009). Moving beyond Kucera and Francis: A critical evaluation of current word frequency norms and the introduction of a new and improved word frequency measure for American English. I, I<41>, 977-990. doi: L<10.3758/BRM.41.4.977|http://dx.doi.org/10.3758/BRM.41.4.977>
|
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
B (2012). Adding part-of-speech information to the SUBTLEX-US word frequencies. I, I<44>, 991-997. doi: L<10.3758/s13428-012-0190-4|http://dx.doi.org/10.3758/s13428-012-0190-4>
|
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
B (1977). Access to the internal lexicon. In S. Dornic (Ed.), I (Vol. 6, pp. 535-555). London, UK: Academic.
|
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
B (2010). SUBTLEX-NL: A new frequency measure for Dutch words based on film subtitles. I, I<42>, 643-650. doi: L<10.3758/BRM.42.3.643|http://dx.doi.org/10.3758/BRM.42.3.643>
|
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
B (2014). SUBTLEX-UK: A new and improved word frequency database for British English. I, I<67>, 1176-1190. doi: L<10.1080/17470218.2013.850521|http://dx.doi.org/10.1080/17470218.2013.850521>
|
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
B (2008). Moving beyond Coltheart's I: A new measure of orthographic similarity. I, I<15>, 971-979. doi: L<10.3758/PBR.15.5.971|http://dx.doi.org/10.3758/PBR.15.5.971>
|
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head1 AUTHOR
|
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Roderick Garton, C<< >>
|
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through
|
826
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll
|
827
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes.
|
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
=head1 SUPPORT
|
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command.
|
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
perldoc Lingua::Norms::SUBTLEX
|
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
You can also look for information at:
|
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
=over 4
|
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here)
|
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
L
|
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation
|
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
L
|
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
=item * CPAN Ratings
|
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
L
|
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=item * Search CPAN
|
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
L
|
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=back
|
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT
|
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
Copyright 2014-2015 Roderick Garton.
|
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
863
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published
|
864
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License.
|
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
See L for more information.
|
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
To the maximum extent permitted by applicable law, the author of this module disclaims all warranties, either express or implied, including but not limited to implied warranties of merchantability and fitness for a particular purpose, with regard to the software and the accompanying documentation.
|
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
=cut
|
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
1; # End of Lingua::Norms::SUBTLEX
|
873
|
|
|
|
|
|
|
|