line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Norms::SUBTLEX;
|
2
|
17
|
|
|
17
|
|
1233861
|
use 5.12.0;
|
|
17
|
|
|
|
|
185
|
|
3
|
17
|
|
|
17
|
|
106
|
use strict;
|
|
17
|
|
|
|
|
28
|
|
|
17
|
|
|
|
|
372
|
|
4
|
17
|
|
|
17
|
|
83
|
use warnings FATAL => 'all';
|
|
17
|
|
|
|
|
27
|
|
|
17
|
|
|
|
|
675
|
|
5
|
17
|
|
|
17
|
|
98
|
use base qw(Lingua::Orthon);
|
|
17
|
|
|
|
|
20
|
|
|
17
|
|
|
|
|
9007
|
|
6
|
17
|
|
|
17
|
|
558972
|
use Config;
|
|
17
|
|
|
|
|
40
|
|
|
17
|
|
|
|
|
688
|
|
7
|
17
|
|
|
17
|
|
83
|
use Carp qw(carp croak);
|
|
17
|
|
|
|
|
32
|
|
|
17
|
|
|
|
|
768
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#use Encode qw(encode decode);
|
10
|
|
|
|
|
|
|
#use Encode::Guess;
|
11
|
17
|
|
|
17
|
|
8038
|
use English '-no_match_vars';
|
|
17
|
|
|
|
|
52953
|
|
|
17
|
|
|
|
|
97
|
|
12
|
17
|
|
|
17
|
|
4935
|
use File::Spec;
|
|
17
|
|
|
|
|
33
|
|
|
17
|
|
|
|
|
424
|
|
13
|
17
|
|
|
17
|
|
70
|
use List::AllUtils qw(all any first firstidx none uniq);
|
|
17
|
|
|
|
|
27
|
|
|
17
|
|
|
|
|
889
|
|
14
|
17
|
|
|
17
|
|
93
|
use Number::Misc qw(is_numeric);
|
|
17
|
|
|
|
|
31
|
|
|
17
|
|
|
|
|
836
|
|
15
|
17
|
|
|
17
|
|
12706
|
use Path::Tiny;
|
|
17
|
|
|
|
|
172141
|
|
|
17
|
|
|
|
|
793
|
|
16
|
17
|
|
|
17
|
|
8519
|
use Readonly;
|
|
17
|
|
|
|
|
57378
|
|
|
17
|
|
|
|
|
853
|
|
17
|
17
|
|
|
17
|
|
116
|
use Statistics::Lite qw(count max mean median stddev sum);
|
|
17
|
|
|
|
|
40
|
|
|
17
|
|
|
|
|
847
|
|
18
|
17
|
|
|
17
|
|
6852
|
use String::Trim qw(trim);
|
|
17
|
|
|
|
|
8660
|
|
|
17
|
|
|
|
|
1001
|
|
19
|
17
|
|
|
17
|
|
111
|
use String::Util qw(hascontent crunch fullchomp nocontent unquote);
|
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
746
|
|
20
|
17
|
|
|
17
|
|
7980
|
use Text::CSV::Hashify;
|
|
17
|
|
|
|
|
1533986
|
|
|
17
|
|
|
|
|
1168
|
|
21
|
17
|
|
|
17
|
|
7934
|
use Text::CSV::Separator qw(get_separator);
|
|
17
|
|
|
|
|
28486
|
|
|
17
|
|
|
|
|
981
|
|
22
|
17
|
|
|
17
|
|
7981
|
use Text::Unidecode;
|
|
17
|
|
|
|
|
28128
|
|
|
17
|
|
|
|
|
51317
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#use open ':encoding(utf8)';
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
$Lingua::Norms::SUBTLEX::VERSION = '0.07';
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=pod
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=encoding utf8
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 NAME
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
Lingua::Norms::SUBTLEX - Retrieve word frequencies and related values and lists from subtitles corpora
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 VERSION
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
This is documentation for B of Lingua::Norms::SUBTLEX.
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
use Lingua::Norms::SUBTLEX 0.07;
|
43
|
|
|
|
|
|
|
my $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'UK');
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# Is the string 'frog' in the subtitles corpus?
|
46
|
|
|
|
|
|
|
my $bool = $subtlex->is_normed(string => 'frog');
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Occurrences-per-million:
|
49
|
|
|
|
|
|
|
# - for a single string:
|
50
|
|
|
|
|
|
|
my $frq = $subtlex->frq_opm(string => 'frog'); # freq. per million; also count, log-f, Zipf
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# - for a list of strings:
|
53
|
|
|
|
|
|
|
my $href = $subtlex->frq_hash(strings => [qw/frog fish ape/]); # freqs. for a list of words
|
54
|
|
|
|
|
|
|
print "'$_' opm\t$href->{$_}\n" for keys %{$href};
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# stats:
|
57
|
|
|
|
|
|
|
printf "mean opm\t%f\n", $subtlex->frq_mean(strings => [qw/frog fish ape/]); # or median, std-dev.
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# parts-of-speech:
|
60
|
|
|
|
|
|
|
printf "'frog' part-of-speech = %s\n", $subtlex->pos_dom(string => 'frog');
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# retrieve (list of) words to certain specs, e.g., min/max range:
|
63
|
|
|
|
|
|
|
my $aref = $subtlex->select_words(freq => [2, 400], length => [4, 4], cv_pattern => 'CCVC', regex => '^f');
|
64
|
|
|
|
|
|
|
printf "Number of 4-letter CCVC strings with 2-400 opm starting with 'f' = %d\n", scalar @{$aref};
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
printf "A randomly selected subtitles string is '%s'\n", $subtlex->random_string();
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
This module facilitates access to corpus frequency and other lexical attributes of character strings (generally, words), as provided in the various SUBTLEX and related projects (see L) on the basis of the representation of these strings in film and television subtitles (see L). Word frequencies obtained in this way have been shown to be generally more predictive of performance in word recognition tasks than frequencies derived from books, newsgroup posts, and similar sources (but see Herdagdelen & Marelli, 2017).
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
There are three main groups of measures that are potentially retrievable from the SUBTLEX datatables: (1) frequency; (2) contextual diversity (number of films/TV episodes appeared in); and (3) parts-of-speech. The module tries to uniformly offer, across the available files, frequency as a count (L), occurrences per million (L), logarithm of the opm or frequency count (L), and/or the 7-point scaled Zipf frequency (L). "Contextual diversity" is given as a count (L), a percentage (L), and/or a logarithm (L). For parts-of-speech, the module returns, via L, the dominant linguistic syntactical role of the word, as well as all defined parts-of-speech for a word (via L).
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
However, not all these methods are available across all projects; e.g., SUBTLEX-NL does not define Zipf frequency, and SUBTLEX-DE does not define CD, POS or Zipf frequency. In these cases, the method in question will return an empty string.
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=head1 CORPORA SPECS and SOURCES
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
The SUBTLEX files need to be downloaded via the URLs shown in the table below (only a small sample from each of each of the SUBTLEX corpora is included in the installation distribution for testing purposes). So, for example, for the I norms, install the file named "SUBTLEX-US frequency list with PoS and Zipf information.csv" via L.
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
The local directory location or actual pathname of these files can be given in class construction (by the arguments B and B, respectively); or it will be sought from the default location--within the directory "SUBTLEX" alongside the module itself in the locally configured Perl sitelib--given the B argument to L, or to L. The filenames of the original files downloaded from the following sites should be found in this way, but it should uniquely include the "key" shown in the table. The module will attempt to identify the correct field separator for the file (which can be comma-separated or tab-delimited). Only the files specified in the table are likely to be reliably accessed at this time.
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=for html
|
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Notes regarding these different corpora.
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=over 4
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=item * SUBTLEX-DE
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
The file has separate entries for words starting with an uppercase and a lowercase letter (e.g., for when a letter-string is both a noun and an adjective).
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=item * Lexique (SUBTLEX-FR)
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
If not giving the full path to this file, it should be renamed to include "FR" (e.g., "FR_Lexique.csv") and stored in the default directory. The file also includes frequencies from books.
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item * SUBTLEX-PT
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
The I subtitles data are available as an Excel file (directly from L). This file needs to be saved as a (csv) text file to be usable here.
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=item * SUBTLEX-UK
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Includes words that might be spelled with a dash both with a dash and without; so there are separate entries for I and I, and for I and I. It includes some strings with apostrophes (e.g., I, I); but common contractions like I, I and I do not appear; they are stripped of their apostrophes, listed, e.g., as I, I and I. All strings are in lower-case; so I is represented as I.
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=item * SUBTLEX-US
|
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
There are no strings with capitalized onsets in this file, or with punctuation marks, including apostrophes and dashes (e.g., I and I are represented as I and I; I as I, and I as I).
|
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
The earlier, original file "SUBTLEXusExcel2007.csv" presents strings as they were originally capitalised: there is, e.g., I and I--but neither I nor I. This file does not provide part-of-speech or Zipf frequencies.
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=back
|
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
There are several other languages from this project which might be supported by this module in a later version (originally, only SUBTLEX-US was supported).
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
See the new() method as to how this module handles case-sensitivity and diacritical marks. For files where strings are UTF-8 encoded, the strings being looked up should also be UTF-8 encoded (if they are diacritically marked, e.g. "embâcle")(see L).
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
If using Miscrosoft Excel to save any of these files, even if in CSV format, Excel will turn the words "true" and "false" into the Boolean strings "TRUE" and "FALSE", as well as throw them aside from alphabetic sorting (right down to the bottom of an alphabetic sort). That will surely stuff up any neatly intended pattern-matching for these words.
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
All methods are called via the class object, and with named (hash of) arguments, usually B, where relevant.
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 new
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
$subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE'); # - looking in Perl sitelib
|
135
|
|
|
|
|
|
|
$subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE', dir => 'file_directory'); # folder in which file is located
|
136
|
|
|
|
|
|
|
$subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE', path => 'file/is/here.csv'); # complete path to file for given language
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Returns a class object for accessing other methods. The argument B is required, specifying the particular language datafile by a "key" as given in the above table. Optional arguments B or B can be given to specify the location or filepath of the database. The default location is the "Lingua/Norms/SUBTLEX" directory within the 'sitelib' configured for the local Perl installation (as per L). The method will C if the file cannot be found.
|
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
The optional argument B specifies how string comparison, as when looking up a given word in the SUBTLEX corpus, should be conducted, with the function used to test string equality being derived from the C function in L (part of the standard Perl distribution). This matching level applies to the look-up of strings within all methods, including those specifically assessing orthographic equality. This argument can take one of three values: see L:
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=cut
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub new {
|
145
|
23
|
|
|
23
|
1
|
13631
|
my ( $class, %args ) = @_;
|
146
|
23
|
50
|
|
|
|
114
|
my $self = bless {}, ref($class) ? ref($class) : $class;
|
147
|
|
|
|
|
|
|
$self->{'_MODULE_DIR'} =
|
148
|
23
|
|
|
|
|
1586
|
File::Spec->catdir( $Config{'sitelib'}, qw/Lingua Norms SUBTLEX/ );
|
149
|
23
|
|
|
|
|
175
|
$self->set_lang(%args);
|
150
|
23
|
|
|
|
|
270
|
$self->set_eq( match_level => $args{'match_level'} );
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
#_set_encoding($args{'decode'});
|
153
|
23
|
|
|
|
|
951
|
return $self;
|
154
|
|
|
|
|
|
|
}
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head2 Frequencies and POS for individual words or word-lists
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head3 is_normed
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
$bool = $subtlex->is_normed(string => $word);
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
I: isa_word
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Returns 1 or 0 as to whether or not the letter-string passed as B is represented in the subtitles file. For some files, this might be thought of as a lexical decision ("does this string spell a word?"); but others include misspelled words (e.g., "pyscho"), digit strings, abbreviations ...
|
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=cut
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub is_normed {
|
169
|
10
|
|
|
10
|
1
|
116431
|
my ( $self, %args ) = @_;
|
170
|
10
|
|
|
|
|
32
|
my $str = _get_usr_str( $args{'string'} );
|
171
|
10
|
|
|
|
|
16
|
my $res = 0; # boolean to return from this sub
|
172
|
10
|
50
|
|
|
|
479
|
open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
|
173
|
10
|
|
|
|
|
312
|
while (<$fh>) {
|
174
|
227
|
100
|
|
|
|
60266
|
next if $INPUT_LINE_NUMBER == 1; # skip headings
|
175
|
217
|
|
|
|
|
375
|
my $file_str = _get_file_str( $_, $self->{'_DELIM'} )
|
176
|
|
|
|
|
|
|
; # have to declare as can be empty (!)
|
177
|
217
|
50
|
|
|
|
5089
|
next if nocontent($file_str);
|
178
|
217
|
100
|
|
|
|
1897
|
if ( $self->{'_EQ'}->( $str, $file_str ) )
|
179
|
|
|
|
|
|
|
{ # first token equals given string?
|
180
|
6
|
|
|
|
|
1737
|
$res = 1; # set result to return as true
|
181
|
6
|
|
|
|
|
13
|
last; # got it, so abort look-up
|
182
|
|
|
|
|
|
|
}
|
183
|
|
|
|
|
|
|
}
|
184
|
10
|
50
|
|
|
|
978
|
close $fh or croak $OS_ERROR;
|
185
|
10
|
|
|
|
|
73
|
return $res; # zero if string not found in file
|
186
|
|
|
|
|
|
|
}
|
187
|
|
|
|
|
|
|
*isa_word = \&is_normed;
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head3 frq_count
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
$int = $subtlex->frq_count(string => 'aword');
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
Returns the raw number of occurrences in all the films/TV episodes for the word passed as B, or 0 if the string is not found in language file.
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=cut
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub frq_count {
|
198
|
6
|
|
|
6
|
1
|
3104
|
my ( $self, %args ) = @_;
|
199
|
|
|
|
|
|
|
return _val_or_0(
|
200
|
|
|
|
|
|
|
_get_val_for_str(
|
201
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
202
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_count_idx' ),
|
203
|
6
|
|
|
|
|
22
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
18
|
|
|
|
|
134
|
|
204
|
|
|
|
|
|
|
)
|
205
|
|
|
|
|
|
|
);
|
206
|
|
|
|
|
|
|
}
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=head3 frq_opm
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
$val = $subtlex->frq_opm(string => 'aword');
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
I: opm
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
Returns frequency per million for the word passed as B, or 0 if the string is not found in language file.
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub frq_opm {
|
219
|
30
|
|
|
30
|
1
|
7528
|
my ( $self, %args ) = @_;
|
220
|
|
|
|
|
|
|
return _val_or_0(
|
221
|
|
|
|
|
|
|
_get_val_for_str(
|
222
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
223
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' ),
|
224
|
30
|
|
|
|
|
77
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
90
|
|
|
|
|
591
|
|
225
|
|
|
|
|
|
|
)
|
226
|
|
|
|
|
|
|
);
|
227
|
|
|
|
|
|
|
}
|
228
|
|
|
|
|
|
|
*freq = \&frq_opm; # legacy only
|
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=head3 frq_log
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
$val = $subtlex->frq_log(string => 'aword');
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
Returns log frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut
|
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
sub frq_log {
|
239
|
5
|
|
|
5
|
1
|
1946
|
my ( $self, %args ) = @_;
|
240
|
|
|
|
|
|
|
return _get_val_for_str(
|
241
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
242
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_log_idx' ),
|
243
|
5
|
|
|
|
|
15
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
15
|
|
|
|
|
104
|
|
244
|
|
|
|
|
|
|
);
|
245
|
|
|
|
|
|
|
}
|
246
|
|
|
|
|
|
|
*lfreq = \&frq_log; # legacy only
|
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=head3 frq_zipf
|
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
$val = $subtlex->frq_zipf(string => 'aword');
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Returns Zipf frequency for the word passed as B, or the empty-string if the string is not represented in the language file. The Zipf scale ranges from about 1 to 7, with values of 1-3 generally representing low frequency words, and values of generally 4-7+ representing high frequency words, with respect to various recognition measures used in the study of word frequency effects. See Van Heuven et al. (2014) and L for more information.
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=cut
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
sub frq_zipf {
|
257
|
6
|
|
|
6
|
1
|
1429
|
my ( $self, %args ) = @_;
|
258
|
|
|
|
|
|
|
return _get_val_for_str(
|
259
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
260
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_zipf_idx' ),
|
261
|
6
|
|
|
|
|
19
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
18
|
|
|
|
|
124
|
|
262
|
|
|
|
|
|
|
);
|
263
|
|
|
|
|
|
|
}
|
264
|
|
|
|
|
|
|
*zipf = \&frq_zipf; # legacy only
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=head3 frq_zipf_calc
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
$calc = $subtlex->frq_zipf_calc( string => 'favourite' );
|
269
|
|
|
|
|
|
|
$calc = $subtlex->frq_zipf_calc( string => 'favourite', corpus_size => POS_FLOAT_in_millions, n_wordtypes => POS_INT );
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Returns an estimate of Zipf frequency by calculating its value from the given or retrievable L or L, and the given or retrievable values of the corpus_size and n_wordtypes for the particular SUBTLEX project; i.e., the values of corpus_size and n_wordtypes can be provided as named arguments. As introduced by Van Heuven et al. (2014) (see also L):
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=for html Zipf = log10[ ( frq_count + 1 ) / ( corpus_size + n_wordtypes )/1000000 ] + 3
|
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
How well the returned value satisfies the "border relations" desired of the index (e.g., that up to 1 opm corresponds to Zipf of E 3) depends on the reliability of the corpus size and wordtype counts, and any rounding of these values (where relevant) and (if required) of the opm. Examinations of the returned values show that, when using the canned and reported values (which is the default here), they align with these definitions, and with any canned Zipf values, within the margins of about the third or fourth decimal place.
|
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=cut
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
sub frq_zipf_calc {
|
280
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
281
|
|
|
|
|
|
|
my $corpus_size =
|
282
|
|
|
|
|
|
|
defined $args{'size_corpus'}
|
283
|
|
|
|
|
|
|
? $args{'size_corpus'}
|
284
|
0
|
0
|
|
|
|
0
|
: $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'size_corpus_mill' );
|
285
|
|
|
|
|
|
|
my $n_wordtypes =
|
286
|
|
|
|
|
|
|
defined $args{'n_wordtypes'}
|
287
|
|
|
|
|
|
|
? $args{'n_wordtypes'}
|
288
|
0
|
0
|
|
|
|
0
|
: $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'n_wordtypes' );
|
289
|
|
|
|
|
|
|
|
290
|
0
|
|
|
|
|
0
|
$n_wordtypes /= 1_000_000;
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
my $frq_count =
|
293
|
|
|
|
|
|
|
is_numeric( $args{'frq_count'} )
|
294
|
|
|
|
|
|
|
? $args{'frq_count'}
|
295
|
|
|
|
|
|
|
: is_numeric( $args{'frq_opm'} ) ? sprintf "%.0f",
|
296
|
0
|
0
|
|
|
|
0
|
$args{'frq_opm'} * $corpus_size : eval { $self->frq_count(%args) };
|
|
0
|
0
|
|
|
|
0
|
|
297
|
|
|
|
|
|
|
|
298
|
0
|
0
|
0
|
|
|
0
|
if ($EVAL_ERROR or not is_numeric($frq_count) ) {
|
299
|
0
|
|
|
|
|
0
|
my $frq_opm = eval { $self->frq_opm(%args) };
|
|
0
|
|
|
|
|
0
|
|
300
|
0
|
0
|
0
|
|
|
0
|
if (not $EVAL_ERROR and is_numeric($frq_opm) ) {
|
301
|
0
|
|
|
|
|
0
|
$frq_count = sprintf "%.0f", $frq_opm * $corpus_size;
|
302
|
|
|
|
|
|
|
}
|
303
|
|
|
|
|
|
|
}
|
304
|
0
|
|
0
|
|
|
0
|
$frq_count ||= 0;
|
305
|
|
|
|
|
|
|
|
306
|
0
|
|
|
|
|
0
|
return _log10( ( 1 + $frq_count ) / ( $corpus_size + $n_wordtypes ) ) + 3;
|
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
}
|
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=head3 frq_opm2count
|
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
$int = $subtlex->frq_opm2count(string => STRING);
|
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Returns the raw number of occurrences of a string (the frq_count) based on the number of occurrences per million (frq_opm), and the corpus size in millions. Returns 0 if the string is not found in language file.
|
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
The B can be given as a named argument, or it will be retrieved by the L respective method, where this is defined for a particular language file. The B (in millions) can also be given as a named argument, or it will be retrieved from the specifications file (specs.csv in the module's directory), where this value has been obtainable from published reports.
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut
|
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub frq_opm2count {
|
321
|
5
|
|
|
5
|
1
|
2433
|
my ( $self, %args ) = @_;
|
322
|
|
|
|
|
|
|
my $frq_opm =
|
323
|
5
|
50
|
|
|
|
27
|
defined $args{'frq_opm'} ? $args{'frq_opm'} : $self->frq_opm(%args);
|
324
|
|
|
|
|
|
|
my $corpus_size =
|
325
|
|
|
|
|
|
|
defined $args{'size_corpus'}
|
326
|
|
|
|
|
|
|
? $args{'size_corpus'}
|
327
|
5
|
50
|
|
|
|
115
|
: $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'size_corpus_mill' );
|
328
|
5
|
|
|
|
|
95
|
return sprintf "%.0f", $frq_opm * $corpus_size;
|
329
|
|
|
|
|
|
|
}
|
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=head3 cd_count
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
$cd = $subtlex->cd_count(string => STRING);
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Returns the number of samples (films/TV episodes) comprising the corpus in which the string occurred in its subtitles; so-called "contextual diversity". Returns 0 if the string is not found in language file.
|
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=cut
|
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub cd_count {
|
340
|
5
|
|
|
5
|
1
|
1027
|
my ( $self, %args ) = @_;
|
341
|
|
|
|
|
|
|
return _val_or_0(
|
342
|
|
|
|
|
|
|
_get_val_for_str(
|
343
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
344
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_count_idx' ),
|
345
|
5
|
|
|
|
|
13
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
15
|
|
|
|
|
96
|
|
346
|
|
|
|
|
|
|
)
|
347
|
|
|
|
|
|
|
);
|
348
|
|
|
|
|
|
|
}
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head3 cd_pct
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
$cd = $subtlex->cd_pct(string => 'aword');
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
Returns a percentage measure for the number of samples (films/TV episodes) comprising the corpus in which the B occurred in its subtitles; so-called "contextual diversity". Returns 0 if the string is not found in language file.
|
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=cut
|
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
sub cd_pct {
|
359
|
5
|
|
|
5
|
1
|
563
|
my ( $self, %args ) = @_;
|
360
|
|
|
|
|
|
|
return _val_or_0(
|
361
|
|
|
|
|
|
|
_get_val_for_str(
|
362
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
363
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_pct_idx' ),
|
364
|
5
|
|
|
|
|
16
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
15
|
|
|
|
|
120
|
|
365
|
|
|
|
|
|
|
)
|
366
|
|
|
|
|
|
|
);
|
367
|
|
|
|
|
|
|
}
|
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=head3 cd_log
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
$cd = $subtlex->cd_log(string => 'aword');
|
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Returns log10(L + 1) for the given string, with 4-digit precision. Note: Brysbaert and New (2009) state that "this is the best value to use if one wants to match words on word frequency" (p. 988).
|
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=cut
|
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
sub cd_log {
|
378
|
5
|
|
|
5
|
1
|
613
|
my ( $self, %args ) = @_;
|
379
|
|
|
|
|
|
|
return _get_val_for_str(
|
380
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
381
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_log_idx' ),
|
382
|
5
|
|
|
|
|
13
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
15
|
|
|
|
|
93
|
|
383
|
|
|
|
|
|
|
);
|
384
|
|
|
|
|
|
|
}
|
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head3 pos_dom
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
$pos_str = $subtlex->pos_dom(string => STRING, conform => BOOL);
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Returns the dominant part-of-speech for the given string. The return value is undefined if the string is not found. If the field in the original file (as in SUBTLEX-PT) is actually for all possible parts-of-speech, the first element in the returned string (once split by non-word characters), is returned (assuming, as in SUBTLEX-PT) that this is indeed the most frequent part-of-speech for the particular string.
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
For interpretation of the POS codes: for NL, see L ("SPEC" is there defined as "often personal or geographical names" and so similar to "Name" in SUBTLEX-UK).
|
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
To transliterate the various codes into a common two-letter code, then set B => 1 (default is not defined, returning the POS string as given in the original files). The two-letter codes are:
|
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
NN noun (common)
|
397
|
|
|
|
|
|
|
NM name (proper)
|
398
|
|
|
|
|
|
|
PN pronoun
|
399
|
|
|
|
|
|
|
VB verb
|
400
|
|
|
|
|
|
|
AJ adjective
|
401
|
|
|
|
|
|
|
AV adverb
|
402
|
|
|
|
|
|
|
PP proposition
|
403
|
|
|
|
|
|
|
CJ conjunction
|
404
|
|
|
|
|
|
|
IJ interjection
|
405
|
|
|
|
|
|
|
DA determiner or article
|
406
|
|
|
|
|
|
|
NB number
|
407
|
|
|
|
|
|
|
OT other
|
408
|
|
|
|
|
|
|
UK unknown
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
The "OT" code includes some rare POS values (e.g., "marker", "ONO"), anomalous values (e.g., "2"), and values not defined in the associated reports. The "UK" code ("unknown") is comprised of values specifically recorded as "unclassified" or similar, or where the POS field is empty.
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=cut
|
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub pos_dom {
|
415
|
13
|
|
|
13
|
1
|
2469
|
my ( $self, %args ) = @_;
|
416
|
|
|
|
|
|
|
my $str = _get_val_for_str(
|
417
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
418
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'pos_dom_idx' ),
|
419
|
13
|
|
|
|
|
44
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
39
|
|
|
|
|
283
|
|
420
|
|
|
|
|
|
|
);
|
421
|
13
|
|
|
|
|
83
|
my @ari = map { trim($_) } grep { hascontent($_) } split /[\W]/xsm, $str;
|
|
15
|
|
|
|
|
161
|
|
|
15
|
|
|
|
|
58
|
|
422
|
|
|
|
|
|
|
return $args{'conform'}
|
423
|
13
|
100
|
|
|
|
311
|
? _pos_is( $ari[0], $self->{'_FIELDS'}, $self->{'_LANG'} )->[0]
|
424
|
|
|
|
|
|
|
: $ari[0];
|
425
|
|
|
|
|
|
|
}
|
426
|
|
|
|
|
|
|
*pos = \&pos_dom;
|
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=head3 pos_all
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
$pos_aref = $subtlex->pos_all(string => STRING, conform => BOOL);
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
Returns all parts-of-speech for the given string as a referenced array. The return value is an empty list if the string is not found. If the language file does not define this field, the returned value is simply the same as what would, if possible, be returned from L (i.e., if that value is defined), but now as a referenced array.
|
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=cut
|
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
sub pos_all {
|
437
|
9
|
|
|
9
|
1
|
2150
|
my ( $self, %args ) = @_;
|
438
|
|
|
|
|
|
|
my $str = _get_val_for_str(
|
439
|
|
|
|
|
|
|
_get_usr_str( $args{'string'} ),
|
440
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'pos_all_idx' ),
|
441
|
9
|
|
|
|
|
26
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
|
|
27
|
|
|
|
|
167
|
|
442
|
|
|
|
|
|
|
);
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# grep to ensure no empty values as might come from a head/trailing delimiter:
|
445
|
9
|
|
|
|
|
59
|
my @ari = map { trim($_) } grep { hascontent($_) } split /[\W]/xsm, $str;
|
|
26
|
|
|
|
|
316
|
|
|
28
|
|
|
|
|
152
|
|
446
|
|
|
|
|
|
|
return $args{'conform'}
|
447
|
9
|
100
|
|
|
|
166
|
? [ map { @{ _pos_is( $_, $self->{'_FIELDS'}, $self->{'_LANG'} ) } }
|
|
8
|
|
|
|
|
11
|
|
|
8
|
|
|
|
|
16
|
|
448
|
|
|
|
|
|
|
@ari ]
|
449
|
|
|
|
|
|
|
: \@ari;
|
450
|
|
|
|
|
|
|
}
|
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=head2 Multiple strings/values lists
|
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Array given as measures to the following methods might include one or more of the following:
|
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
frq_count
|
457
|
|
|
|
|
|
|
frq_opm
|
458
|
|
|
|
|
|
|
frq_log
|
459
|
|
|
|
|
|
|
frq_zipf
|
460
|
|
|
|
|
|
|
cd_count
|
461
|
|
|
|
|
|
|
cd_pct
|
462
|
|
|
|
|
|
|
cd_log
|
463
|
|
|
|
|
|
|
pos_dom
|
464
|
|
|
|
|
|
|
pos_all
|
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=head3 values_list
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
$aref = $subtlex->values_list(string => STRING, values => AREF);
|
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Returns values for a single letter-string as a referenced array.
|
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=cut
|
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
sub values_list {
|
475
|
3
|
|
|
3
|
1
|
1893
|
my ( $self, %args ) = @_;
|
476
|
3
|
|
|
|
|
7
|
my @idx_ari;
|
477
|
3
|
50
|
|
|
|
9
|
if ( ref $args{'values'} ) {
|
478
|
3
|
|
|
|
|
5
|
for my $field ( @{ $args{'values'} } ) {
|
|
3
|
|
|
|
|
8
|
|
479
|
|
|
|
|
|
|
push @idx_ari,
|
480
|
6
|
|
|
|
|
64
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, $field . '_idx' );
|
481
|
|
|
|
|
|
|
}
|
482
|
|
|
|
|
|
|
}
|
483
|
|
|
|
|
|
|
return _get_val_for_strs( _get_usr_str( $args{'string'} ),
|
484
|
3
|
|
|
|
|
37
|
\@idx_ari, map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
|
|
9
|
|
|
|
|
21
|
|
485
|
|
|
|
|
|
|
}
|
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=head3 multi_list
|
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
$hashref = $subtlex->multi_list(strings => AREF_of_char_strings, measures => AREF_of_FIELD_NAMES);
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
$frq_hashref = $subtlex->multi_list(strings => [qw/ICH PEA CHOWDER ZEER AIME/], measures => [qw/frq_opm frq_zipf/]);
|
492
|
|
|
|
|
|
|
# $frq_hashref = {
|
493
|
|
|
|
|
|
|
# ICH => {
|
494
|
|
|
|
|
|
|
# frq_opm => 20000,
|
495
|
|
|
|
|
|
|
# frq_zipf => 7.01,
|
496
|
|
|
|
|
|
|
# },
|
497
|
|
|
|
|
|
|
# PEA => {
|
498
|
|
|
|
|
|
|
# frq_opm ...
|
499
|
|
|
|
|
|
|
# },
|
500
|
|
|
|
|
|
|
# ...
|
501
|
|
|
|
|
|
|
# }
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Returns multiple values for a list of strings as a hashref of hashrefs. This is perhaps the most efficient method here for retrieving several values for several words, but only for a small number of words; it could take a long time to return given large lists.
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
So, given one or more words in the array ref B, and several measures/values to find for each of them (such as 'frq_opm', 'pos_dom' or any other values defined for the particular language file) in the the array B, the method looks line-by-line through the file to check if the line's string is equal to any of those in B. If so, it collates the relevant measures in a hash keyed by the string, whose values are themselves a hash of the measure-names keying each respective measure-value. The found string is then removed from the look-up list, and the next line is looked-up in the same way. The search stops as soon as there are no more strings in the look-up list (all have been found).
|
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
In this way, there is only one pass through the file for the entire search; no line is looked-up more than once for all strings or their respective measure values. The method could be used for looking up a single string and/or a single value, but the other methods for doing this avoid the overhead of checking an array of strings, and splitting the line against the delimiter; this is only done here to facilitate caching multiple values whereas other methods avoid doing this as they only need to find one value after a known number of delimiters.
|
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=cut
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
sub multi_list {
|
512
|
1
|
|
|
1
|
1
|
245
|
my ( $self, %args ) = @_;
|
513
|
|
|
|
|
|
|
croak 'Need a referenced list of strings to look up'
|
514
|
1
|
50
|
|
|
|
5
|
if !ref $args{'strings'};
|
515
|
1
|
|
|
|
|
2
|
my @strings = map { _get_usr_str($_) } @{ $args{'strings'} };
|
|
2
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
4
|
|
516
|
1
|
|
|
|
|
2
|
my %idx_hash = ();
|
517
|
1
|
50
|
|
|
|
4
|
if ( ref $args{'measures'} ) {
|
518
|
1
|
|
|
|
|
1
|
for my $field ( @{ $args{'measures'} } ) {
|
|
1
|
|
|
|
|
3
|
|
519
|
|
|
|
|
|
|
my $idx =
|
520
|
3
|
|
|
|
|
12
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, $field . '_idx' );
|
521
|
3
|
50
|
|
|
|
42
|
if ( nocontent($idx) ) {
|
522
|
0
|
|
|
|
|
0
|
next;
|
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
# croak "The requested value '$field' is not defined for the current SUBTLEX file";
|
525
|
|
|
|
|
|
|
}
|
526
|
3
|
|
|
|
|
31
|
$idx_hash{$idx} = $field;
|
527
|
|
|
|
|
|
|
}
|
528
|
|
|
|
|
|
|
}
|
529
|
|
|
|
|
|
|
return _get_any_vals_for_string_list( [@strings], \%idx_hash,
|
530
|
1
|
|
|
|
|
5
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
|
|
3
|
|
|
|
|
9
|
|
531
|
|
|
|
|
|
|
}
|
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head2 Descriptive frequency statistics for lists
|
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
These methods return a descriptive statistic (sum, mean, median or standard deviation) for a list of B. Like L, they take an optional argument B to specify if the returned values should be occurrences per million, log frequencies, or Zipf values. Providing this as an argument obviates the need to provide multiple methods for each different type of frequency measure, e.g., "mean_opm()", mean_log_opm()", ...
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
Because not all types of frequency scales (count, opm, log, Zipf) are provided in all SUBTLEX corpora, these methods will C if there are no canned stats for the particular scale called for.
|
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
It might be thought useful to allow any valid scale to be returned by, say, providing each method without a value for B; a hash-ref of frequency values, keyed by scale-type, might be returned. However, this seems to be unrecommended; it assumes that users are blind as to what measures they want (as well as to what they can get).
|
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=head3 frq_sum
|
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
$sum = $subtlex->frq_sum(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
Returns the sum of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
|
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=cut
|
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
sub frq_sum {
|
550
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
551
|
0
|
|
|
|
|
0
|
return sum( $self->_frq_vals(%args) );
|
552
|
|
|
|
|
|
|
}
|
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head3 frq_mean
|
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
$mean = $subtlex->frq_mean(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
|
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Returns the arithmetic average of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
|
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=cut
|
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
sub frq_mean {
|
563
|
5
|
|
|
5
|
1
|
42256
|
my ( $self, %args ) = @_;
|
564
|
5
|
|
|
|
|
27
|
return mean( $self->_frq_vals(%args) );
|
565
|
|
|
|
|
|
|
}
|
566
|
|
|
|
|
|
|
*mean_freq = \&frq_mean;
|
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=head3 frq_median
|
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
$median = $subtlex->frq_median(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
|
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
Returns the median count, opm, log (usually opm) or Zipf frequency for the given B, depending on the value of B.
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=cut
|
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
sub frq_median {
|
577
|
1
|
|
|
1
|
1
|
441
|
my ( $self, %args ) = @_;
|
578
|
1
|
|
|
|
|
15
|
return median( $self->_frq_vals(%args) );
|
579
|
|
|
|
|
|
|
}
|
580
|
|
|
|
|
|
|
*median_freq = \*frq_median;
|
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=head3 frq_sd
|
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
$sd = $subtlex->frq_sd(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
|
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
Returns the standard deviation of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
|
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=cut
|
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
sub frq_sd {
|
591
|
1
|
|
|
1
|
1
|
764
|
my ( $self, %args ) = @_;
|
592
|
1
|
|
|
|
|
6
|
return stddev( $self->_frq_vals(%args) );
|
593
|
|
|
|
|
|
|
}
|
594
|
|
|
|
|
|
|
*sd_freq = \*frq_sd;
|
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
sub _frq_vals {
|
597
|
7
|
|
|
7
|
|
23
|
my ( $self, %args ) = @_;
|
598
|
|
|
|
|
|
|
croak
|
599
|
|
|
|
|
|
|
'No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array'
|
600
|
7
|
50
|
|
|
|
24
|
if !$args{'strings'};
|
601
|
|
|
|
|
|
|
my $strs =
|
602
|
|
|
|
|
|
|
ref $args{'strings'}
|
603
|
7
|
50
|
|
|
|
32
|
? $args{'strings'}
|
604
|
|
|
|
|
|
|
: croak 'No reference to an array of letter-strings found';
|
605
|
|
|
|
|
|
|
my $col_i =
|
606
|
|
|
|
|
|
|
hascontent( $args{'scale'} )
|
607
|
|
|
|
|
|
|
? $self->{'_FIELDS'}
|
608
|
|
|
|
|
|
|
->datum( $self->{'_LANG'}, 'frq_' . $args{'scale'} . '_idx' )
|
609
|
7
|
100
|
|
|
|
33
|
: $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' );
|
610
|
7
|
|
|
|
|
222
|
my @vals = ();
|
611
|
7
|
|
|
|
|
12
|
for my $str ( @{$strs} ) {
|
|
7
|
|
|
|
|
18
|
|
612
|
|
|
|
|
|
|
push @vals,
|
613
|
|
|
|
|
|
|
_get_val_for_str( _get_usr_str($str), $col_i,
|
614
|
19
|
|
|
|
|
54
|
map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
|
|
57
|
|
|
|
|
129
|
|
615
|
|
|
|
|
|
|
}
|
616
|
7
|
|
|
|
|
87
|
return @vals;
|
617
|
|
|
|
|
|
|
}
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
=head2 Retrieving letter-strings/words
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
=head3 select_strings
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
$aref = $subtlex->select_strings(frq_opm => [1, 20], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
|
624
|
|
|
|
|
|
|
$aref = $subtlex->select_strings(frq_zipf => [0, 2], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
|
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
I: select_words
|
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
Returns a list of strings (presumably words) from the SUBTLEX corpus that satisfies certain criteria, as per the following arguments:
|
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
=over 2
|
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=item length
|
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
minimum and/or maximum length of the string (or "letter-length")
|
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=item frq_opm, frq_log, cd_count, etc.
|
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
minimum and/or maximum frequency (as given in whatever unit offered by the datafile for the set language)
|
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=item cv_pattern
|
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
a consonant-vowel pattern, given as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant. The tested strings are stripped of marks and otherwise ASCII transliterated (using L) ahead of the check.
|
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
=item regex
|
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
a regular expression (L). In the examples above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified for retrieval. Alternatively, for example, the regex value '[^aeiouy]$' specifies that the letter-strings to be returned must not end with a vowel (or 'y'). The tested strings are stripped of marks and otherwise ASCII transliterated (using L) ahead of matching, so if the string in the file has, say, a 'u' with an Umlaut, it will match a 'u' in the regex.
|
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=back
|
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [4, 4] specifies letter-strings of only 4 letters in length. If only one of these is to be constrained, then the array would be given as, e.g., [3] to specify a minimum of 3 letters without constraining the maximum, or ['',7] for a maximum of 7 letters without constraining the minimum (checking if the element C as per String::Util).
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).
|
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
Calling this method as "list_strings" or "list_words" is deprecated; to avoid confusion with L, which also returns a list of strings. A deprecation warning and wrap to the method is in place as of version 0.06 if using this name; they will be removed in a subsequent version.
|
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
=cut
|
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
sub select_strings {
|
659
|
7
|
|
|
7
|
1
|
6515
|
my ( $self, %args ) = @_;
|
660
|
7
|
|
|
|
|
13
|
my %patterns = ();
|
661
|
7
|
100
|
|
|
|
28
|
if ( hascontent( $args{'regex'} ) ) {
|
662
|
1
|
|
|
|
|
22
|
$patterns{'regex'} = qr/$args{'regex'}/xms;
|
663
|
|
|
|
|
|
|
}
|
664
|
7
|
100
|
|
|
|
60
|
if ( hascontent( $args{'cv_pattern'} ) ) {
|
665
|
5
|
|
|
|
|
48
|
my $tmp = q{};
|
666
|
5
|
|
|
|
|
29
|
my @c = split m//ms, uc( $args{'cv_pattern'} );
|
667
|
5
|
|
|
|
|
15
|
foreach (@c) {
|
668
|
36
|
100
|
|
|
|
71
|
$tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
|
669
|
|
|
|
|
|
|
}
|
670
|
5
|
|
|
|
|
269
|
$patterns{'cv_pattern'} = qr/^$tmp$/ixms;
|
671
|
|
|
|
|
|
|
}
|
672
|
|
|
|
|
|
|
|
673
|
7
|
|
|
|
|
30
|
my @list = ();
|
674
|
7
|
50
|
|
|
|
281
|
open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
|
675
|
|
|
|
|
|
|
LINES:
|
676
|
7
|
|
|
|
|
178
|
while (<$fh>) {
|
677
|
218
|
100
|
|
|
|
5406
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
678
|
211
|
|
|
|
|
927
|
my @line = split m/\Q$self->{'_DELIM'}\E/xms;
|
679
|
211
|
100
|
|
|
|
273
|
next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
|
|
211
|
|
|
|
|
371
|
|
680
|
157
|
|
|
|
|
280
|
for ( keys %patterns ) {
|
681
|
131
|
100
|
|
|
|
303
|
next LINES if unidecode( $line[0] ) !~ $patterns{$_};
|
682
|
|
|
|
|
|
|
}
|
683
|
36
|
|
|
|
|
186
|
for (qw/frq_count frq_opm frq_log frq_zipf cd_count cd_pct cd_log/) {
|
684
|
152
|
100
|
66
|
|
|
343
|
if (
|
685
|
|
|
|
|
|
|
ref $args{$_}
|
686
|
|
|
|
|
|
|
and hascontent(
|
687
|
|
|
|
|
|
|
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, $_ . '_idx' )
|
688
|
|
|
|
|
|
|
)
|
689
|
|
|
|
|
|
|
)
|
690
|
|
|
|
|
|
|
{
|
691
|
|
|
|
|
|
|
next LINES
|
692
|
|
|
|
|
|
|
if !_in_range(
|
693
|
|
|
|
|
|
|
_clean_value(
|
694
|
|
|
|
|
|
|
$line[
|
695
|
|
|
|
|
|
|
$self->{'_FIELDS'}
|
696
|
|
|
|
|
|
|
->datum( $self->{'_LANG'}, $_ . '_idx' )
|
697
|
|
|
|
|
|
|
]
|
698
|
|
|
|
|
|
|
),
|
699
|
32
|
100
|
|
|
|
601
|
@{ $args{$_} }
|
|
32
|
|
|
|
|
743
|
|
700
|
|
|
|
|
|
|
);
|
701
|
|
|
|
|
|
|
}
|
702
|
|
|
|
|
|
|
}
|
703
|
16
|
50
|
|
|
|
33
|
if ( ref $args{'pos'} ) {
|
704
|
|
|
|
|
|
|
next LINES
|
705
|
|
|
|
|
|
|
if none {
|
706
|
|
|
|
|
|
|
$_ eq $line[ $self->{'_FIELDS'}
|
707
|
0
|
|
|
0
|
|
0
|
->datum( $self->{'_LANG'}, 'pos_dom_idx' ) ]
|
708
|
|
|
|
|
|
|
}
|
709
|
0
|
0
|
|
|
|
0
|
@{ $args{'pos'} };
|
|
0
|
|
|
|
|
0
|
|
710
|
|
|
|
|
|
|
}
|
711
|
16
|
|
|
|
|
63
|
push @list, $line[0];
|
712
|
|
|
|
|
|
|
}
|
713
|
7
|
50
|
|
|
|
170
|
close $fh or croak $OS_ERROR;
|
714
|
|
|
|
|
|
|
|
715
|
7
|
|
|
|
|
73
|
return \@list;
|
716
|
|
|
|
|
|
|
}
|
717
|
|
|
|
|
|
|
*select_words = \&select_strings;
|
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
=head3 all_strings
|
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
$aref = $subtlex->all_strings();
|
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
I: all_words
|
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
Returns a reference to an array of all letter-strings in the corpus. These are culled of empty and duplicate strings, and then alphabetically sorted.
|
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=cut
|
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
sub all_strings {
|
730
|
1
|
|
|
1
|
1
|
2
|
my ( $self, %args ) = @_;
|
731
|
1
|
|
|
|
|
3
|
my @list = ();
|
732
|
1
|
50
|
|
|
|
37
|
open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
|
733
|
1
|
|
|
|
|
29
|
while (<$fh>) {
|
734
|
7
|
100
|
|
|
|
145
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
735
|
6
|
|
|
|
|
9
|
push @list, _get_file_str( $_, $self->{'_DELIM'} );
|
736
|
|
|
|
|
|
|
}
|
737
|
1
|
50
|
|
|
|
46
|
close $fh or croak $OS_ERROR;
|
738
|
1
|
|
|
|
|
6
|
return [ sort { lc($a) cmp lc($b) } uniq( grep { hascontent($_) } @list ) ];
|
|
8
|
|
|
|
|
29
|
|
|
6
|
|
|
|
|
34
|
|
739
|
|
|
|
|
|
|
}
|
740
|
|
|
|
|
|
|
*all_words = \&all_strings;
|
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=head3 random_string
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
$string = $subtlex->random_string();
|
745
|
|
|
|
|
|
|
@data = $subtlex->random_string();
|
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
I: random_word
|
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
Picks a random line from the corpus, using L (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifying a match to specific criteria, self-aborting after trying X lines.)
|
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
=cut
|
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
sub random_string {
|
754
|
2
|
|
|
2
|
1
|
489
|
my ( $self, %args ) = @_;
|
755
|
2
|
|
|
|
|
5
|
eval { require File::RandomLine; };
|
|
2
|
|
|
|
|
451
|
|
756
|
2
|
50
|
|
|
|
2794
|
croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
|
757
|
|
|
|
|
|
|
my $rl =
|
758
|
2
|
|
|
|
|
55
|
File::RandomLine->new( $self->{'_PATH'}, { algorithm => 'uniform' } );
|
759
|
2
|
|
|
|
|
342
|
my @ari = ();
|
760
|
2
|
|
66
|
|
|
9
|
while ( not scalar @ari or $ari[0] eq 'Word' ) {
|
761
|
2
|
|
|
|
|
17
|
@ari = split m/\Q$self->{'_DELIM'}\E/xms, $rl->next;
|
762
|
|
|
|
|
|
|
}
|
763
|
2
|
100
|
|
|
|
182
|
return wantarray ? @ari : $ari[0];
|
764
|
|
|
|
|
|
|
}
|
765
|
|
|
|
|
|
|
*random_word = \&random_string;
|
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
=head2 Miscellaneous
|
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=head3 n_lines
|
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
$num = $subtlex->n_lines();
|
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
Returns the number of lines, less the column headings and any lines with no content, in the installed language file. Expects/accepts no arguments.
|
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=cut
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
sub n_lines {
|
778
|
3
|
|
|
3
|
1
|
1486
|
my $self = shift;
|
779
|
3
|
|
|
|
|
8
|
my $z = 0;
|
780
|
3
|
50
|
|
|
|
119
|
open( my $fh, q{<}, $self->{'_PATH'} ) or croak $OS_ERROR;
|
781
|
3
|
|
|
|
|
46
|
while (<$fh>) {
|
782
|
94
|
100
|
|
|
|
159
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
783
|
91
|
50
|
|
|
|
129
|
next if nocontent($_);
|
784
|
91
|
|
|
|
|
755
|
$z++;
|
785
|
|
|
|
|
|
|
}
|
786
|
3
|
50
|
|
|
|
34
|
close $fh or croak $OS_ERROR;
|
787
|
3
|
|
|
|
|
17
|
return $z;
|
788
|
|
|
|
|
|
|
}
|
789
|
|
|
|
|
|
|
*nlines = \&n_lines; # legacy alias
|
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
=head3 pct_alpha
|
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
Returns the percentage of strings in the subtitles file that satisfy "look like words" relative to the number of lines (as per L). Specifically, after ASCII transliteration of the string (per L), does it match to /[\p{XPosixAlpha}\-']/ (per L, but including apostrophes and dashes)?
|
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
=cut
|
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
sub pct_alpha {
|
798
|
1
|
|
|
1
|
1
|
266
|
my ( $self, %args ) = @_;
|
799
|
1
|
|
|
|
|
3
|
my $all_strs_aref = $self->all_strings();
|
800
|
1
|
|
|
|
|
1
|
my $count_all = count( @{$all_strs_aref} );
|
|
1
|
|
|
|
|
4
|
|
801
|
1
|
|
|
|
|
17
|
my $pct = q{};
|
802
|
1
|
50
|
|
|
|
3
|
if ( $count_all > 0 ) {
|
803
|
17
|
|
|
17
|
|
145
|
my $count_alpha_strings = count( grep { m/[\p{XPosixAlpha}\-']/xsm }
|
|
17
|
|
|
|
|
62
|
|
|
17
|
|
|
|
|
286
|
|
|
5
|
|
|
|
|
31
|
|
804
|
1
|
|
|
|
|
2
|
map { unidecode($_) } @{$all_strs_aref} );
|
|
5
|
|
|
|
|
53
|
|
|
1
|
|
|
|
|
2
|
|
805
|
1
|
|
|
|
|
12
|
$pct = 100 * $count_alpha_strings / $count_all;
|
806
|
|
|
|
|
|
|
}
|
807
|
1
|
|
|
|
|
3
|
return $pct;
|
808
|
|
|
|
|
|
|
}
|
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
=head3 set_lang
|
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
$lang = $subtlex->set_lang(lang => STR); # DE, FR, NL_all, NL_min, PT, UK or US
|
813
|
|
|
|
|
|
|
$lang = $subtlex->set_lang(lang => STR, path => 'this/is/the/file.csv');
|
814
|
|
|
|
|
|
|
$lang = $subtlex->set_lang(lang => STR, dir => 'file/is/in/here');
|
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
Set or guess location of datafile; see L. Naturally, the given value of B (required)--which is used as a database ID--should correspond with any given B to the SUBTLEX datafile (optional but recommended). If only a B value is given, the SUBTLEX datafile should be named so that it uniquely includes the specific value of B.
|
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
=cut
|
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
sub set_lang {
|
821
|
25
|
|
|
25
|
1
|
424
|
my ( $self, %args ) = @_;
|
822
|
|
|
|
|
|
|
## firstly, establish the language to use, and the directory in which this module lives:
|
823
|
25
|
100
|
|
|
|
114
|
return if nocontent($args{'lang'});
|
824
|
24
|
|
|
|
|
456
|
$self->_set_spec_hash( $args{'fieldpath'} );
|
825
|
|
|
|
|
|
|
#@ is the complete pathname actually given in args?
|
826
|
24
|
50
|
|
|
|
127
|
croak 'Need a valid attribute' if not ref $self->{'_FIELDS'}->record( $args{'lang'} );
|
827
|
|
|
|
|
|
|
|
828
|
24
|
|
|
|
|
435
|
$self->{'_LANG'} = delete $args{'lang'};
|
829
|
|
|
|
|
|
|
|
830
|
24
|
50
|
|
|
|
102
|
if ( hascontent( $args{'path'} ) ) {
|
831
|
24
|
50
|
|
|
|
744
|
if ( !-e $args{'path'} ) {
|
832
|
0
|
|
|
|
|
0
|
croak
|
833
|
|
|
|
|
|
|
"Path given for SUBTLEX corpus does not exist: '$args{'path'}'";
|
834
|
|
|
|
|
|
|
}
|
835
|
|
|
|
|
|
|
else {
|
836
|
24
|
|
|
|
|
117
|
$self->{'_PATH'} = delete $args{'path'};
|
837
|
|
|
|
|
|
|
}
|
838
|
|
|
|
|
|
|
}
|
839
|
|
|
|
|
|
|
else {
|
840
|
0
|
|
|
|
|
0
|
my ( $lang, $dir, $path ) = ( $self->{'_LANG'} );
|
841
|
0
|
0
|
|
|
|
0
|
if ( $args{'dir'} ) { # check it's a dir:
|
842
|
|
|
|
|
|
|
croak "Value for argument 'dir' ($args{'dir'}) is not a directory"
|
843
|
0
|
0
|
|
|
|
0
|
if !-d $args{'dir'};
|
844
|
0
|
|
|
|
|
0
|
$dir = delete $args{'dir'};
|
845
|
|
|
|
|
|
|
}
|
846
|
|
|
|
|
|
|
else { # use module's dir :
|
847
|
0
|
|
|
|
|
0
|
$dir = $self->{'_MODULE_DIR'};
|
848
|
|
|
|
|
|
|
}
|
849
|
0
|
|
|
|
|
0
|
for ( path($dir)->children ) {
|
850
|
0
|
0
|
|
|
|
0
|
if (/(?:SUBTLEX[\-_])?\Q$lang/imsx) {
|
851
|
0
|
|
|
|
|
0
|
$path = $_;
|
852
|
0
|
|
|
|
|
0
|
last;
|
853
|
|
|
|
|
|
|
}
|
854
|
|
|
|
|
|
|
}
|
855
|
0
|
0
|
0
|
|
|
0
|
if ( nocontent($path) or not -T $path )
|
856
|
|
|
|
|
|
|
{ # only already defined if it exists
|
857
|
0
|
|
|
|
|
0
|
croak
|
858
|
|
|
|
|
|
|
"Cannot find required SUBTLEX datafile for language '$self->{'_LANG'}' within '$dir'.\nInstall the database (from the URL given in the POD) into either:\n\t(1) the Lingua/Norms/SUBTLEX directory within your Perl distribution (with the filename specified in the POD);\n\t(2) a directory you specify to new(dir => 'my/dir/to/lang/file') (again with the filename specified in the POD); or\n\t(3) a directory, specifying the complete path to that file in new(path => 'this/is/the/file.csv'), including its filename";
|
859
|
|
|
|
|
|
|
}
|
860
|
|
|
|
|
|
|
else {
|
861
|
0
|
|
|
|
|
0
|
$self->{'_PATH'} = $path;
|
862
|
|
|
|
|
|
|
}
|
863
|
|
|
|
|
|
|
}
|
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
$self->{'_DELIM'} =
|
866
|
24
|
|
|
|
|
122
|
get_separator( path => $self->{'_PATH'}, lucky => 1 );
|
867
|
|
|
|
|
|
|
|
868
|
24
|
|
|
|
|
6454
|
return $self->{'_LANG'};
|
869
|
|
|
|
|
|
|
}
|
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
=head3 get_lang
|
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
$str = $subtlex->get_lang();
|
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
Returns the language code (e.g., 'UK', 'FR') currently set for the module (which determines the file being looked up, if not explicitly given). The empty string is returned if the language has not been set.
|
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=cut
|
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
sub get_lang {
|
880
|
8
|
|
|
8
|
1
|
3543
|
my ( $self, %args ) = @_;
|
881
|
8
|
50
|
|
|
|
35
|
return hascontent( $self->{'_LANG'} ) ? $self->{'_LANG'} : q{};
|
882
|
|
|
|
|
|
|
}
|
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
=head3 get_path2db
|
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
$path = $subtlex->get_path2db();
|
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
Returns the path (directory and filename) from which the module's methods are currently set to look-up strings, frequencies, etc.
|
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
=cut
|
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
sub get_path2db {
|
893
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %args ) = @_;
|
894
|
0
|
|
|
|
|
0
|
return path( $self->{'_PATH'} )->stringify;
|
895
|
|
|
|
|
|
|
}
|
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
=head3 get_index
|
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
$int = $subtlex->get_index(measure => 'frq_opm');
|
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
Returns the index within the currently looked-up file that contains the given B.
|
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
=cut
|
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
sub get_index {
|
906
|
5
|
|
|
5
|
1
|
2657
|
my ( $self, %args ) = @_;
|
907
|
5
|
50
|
|
|
|
18
|
my $var = delete $args{'measure'} or croak 'Need a named measure';
|
908
|
5
|
|
|
|
|
27
|
return $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $var . '_idx' )
|
909
|
|
|
|
|
|
|
; #{$var};
|
910
|
|
|
|
|
|
|
}
|
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
=head3 set_eq
|
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
$subtlex->set_eq(match_level => INT); # undef, 0, 1, 2 or 3
|
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
See L.
|
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
=head3 url2datafile
|
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
$url = $subtlex->url2datafile(lang => STRING);
|
921
|
|
|
|
|
|
|
%loc = $subtlex->url2datafile(lang => STRING);
|
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
Returns the URL (complete path) where the SUBTLEX file for a given language is stored, and from which it should be downloadable. These are locations as specified (at the time of releasing this version of the module) at L and/or L, and so as listed in the L section. This could include an archive within which the file needs to be retrieved. Called in list context, this method returns a hash with keys for 'www_dir', 'archive' (if the file is within an archive) and 'filename'. (This module does not fetch the file off the WWW itself; it should be installed and available locally--see L).
|
924
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
=cut
|
926
|
|
|
|
|
|
|
|
927
|
|
|
|
0
|
1
|
|
sub url2datafile {
|
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
#my ($self, %args) = @_;
|
930
|
|
|
|
|
|
|
#croak 'A value for the argument needs to be given for SUBTLEX url2datafile' if nocontent($args{'lang'});
|
931
|
|
|
|
|
|
|
#my $lang = delete $args{'lang'};
|
932
|
|
|
|
|
|
|
#croak "The value for the argument => $lang is not recognised" if none { $_ => $lang } (qw/UK US NL DE/);
|
933
|
|
|
|
|
|
|
# Hard-copy of WWW dirs, archives (where rel) and filenames for the SUBTLEX files:
|
934
|
|
|
|
|
|
|
## some datafiles are within compressed archives, some not, so ...
|
935
|
|
|
|
|
|
|
#my %req_filespecs = %{$path_hash{$lang}};
|
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
#return wantarray ? %req_filespecs : File::Spec->catfile($req_filespecs{$lang}->{'www_dir'}, $req_filespecs{$lang}->{'archive'}, $req_filespecs{$lang}->{'file'});
|
938
|
|
|
|
|
|
|
}
|
939
|
|
|
|
|
|
|
|
940
|
|
|
|
|
|
|
### PRIVATMETHODEN:
|
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
sub _get_usr_str {
|
943
|
118
|
|
|
118
|
|
193
|
my $str = shift;
|
944
|
118
|
50
|
|
|
|
313
|
croak 'No string to test; pass a value for to the requested method'
|
945
|
|
|
|
|
|
|
if nocontent($str);
|
946
|
118
|
|
|
|
|
1497
|
return $str;
|
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
#return decode( 'UTF-8', $str );#
|
949
|
|
|
|
|
|
|
}
|
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
# Given a line from a SUBTLEX file, return all the characters from the start of the line up to the delimiter for that file, after stripping it of any quote characters - e.g., if the line starts: "abacus",20,30 ... and the delimiter is a comma, return: abacus
|
952
|
|
|
|
|
|
|
sub _get_file_str {
|
953
|
1680
|
|
|
1680
|
|
2999
|
my ( $line, $delim ) = @_;
|
954
|
1680
|
|
|
|
|
5977
|
$line =~ /^([^\Q$delim\E]+)/xms;
|
955
|
1680
|
|
|
|
|
3186
|
return trim( unquote($1) );
|
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
#my $str = decode('UTF-8', trim(unquote($1)) );
|
958
|
|
|
|
|
|
|
#print STDERR "<$str>\n";
|
959
|
|
|
|
|
|
|
#return $str;
|
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
#my $code = guess_encoding($str, qw/ascii utf8 utf16 iso-8859-1 cp1250 latin1 greek/);
|
962
|
|
|
|
|
|
|
#print STDERR "$str\t", $code->decode($str), "\n";
|
963
|
|
|
|
|
|
|
#return $code->decode($str);
|
964
|
|
|
|
|
|
|
}
|
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
sub _get_val_for_str {
|
967
|
103
|
|
|
103
|
|
237
|
my ( $str, $col_i, $path, $delim, $eq_fn ) = @_;
|
968
|
103
|
50
|
|
|
|
199
|
croak
|
969
|
|
|
|
|
|
|
'No word to test; pass a letter-string named \'string\' to the function'
|
970
|
|
|
|
|
|
|
if nocontent($str);
|
971
|
103
|
50
|
|
|
|
906
|
croak "The requested value is not defined for the current SUBTLEX corpus"
|
972
|
|
|
|
|
|
|
if nocontent($col_i);
|
973
|
|
|
|
|
|
|
|
974
|
103
|
|
|
|
|
858
|
my $val = q{}; # default value returned is empty string
|
975
|
103
|
50
|
|
|
|
3889
|
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
|
976
|
103
|
|
|
|
|
1448
|
while (<$fh>) {
|
977
|
1522
|
100
|
|
|
|
81826
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
978
|
1419
|
|
|
|
|
2025
|
my $file_str =
|
979
|
|
|
|
|
|
|
_get_file_str( $_, $delim ); # have to declare as can be empty (!)
|
980
|
1419
|
50
|
|
|
|
32686
|
next if nocontent($file_str);
|
981
|
1419
|
100
|
|
|
|
12315
|
if ( $eq_fn->( $str, $file_str ) ) {
|
982
|
102
|
|
|
|
|
6409
|
$val = _get_val( $_, $delim, $col_i );
|
983
|
102
|
|
|
|
|
2576
|
last;
|
984
|
|
|
|
|
|
|
}
|
985
|
|
|
|
|
|
|
}
|
986
|
103
|
50
|
|
|
|
1664
|
close $fh or croak $OS_ERROR;
|
987
|
103
|
|
|
|
|
683
|
return $val;
|
988
|
|
|
|
|
|
|
}
|
989
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
sub _get_val {
|
991
|
102
|
|
|
102
|
|
218
|
my ( $line, $delim, $col_i ) = @_;
|
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# if the line has quoted fields, and uses the delimiter within the quotes,
|
994
|
|
|
|
|
|
|
# as in SUBTLEX-PT, need to firstly clean the line up:
|
995
|
|
|
|
|
|
|
# this "fix" assumes the quotes are either double- or single quotes and nothing else,
|
996
|
|
|
|
|
|
|
# and there is no trailing delimiter.
|
997
|
|
|
|
|
|
|
# It strips the quotes, and replaces the comma with a vertical bar:
|
998
|
102
|
|
|
|
|
822
|
$line =~ s/["']([^"'\Q$delim\E]+)\Q$delim\E([^"'\Q$delim\E]+)["']/$1|$2/gxsm;
|
999
|
|
|
|
|
|
|
|
1000
|
102
|
|
|
|
|
2205
|
$line =~ m/^(
|
1001
|
|
|
|
|
|
|
[^\Q$delim\E]* # any character from the start not including the delimiter (which might be \t)
|
1002
|
|
|
|
|
|
|
\Q$delim\E # now ending with the delimiter, perhaps as a quoted string
|
1003
|
|
|
|
|
|
|
)
|
1004
|
|
|
|
|
|
|
{$col_i,}? # as many times as necessary to get to the required field value
|
1005
|
|
|
|
|
|
|
([^\Q$delim\E]*) # which should be here
|
1006
|
|
|
|
|
|
|
/msx;
|
1007
|
102
|
|
|
|
|
325
|
return _clean_value($2); # now format the number, strip space ...
|
1008
|
|
|
|
|
|
|
}
|
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
sub _get_val_for_strs {
|
1011
|
3
|
|
|
3
|
|
7
|
my ( $str, $col_i_aref, $path, $delim, $eq_fn ) = @_;
|
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
# Check we have a string, and valid filed indices:
|
1014
|
3
|
50
|
|
|
|
7
|
croak
|
1015
|
|
|
|
|
|
|
'No word to test; pass a letter-string named \'string\' to the function'
|
1016
|
|
|
|
|
|
|
if nocontent($str);
|
1017
|
|
|
|
|
|
|
croak "The requested value is not defined for the SUBTLEX corpus"
|
1018
|
3
|
50
|
|
6
|
|
31
|
if any { nocontent($_) } @{$col_i_aref};
|
|
6
|
|
|
|
|
30
|
|
|
3
|
|
|
|
|
11
|
|
1019
|
|
|
|
|
|
|
|
1020
|
3
|
|
|
|
|
31
|
my $val = [];
|
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
# Search for the string, and isolate the requested values:
|
1023
|
3
|
50
|
|
|
|
123
|
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
|
1024
|
3
|
|
|
|
|
62
|
while (<$fh>) {
|
1025
|
37
|
100
|
|
|
|
162
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
1026
|
34
|
|
|
|
|
47
|
my $file_str =
|
1027
|
|
|
|
|
|
|
_get_file_str( $_, $delim ); # have to declare as can be empty (!)
|
1028
|
34
|
50
|
|
|
|
746
|
next if nocontent($file_str);
|
1029
|
34
|
100
|
|
|
|
284
|
if ( $eq_fn->( $str, $file_str ) ) {
|
1030
|
3
|
|
|
|
|
32
|
my @line = split m/\Q$delim\E/xms;
|
1031
|
3
|
|
|
|
|
15
|
for my $col_i ( @{$col_i_aref} ) {
|
|
3
|
|
|
|
|
12
|
|
1032
|
6
|
|
|
|
|
70
|
push @{$val}, _clean_value( $line[$col_i] );
|
|
6
|
|
|
|
|
16
|
|
1033
|
|
|
|
|
|
|
}
|
1034
|
3
|
|
|
|
|
69
|
last;
|
1035
|
|
|
|
|
|
|
}
|
1036
|
|
|
|
|
|
|
}
|
1037
|
3
|
50
|
|
|
|
49
|
close $fh or croak;
|
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
# return the reference to array if there is more than 1 value, otherwise just the single value itself
|
1040
|
|
|
|
|
|
|
## but if the string itself was not found, return the empty string for the number of requested fields:
|
1041
|
3
|
|
|
|
|
13
|
my $n_vals = scalar grep { hascontent($_) } @{$val};
|
|
6
|
|
|
|
|
32
|
|
|
3
|
|
|
|
|
7
|
|
1042
|
|
|
|
|
|
|
return
|
1043
|
|
|
|
|
|
|
$n_vals
|
1044
|
|
|
|
|
|
|
? $n_vals > 1
|
1045
|
|
|
|
|
|
|
? $val
|
1046
|
|
|
|
|
|
|
: $val->[0]
|
1047
|
3
|
50
|
|
|
|
47
|
: scalar @{$col_i_aref} > 1 ? [ q{} x scalar @{$col_i_aref} ]
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
1048
|
|
|
|
|
|
|
: q{};
|
1049
|
|
|
|
|
|
|
}
|
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
sub _get_any_vals_for_string_list {
|
1052
|
1
|
|
|
1
|
|
4
|
my ( $str_aref, $col_i_href, $path, $delim, $eq_fn ) = @_;
|
1053
|
1
|
|
|
|
|
2
|
my %string_vals = ();
|
1054
|
1
|
|
|
|
|
2
|
my @usr_strings = sort { $a cmp $b } @{$str_aref};
|
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
4
|
|
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
# Search for the string, and isolate the requested values:
|
1057
|
1
|
50
|
|
|
|
40
|
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
|
1058
|
1
|
|
|
|
|
17
|
while (<$fh>) {
|
1059
|
5
|
100
|
|
|
|
28
|
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
|
1060
|
4
|
|
|
|
|
9
|
my $file_str =
|
1061
|
|
|
|
|
|
|
_get_file_str( $_, $delim ); # have to declare as can be empty (!)
|
1062
|
4
|
50
|
|
|
|
135
|
next if nocontent($file_str);
|
1063
|
4
|
100
|
|
7
|
|
50
|
if ( my $found = first { $eq_fn->( $_, $file_str ) } @usr_strings ) {
|
|
7
|
|
|
|
|
21
|
|
1064
|
2
|
|
|
|
|
33
|
my @line = split m/\Q$delim\E/xms; # split the line
|
1065
|
2
|
|
|
|
|
5
|
for my $col_i ( keys %{$col_i_href} ) {
|
|
2
|
|
|
|
|
7
|
|
1066
|
6
|
|
|
|
|
146
|
$string_vals{$file_str}->{ $col_i_href->{$col_i} } =
|
1067
|
|
|
|
|
|
|
_clean_value( $line[$col_i] );
|
1068
|
|
|
|
|
|
|
}
|
1069
|
2
|
100
|
|
|
|
43
|
last if scalar keys %string_vals == scalar @{$str_aref};
|
|
2
|
|
|
|
|
9
|
|
1070
|
1
|
|
|
2
|
|
11
|
splice @usr_strings, ( firstidx { $_ eq $found } @usr_strings ), 1;
|
|
2
|
|
|
|
|
10
|
|
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
#print STDERR "checking ",join(q{,}, @usr_strings),"\n";
|
1073
|
|
|
|
|
|
|
}
|
1074
|
|
|
|
|
|
|
}
|
1075
|
1
|
50
|
|
|
|
18
|
close $fh or croak;
|
1076
|
1
|
|
|
|
|
12
|
return \%string_vals;
|
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
# return the reference to array if there is more than 1 value, otherwise just the single value itself
|
1079
|
|
|
|
|
|
|
## but if the string itself was not found, return the empty string for the number of requested fields:
|
1080
|
|
|
|
|
|
|
#my $n_vals = scalar grep { hascontent($_) } @{$val};
|
1081
|
|
|
|
|
|
|
#return $n_vals ? $n_vals > 1 ? $val : $val->[0] : scalar @{$col_i_aref} > 1 ? [q{} x scalar @{$col_i_aref}] : q{};
|
1082
|
|
|
|
|
|
|
}
|
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
# Loads a hash-ref of the "specs" for each language file, including the field indices in each file for the measures they contain:
|
1085
|
|
|
|
|
|
|
## Called only by new() after setting the MODULE_DIR
|
1086
|
|
|
|
|
|
|
sub _set_spec_hash {
|
1087
|
24
|
|
|
24
|
|
76
|
my ( $self, $fieldpath ) = @_;
|
1088
|
24
|
|
33
|
|
|
62
|
$fieldpath ||= File::Spec->catfile( $self->{'_MODULE_DIR'}, 'specs.csv' );
|
1089
|
24
|
|
|
|
|
204
|
$self->{'_FIELDS'} = Text::CSV::Hashify->new(
|
1090
|
|
|
|
|
|
|
{ file => $fieldpath, format => 'hoh', key => 'Lang_stub' } );
|
1091
|
24
|
|
|
|
|
40402
|
return;
|
1092
|
|
|
|
|
|
|
}
|
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
sub _in_range {
|
1095
|
243
|
|
|
243
|
|
356
|
my ( $n, $min, $max ) = @_;
|
1096
|
243
|
|
|
|
|
265
|
my $res = 1;
|
1097
|
243
|
50
|
|
|
|
398
|
if ( !is_numeric($n) ) {
|
1098
|
0
|
|
|
|
|
0
|
$res = 0;
|
1099
|
|
|
|
|
|
|
}
|
1100
|
|
|
|
|
|
|
else {
|
1101
|
243
|
100
|
100
|
|
|
3241
|
if ( hascontent($min) and $n < $min ) { # fails min
|
1102
|
48
|
|
|
|
|
377
|
$res = 0;
|
1103
|
|
|
|
|
|
|
}
|
1104
|
243
|
100
|
100
|
|
|
1259
|
if ( $res && ( hascontent($max) and $n > $max ) ) { # fails max and min
|
|
|
|
100
|
|
|
|
|
1105
|
26
|
|
|
|
|
198
|
$res = 0;
|
1106
|
|
|
|
|
|
|
}
|
1107
|
|
|
|
|
|
|
}
|
1108
|
243
|
|
|
|
|
1214
|
return $res;
|
1109
|
|
|
|
|
|
|
}
|
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
sub _clean_value {
|
1112
|
146
|
|
|
146
|
|
620
|
my $val = shift;
|
1113
|
146
|
50
|
|
|
|
356
|
return q{} if nocontent($val);
|
1114
|
146
|
|
|
|
|
1427
|
$val =~ s/,([^,]+)$/.$1/xsm; # replace ultimate , with .
|
1115
|
146
|
|
|
|
|
338
|
return trim( unquote($val) );
|
1116
|
|
|
|
|
|
|
}
|
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
sub _pos_is {
|
1119
|
12
|
|
|
12
|
|
499
|
my ( $pos_aref, $fields, $lang ) = @_;
|
1120
|
12
|
50
|
|
|
|
35
|
$pos_aref = [$pos_aref] if !ref $pos_aref;
|
1121
|
12
|
|
|
|
|
17
|
my @test_str = map { split /[\W\.]+/xsm } @{$pos_aref};
|
|
12
|
|
|
|
|
31
|
|
|
12
|
|
|
|
|
22
|
|
1122
|
12
|
50
|
|
|
|
28
|
return [qw/UK/] if !scalar @test_str;
|
1123
|
12
|
|
|
|
|
16
|
my @pos_ari = ();
|
1124
|
12
|
|
|
|
|
22
|
for my $pos_str (@test_str) {
|
1125
|
|
|
|
|
|
|
push @pos_ari, first {
|
1126
|
|
|
|
|
|
|
hascontent( $fields->datum( $lang, 'pos_' . $_ ) )
|
1127
|
47
|
100
|
|
47
|
|
189
|
and first { $_ =~ m/^$pos_str$/xsm }(split /\|/, $fields->datum( $lang, 'pos_' . $_ ))
|
|
46
|
|
|
|
|
1443
|
|
1128
|
|
|
|
|
|
|
}
|
1129
|
12
|
|
|
|
|
62
|
qw/NN VB AJ AV CJ PN PP DA NM IJ NB OT UK/;
|
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
}
|
1132
|
12
|
|
|
|
|
54
|
return \@pos_ari;
|
1133
|
|
|
|
|
|
|
}
|
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
sub _log10 {
|
1136
|
0
|
|
|
0
|
|
0
|
return log(shift) / log(10);
|
1137
|
|
|
|
|
|
|
}
|
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
sub _val_or_0 {
|
1140
|
46
|
|
|
46
|
|
78
|
my $val = shift;
|
1141
|
46
|
100
|
|
|
|
118
|
return ( is_numeric($val) ) ? $val : 0;
|
1142
|
|
|
|
|
|
|
}
|
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
sub _croak_defunct {
|
1145
|
0
|
|
|
0
|
|
|
croak
|
1146
|
|
|
|
|
|
|
'That method is defunct. See the POD for an alternative, and the CHANGES file';
|
1147
|
|
|
|
|
|
|
}
|
1148
|
|
|
|
|
|
|
*freqhash = \&_croak_defunct;
|
1149
|
|
|
|
|
|
|
*ldist = \&_croak_defunct;
|
1150
|
|
|
|
|
|
|
*on_count = \&_croak_defunct;
|
1151
|
|
|
|
|
|
|
*on_ldist = \&_croak_defunct;
|
1152
|
|
|
|
|
|
|
*on_freq_max = \&_croak_defunct;
|
1153
|
|
|
|
|
|
|
*on_zipf_mean = \&_croak_defunct;
|
1154
|
|
|
|
|
|
|
*on_freq_mean = \&_croak_defunct;
|
1155
|
|
|
|
|
|
|
*on_lfreq_mean = \&_croak_defunct;
|
1156
|
|
|
|
|
|
|
*on_frq_opm_max = \&_croak_defunct;
|
1157
|
|
|
|
|
|
|
*on_frq_opm_max = \&_croak_defunct;
|
1158
|
|
|
|
|
|
|
*on_frq_zipf_mean = \&_croak_defunct;
|
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
sub _carp_deprecated {
|
1161
|
0
|
|
|
0
|
|
|
my ( $self, %args ) = @_;
|
1162
|
0
|
|
|
|
|
|
carp
|
1163
|
|
|
|
|
|
|
'That method is deprecated. See the POD for an alternative, and the CHANGES file';
|
1164
|
0
|
|
|
|
|
|
return;
|
1165
|
|
|
|
|
|
|
}
|
1166
|
|
|
|
|
|
|
*list_words = \&_carp_deprecated;
|
1167
|
|
|
|
|
|
|
*list_strings = \&_carp_deprecated;
|
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
=over 4
|
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
=item * Need a valid attribute
|
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
When constructing the class object with L, the B argument must have a valid value, as indicated in the table above. Also, the module needs to read in the contents of a file named "specs.csv" which should be located within the SUBTLEX directory where the module itself is located (alongside the downloaded SUBTLEX files). This file specifies the field indices for the various stats within each SUBTLEX datafile. Check that this file is indeed within the Perl/site/lib/Lingua/Norms/SUBTLEX directory. If it is not, download and install the file to that location via the L package of this module.
|
1176
|
|
|
|
|
|
|
|
1177
|
|
|
|
|
|
|
=item * Value given to argument 'dir' (VALUE) in new() is not a directory
|
1178
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
Croaked from L if called with a value for the argument B, and this value is not actually a directory/folder. This is the directory/folder in which the actual SUBTLEX datafiles should be located.
|
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
=item * Cannot find required database for language ...
|
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
Croaked from L if none of the given values to arguments B, B or B are valid, and even the default site/lib directory and US database are not accessible. Check that your have indeed a file with the given value of B (DE, NL, UK or US) within the Perl/site/lib/Lingua/Norms/SUBTLEX directory, or at least that the SUBTLEX-US file is located within it, and can be read via your script.
|
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
=item * Cannot determine fields for given language
|
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
Croaked upon construction if no fields are recognized for the given language. The value given to B must be one of DE, NL, UK or US.
|
1188
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
=item * The requested value is not defined for the ... SUBTLEX corpus
|
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
Croaked when calling for a value for a statistic that is not defined for a given language, e.g., when requesting a value for the Zipf frequency in the NL corpus.
|
1192
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
=item * No string to test; pass a value for to FUNCTION()
|
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
Croaked by several methods that expect a value for the named argument B, and when no such value is given. These methods require the letter-string to be passed to it as a I => I pair, with the key B followed by the value of the string to test.
|
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
=item * No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array
|
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
Same as above but specifically croaked by L which accepts more than one string in a single call.
|
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
=item * Need to install and have access to module File::RandomLine
|
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
Croaked by method L if the module it depends on (File::RandomLine) is not installed or accessible. This should have been installed (if not already) upon installation of the present module. See L to download and install this module manually.
|
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=back
|
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
L : for L
|
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
L : for C method
|
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
L : C, C, C, C and other functions
|
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
L : C
|
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
L : for directory reading when calling L
|
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
L : for various statistical methods
|
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
L : C
|
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
L : for determining valid string values
|
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
L : reads in the specs file
|
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
L : for determining the field delimiter within the datafiles
|
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
L : for plain ASCII transliterations of Unicode text
|
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
=head1 REFERENCES
|
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A.M., Boelte, J., & Boehl, A. (2011). The word frequency effect: A review of recent developments and implications for the choice of frequency estimates in German. I, I<58>, 412-424. doi: L<10.1027/1618-3169/a000123|http://dx.doi.org/10.1027/1618-3169/a000123>
|
1234
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
Brysbaert, M., & New, B. (2009). Moving beyond Kucera and Francis: A critical evaluation of current word frequency norms and the introduction of a new and improved word frequency measure for American English. I, I<41>, 977-990. doi: L<10.3758/BRM.41.4.977|http://dx.doi.org/10.3758/BRM.41.4.977>
|
1236
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
Brysbaert, M., New, B., & Keuleers,E. (2012). Adding part-of-speech information to the SUBTLEX-US word frequencies. I, I<44>, 991-997. doi: L<10.3758/s13428-012-0190-4|http://dx.doi.org/10.3758/s13428-012-0190-4>
|
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
Herdagdelen, A., & Marelli, M. (2017). Social media and language processing: How Facebook and Twitter provide the best frequency estimates for studying word recognition. I, I<41>, 976-995. doi:L<10.1111/cogs.12392|http://dx.doi.org/10.1111/cogs.12392>
|
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
Keuleers, E., Brysbaert, M., & New, B. (2010). SUBTLEX-NL: A new frequency measure for Dutch words based on film subtitles. I, I<42>, 643-650. doi: L<10.3758/BRM.42.3.643|http://dx.doi.org/10.3758/BRM.42.3.643>
|
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
New, B., Brysbaert, M., Veronis, J., & Pallier, C. (2007). The use of film subtitles to estimate word frequencies. I, I<28>, 661-677.
|
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
Soares, A. P., Machado, J., Costa, A., Comesaña, M., & Perea, M. (in press). On the advantages of frequency measures extracted from subtitles: The case of Portuguese. I.
|
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
Van Heuven, W. J. B., Mandera, P., Keuleers, E., & Brysbaert, M. (2014). SUBTLEX-UK: A new and improved word frequency database for British English. I, I<67>, 1176-1190. doi: L<10.1080/17470218.2013.850521|http://dx.doi.org/10.1080/17470218.2013.850521>
|
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
=head1 AUTHOR
|
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
Roderick Garton, C<< >>
|
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through
|
1256
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
|
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
=head1 SUPPORT
|
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command.
|
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
perldoc Lingua::Norms::SUBTLEX
|
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
You can also look for information at:
|
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
=over 4
|
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here)
|
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
L
|
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation
|
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
L
|
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
=item * CPAN Ratings
|
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
L
|
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=item * Search CPAN
|
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
L
|
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
=back
|
1285
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT
|
1287
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
Copyright 2014-2018 Roderick Garton.
|
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
1291
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published
|
1292
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License.
|
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
See L for more information.
|
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
To the maximum extent permitted by applicable law, the author of this module disclaims all warranties, either express or implied, including but not limited to implied warranties of merchantability and fitness for a particular purpose, with regard to the software and the accompanying documentation.
|
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
=cut
|
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
1; # End of Lingua::Norms::SUBTLEX
|