line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## |
3
|
|
|
|
|
|
|
## File: DiaColloDB::Profile.pm |
4
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
5
|
|
|
|
|
|
|
## Description: collocation db, (co-)frequency profile |
6
|
|
|
|
|
|
|
## + for scoring heuristics, see: |
7
|
|
|
|
|
|
|
## |
8
|
|
|
|
|
|
|
## - Jörg Didakowski; Alexander Geyken, 2013. From DWDS corpora to a German Word Profile – methodological problems and solutions. |
9
|
|
|
|
|
|
|
## In: Network Strategies, Access Structures and Automatic Extraction of Lexicographical Information. 2nd Work Report of the |
10
|
|
|
|
|
|
|
## Academic Network "Internet Lexicography". Mannheim: Institut für Deutsche Sprache. (OPAL - Online publizierte Arbeiten zur |
11
|
|
|
|
|
|
|
## Linguistik X/2012), S. 43-52. |
12
|
|
|
|
|
|
|
## URL http://www.dwds.de/static/website/publications/pdf/didakowski_geyken_internetlexikografie_2012_final.pdf |
13
|
|
|
|
|
|
|
## |
14
|
|
|
|
|
|
|
## - Rychlý, P. 2008. `A lexicographer-friendly association score'. In P. Sojka and A. Horák (eds.) Proceedings of Recent |
15
|
|
|
|
|
|
|
## Advances in Slavonic Natural Language Processing. RASLAN 2008, 69. |
16
|
|
|
|
|
|
|
## URL http://www.muni.cz/research/publications/937193 , http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf |
17
|
|
|
|
|
|
|
## |
18
|
|
|
|
|
|
|
## - Kilgarriff, A. and Tugwell, D. 2002. `Sketching words'. In M.-H. Corréard (ed.) Lexicography and Natural |
19
|
|
|
|
|
|
|
## Language Processing: A Festschrift in Honour of B. T. S. Atkins. EURALEX, 125-137. |
20
|
|
|
|
|
|
|
## URL http://www.kilgarriff.co.uk/Publications/2002-KilgTugwell-AtkinsFest.pdf |
21
|
|
|
|
|
|
|
## |
22
|
|
|
|
|
|
|
## - Evert, Stefan (2008). "Corpora and collocations." In A. Lüdeling and M. Kytö (eds.), |
23
|
|
|
|
|
|
|
## Corpus Linguistics. An International Handbook, article 58, pages 1212-1248. |
24
|
|
|
|
|
|
|
## Mouton de Gruyter, Berlin. |
25
|
|
|
|
|
|
|
## URL (extended manuscript): http://purl.org/stefan.evert/PUB/Evert2007HSK_extended_manuscript.pdf |
26
|
|
|
|
|
|
|
## |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
package DiaColloDB::Profile; |
29
|
1
|
|
|
1
|
|
9
|
use DiaColloDB::Utils qw(:math :html); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
84
|
|
30
|
1
|
|
|
1
|
|
416
|
use DiaColloDB::Persistent; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
31
|
1
|
|
|
1
|
|
587
|
use DiaColloDB::Profile::Diff; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
52
|
|
32
|
1
|
|
|
1
|
|
9
|
use IO::File; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
40
|
|
33
|
1
|
|
|
1
|
|
191
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
542
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
#use overload |
36
|
|
|
|
|
|
|
# #fallback => 0, |
37
|
|
|
|
|
|
|
# bool => sub {defined($_[0])}, |
38
|
|
|
|
|
|
|
# int => sub {$_[0]{N}}, |
39
|
|
|
|
|
|
|
# '+' => \&add, |
40
|
|
|
|
|
|
|
# '+=' => \&_add, |
41
|
|
|
|
|
|
|
# '-' => \&diff, |
42
|
|
|
|
|
|
|
# '-=' => \&_diff; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
##============================================================================== |
46
|
|
|
|
|
|
|
## Globals & Constants |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Persistent); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
##============================================================================== |
51
|
|
|
|
|
|
|
## Constructors etc. |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
## $prf = CLASS_OR_OBJECT->new(%args) |
54
|
|
|
|
|
|
|
## + %args, object structure: |
55
|
|
|
|
|
|
|
## ( |
56
|
|
|
|
|
|
|
## label => $label, ##-- string label (used by Multi; undef for none(default)) |
57
|
|
|
|
|
|
|
## N => $N, ##-- total marginal relation frequency |
58
|
|
|
|
|
|
|
## f1 => $f1, ##-- total marginal frequency of target word(s) |
59
|
|
|
|
|
|
|
## f2 => \%f2, ##-- total marginal frequency of collocates: ($i2=>$f2, ...) |
60
|
|
|
|
|
|
|
## f12 => \%f12, ##-- collocation frequencies, %f12 = ($i2=>$f12, ...) |
61
|
|
|
|
|
|
|
## titles => \@titles, ##-- item group titles (default:undef: unknown) |
62
|
|
|
|
|
|
|
## # |
63
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) #0.5 |
64
|
|
|
|
|
|
|
## score => $func, ##-- selected scoring function qw(f fm lf lfm mi1 mi3 milf ld ll) |
65
|
|
|
|
|
|
|
## milf => \%milf_12, ##-- score: mutual information * logFreq a la Wortprofil; requires compile_milf() |
66
|
|
|
|
|
|
|
## mi1 => \%mi1_12, ##-- score: mutual information; requires compile_mi1() |
67
|
|
|
|
|
|
|
## mi3 => \%mi3_12, ##-- score: mutual information^3 a la Rychly (2008); requires compile_mi3() |
68
|
|
|
|
|
|
|
## ld => \%ld_12, ##-- score: log-dice a la Wortprofil; requires compile_ld() |
69
|
|
|
|
|
|
|
## ll => \%ll_12, ##-- score: 1-sided log-likelihood a la Evert (2008); requires compile_ll() |
70
|
|
|
|
|
|
|
## fm => \%fm_12, ##-- frequency per million score; requires compile_fm() |
71
|
|
|
|
|
|
|
## lf => \%lf_12, ##-- log-frequency ; requires compile_lf() |
72
|
|
|
|
|
|
|
## lfm => \%lfm_12, ##-- log-frequency per million; requires compile_lfm() |
73
|
|
|
|
|
|
|
## ) |
74
|
|
|
|
|
|
|
sub new { |
75
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
76
|
0
|
|
0
|
|
|
|
my $prf = bless({ |
77
|
|
|
|
|
|
|
#label=>undef, |
78
|
|
|
|
|
|
|
N=>1, |
79
|
|
|
|
|
|
|
f1=>0, |
80
|
|
|
|
|
|
|
f2=>{}, |
81
|
|
|
|
|
|
|
f12=>{}, |
82
|
|
|
|
|
|
|
eps=>0, #0.5, |
83
|
|
|
|
|
|
|
#titles=>undef, |
84
|
|
|
|
|
|
|
#ld=>{}, |
85
|
|
|
|
|
|
|
@_ |
86
|
|
|
|
|
|
|
}, (ref($that)||$that)); |
87
|
0
|
|
|
|
|
|
return $prf; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
## $prf2 = $prf->clone() |
91
|
|
|
|
|
|
|
## $prf2 = $prf->clone($keep_compiled) |
92
|
|
|
|
|
|
|
## + clones %$mp |
93
|
|
|
|
|
|
|
## + if $keep_score is true, compiled data is cloned too |
94
|
|
|
|
|
|
|
sub clone { |
95
|
0
|
|
|
0
|
1
|
|
my ($prf,$force) = @_; |
96
|
|
|
|
|
|
|
return bless({ |
97
|
0
|
0
|
|
|
|
|
(map {defined($prf->{$_}) ? ($_=>$prf->{$_}) : qw()} qw(label N f1 eps)), |
98
|
0
|
0
|
|
|
|
|
(map {defined($prf->{$_}) ? ($_=>[@{$prf->{$_}}]) : qw()} qw(titles)), |
|
0
|
|
|
|
|
|
|
99
|
0
|
0
|
|
|
|
|
(map {defined($prf->{$_}) ? ($_=>{%{$prf->{$_}}}) : qw()} qw(f2 f12)), |
|
0
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
($force |
101
|
|
|
|
|
|
|
? ( |
102
|
|
|
|
|
|
|
($prf->{score} ? (score=>$prf->{score}) : qw()), |
103
|
0
|
0
|
|
|
|
|
(map {$prf->{$_} ? ($_=>{%{$prf->{$_}}}) : qw()} $prf->scoreKeys), |
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
104
|
|
|
|
|
|
|
) |
105
|
|
|
|
|
|
|
: qw()), |
106
|
|
|
|
|
|
|
}, ref($prf)); |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
## $prf2 = $prf->shadow() |
110
|
|
|
|
|
|
|
## $prf2 = $prf->shadow($keep_compiled) |
111
|
|
|
|
|
|
|
## + shadows %$mp |
112
|
|
|
|
|
|
|
## + if $keep_score is true, compiled data is shadowed too (all zeroes) |
113
|
|
|
|
|
|
|
sub shadow { |
114
|
0
|
|
|
0
|
1
|
|
my $prf = $_[0]->clone($_[1]); |
115
|
0
|
|
|
|
|
|
$prf->{f1} = $prf->{N} = 0; |
116
|
0
|
|
|
|
|
|
foreach my $key (grep {defined($prf->{$_})} (qw(f2 f12),$prf->scoreKeys)) { |
|
0
|
|
|
|
|
|
|
117
|
0
|
|
|
|
|
|
$_ = 0 foreach (%{$prf->{$key}}); |
|
0
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
} |
119
|
0
|
|
|
|
|
|
return $prf; |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
##============================================================================== |
125
|
|
|
|
|
|
|
## Basic Access |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
## $label = $prf->label() |
129
|
|
|
|
|
|
|
## + get label |
130
|
|
|
|
|
|
|
sub label { |
131
|
0
|
|
0
|
0
|
1
|
|
return $_[0]{label} // ''; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
## \@titles_or_undef = $prf->titles() |
135
|
|
|
|
|
|
|
## + get item titles |
136
|
|
|
|
|
|
|
sub titles { |
137
|
0
|
|
|
0
|
1
|
|
return $_[0]{titles}; |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
## @keys = $prf->scoreKeys() |
141
|
|
|
|
|
|
|
## + returns known score function keys |
142
|
|
|
|
|
|
|
sub scoreKeys { |
143
|
0
|
|
|
0
|
1
|
|
return qw(mi1 mi3 milf ld ll fm lf lfm); |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
## $bool = $prf->empty() |
147
|
|
|
|
|
|
|
## + returns true iff profile is empty |
148
|
|
|
|
|
|
|
sub empty { |
149
|
0
|
|
|
0
|
1
|
|
my $p = shift; |
150
|
|
|
|
|
|
|
#return 0 if ($p->{f1}); ##-- do we want to keep nonzero $f1 even if there are no collocates? i think not... moocow 2015-11-02 |
151
|
0
|
0
|
|
|
|
|
return 1 if (!$p->{f1}); |
152
|
0
|
|
|
|
|
|
return !$p->size; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
## $size = $prf->size() |
156
|
|
|
|
|
|
|
## + returns total number of collocates defined in profile |
157
|
|
|
|
|
|
|
sub size { |
158
|
0
|
|
|
0
|
0
|
|
my $p = shift; |
159
|
0
|
|
|
|
|
|
my $f = (grep {defined($p->{$_})} qw(f2 f12),$p->scoreKeys)[0]; |
|
0
|
|
|
|
|
|
|
160
|
0
|
0
|
|
|
|
|
return $f ? scalar(keys %{$p->{$f}}) : 0; |
|
0
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
##============================================================================== |
164
|
|
|
|
|
|
|
## I/O |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
167
|
|
|
|
|
|
|
## I/O: JSON |
168
|
|
|
|
|
|
|
## #+ INHERITED from DiaCollocDB::Persistent |
169
|
|
|
|
1
|
|
|
BEGIN { |
170
|
|
|
|
|
|
|
# *TO_JSON = \&TO_JSON__table; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub TO_JSON__table { |
174
|
0
|
|
|
0
|
1
|
|
my $p = shift; |
175
|
0
|
|
|
|
|
|
my @fnames = (grep {defined($p->{$_})} qw(f2 f12),$p->scoreKeys); |
|
0
|
|
|
|
|
|
|
176
|
0
|
|
|
|
|
|
my @funcs = @$p{@fnames}; |
177
|
0
|
0
|
|
|
|
|
my @keys = @funcs ? (keys %{$funcs[0]}) : qw(); |
|
0
|
|
|
|
|
|
|
178
|
0
|
|
|
|
|
|
my ($key,$func); |
179
|
|
|
|
|
|
|
return { |
180
|
0
|
0
|
|
|
|
|
(map {exists($p->{$_}) ? ($_=>$p->{$_}) : qw()} qw(label N f1 eps score)), |
181
|
0
|
|
0
|
|
|
|
cols => [@fnames,@{$p->titles//[]}], |
182
|
|
|
|
|
|
|
data => [ |
183
|
0
|
|
|
|
|
|
(map {$key=$_; [(map {$_->{$key}} @funcs), split(/\t/,$key)]} @keys), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
], |
185
|
|
|
|
|
|
|
}; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub TO_JSON__flat { |
189
|
0
|
|
|
0
|
1
|
|
my $p = shift; |
190
|
0
|
|
|
|
|
|
my $keyf = (grep {defined($p->{$_})} qw(f2 f12),$p->scoreKeys)[0]; |
|
0
|
|
|
|
|
|
|
191
|
0
|
0
|
|
|
|
|
my @keys = $keyf ? (keys %{$p->{$keyf}}) : qw(); |
|
0
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
return { |
193
|
0
|
0
|
|
|
|
|
(map {exists($p->{$_}) ? ($_=>$p->{$_}) : qw()} qw(label N f1 eps)), |
194
|
0
|
|
|
|
|
|
(keys => [map {[split(' ',$_)]} @keys]), |
195
|
0
|
0
|
|
|
|
|
(map {defined($p->{$_}) ? ($_ => [@{$p->{$_}}{@keys}]) : qw()} (qw(f2 f12),$p->scoreKeys)), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
}; |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
200
|
|
|
|
|
|
|
## I/O: Text |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
## undef = $CLASS_OR_OBJECT->saveTextHeader($fh, hlabel=>$hlabel, titles=>\@titles) |
203
|
|
|
|
|
|
|
sub saveTextHeader { |
204
|
0
|
|
|
0
|
1
|
|
my ($that,$fh,%opts) = @_; |
205
|
|
|
|
|
|
|
my @fields = ( |
206
|
|
|
|
|
|
|
qw(N f1 f2 f12 score), |
207
|
|
|
|
|
|
|
(defined($opts{hlabel}) ? $opts{hlabel} : qw()), |
208
|
0
|
0
|
0
|
|
|
|
@{$opts{titles} // (ref($that) ? $that->{titles} : undef) // [qw(item2)]}, |
|
0
|
0
|
0
|
|
|
|
|
209
|
|
|
|
|
|
|
); |
210
|
0
|
|
|
|
|
|
$fh->print(join("\t", map {"#".($_+1).":$fields[$_]"} (0..$#fields)), "\n"); |
|
0
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
## $bool = $prf->saveTextFh($fh, %opts) |
214
|
|
|
|
|
|
|
## + %opts: |
215
|
|
|
|
|
|
|
## ( |
216
|
|
|
|
|
|
|
## label => $label, ##-- override $prf->{label} (used by Profile::Multi), no tab-separators required |
217
|
|
|
|
|
|
|
## format => $fmt, ##-- printf format for scores (default="%f") |
218
|
|
|
|
|
|
|
## header => $bool, ##-- include header-row? (default=1) |
219
|
|
|
|
|
|
|
## hlabel => $hlabel, ##-- prefix header item-cells with $hlabel (used by Profile::Multi) |
220
|
|
|
|
|
|
|
## ) |
221
|
|
|
|
|
|
|
## + format (flat, TAB-separated): N F1 F2 F12 SCORE LABEL ITEM2 |
222
|
|
|
|
|
|
|
sub saveTextFh { |
223
|
0
|
|
|
0
|
1
|
|
my ($prf,$fh,%opts) = @_; |
224
|
0
|
0
|
|
|
|
|
my $label = (exists($opts{label}) ? $opts{label} : $prf->{label}); |
225
|
0
|
|
|
|
|
|
my ($N,$f1,$f2,$f12) = @$prf{qw(N f1 f2 f12)}; |
226
|
0
|
|
0
|
|
|
|
my $fscore = $prf->{$prf->{score}//'f12'}; |
227
|
0
|
|
0
|
|
|
|
my $fmt = $opts{format} || '%f'; |
228
|
0
|
|
|
|
|
|
binmode($fh,':utf8'); |
229
|
0
|
0
|
0
|
|
|
|
$prf->saveTextHeader($fh,%opts) if ($opts{header}//1); |
230
|
0
|
|
|
|
|
|
foreach (sort {$fscore->{$b} <=> $fscore->{$a}} keys %$fscore) { |
|
0
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
$fh->print(join("\t", |
232
|
0
|
|
0
|
|
|
|
map {$_//0} |
233
|
|
|
|
|
|
|
$N, |
234
|
|
|
|
|
|
|
$f1, |
235
|
|
|
|
|
|
|
$f2->{$_}, |
236
|
|
|
|
|
|
|
$f12->{$_}, |
237
|
0
|
0
|
0
|
|
|
|
sprintf($fmt,$fscore->{$_}//'nan'), |
238
|
|
|
|
|
|
|
(defined($label) ? $label : qw()), |
239
|
|
|
|
|
|
|
$_), |
240
|
|
|
|
|
|
|
"\n"); |
241
|
|
|
|
|
|
|
} |
242
|
0
|
|
|
|
|
|
return $prf; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
246
|
|
|
|
|
|
|
## I/O: HTML |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
## $bool = $prf->saveHtmlFile($filename_or_handle, %opts) |
249
|
|
|
|
|
|
|
## + %opts: |
250
|
|
|
|
|
|
|
## ( |
251
|
|
|
|
|
|
|
## table => $bool, ##-- include <table>..</table> ? (default=1) |
252
|
|
|
|
|
|
|
## body => $bool, ##-- include <html><body>..</html></body> ? (default=1) |
253
|
|
|
|
|
|
|
## header => $bool, ##-- include header-row? (default=1) |
254
|
|
|
|
|
|
|
## hlabel => $hlabel, ##-- prefix header item-cells with $hlabel (used by Profile::Multi), no '<th>..</th>' required |
255
|
|
|
|
|
|
|
## label => $label, ##-- prefix item-cells with $label (used by Profile::Multi), no '<td>..</td>' required |
256
|
|
|
|
|
|
|
## format => $fmt, ##-- printf score formatting (default="%.4f") |
257
|
|
|
|
|
|
|
## ) |
258
|
|
|
|
|
|
|
## + saves rows of the format "N F1 F2 F12 SCORE PREFIX? ITEM2" |
259
|
|
|
|
|
|
|
sub saveHtmlFile { |
260
|
0
|
|
|
0
|
1
|
|
my ($prf,$file,%opts) = @_; |
261
|
0
|
0
|
|
|
|
|
my $fh = ref($file) ? $file : IO::File->new(">$file"); |
262
|
0
|
0
|
|
|
|
|
$prf->logconfess("saveHtmlFile(): failed to open '$file': $!") if (!ref($fh)); |
263
|
0
|
|
|
|
|
|
binmode($fh,':utf8'); |
264
|
|
|
|
|
|
|
|
265
|
0
|
0
|
0
|
|
|
|
$fh->print("<html><body>\n") if ($opts{body}//1); |
266
|
0
|
0
|
0
|
|
|
|
$fh->print("<table><tbody>\n") if ($opts{table}//1); |
267
|
|
|
|
|
|
|
$fh->print("<tr>",( |
268
|
0
|
|
|
|
|
|
map {"<th>".htmlesc($_)."</th>"} |
269
|
|
|
|
|
|
|
qw(N f1 f2 f12 score), |
270
|
|
|
|
|
|
|
(defined($opts{hlabel}) ? $opts{hlabel} : qw()), |
271
|
0
|
|
0
|
|
|
|
@{$prf->titles//[qw(item2)]}, |
272
|
|
|
|
|
|
|
), |
273
|
|
|
|
|
|
|
"</tr>\n" |
274
|
0
|
0
|
0
|
|
|
|
) if ($opts{header}//1); |
|
|
0
|
|
|
|
|
|
275
|
|
|
|
|
|
|
|
276
|
0
|
|
|
|
|
|
my ($N,$f1,$f2,$f12) = @$prf{qw(N f1 f2 f12)}; |
277
|
0
|
0
|
|
|
|
|
my $label = (exists($opts{label}) ? $opts{label} : $prf->{label}); |
278
|
0
|
|
0
|
|
|
|
my $fscore = $prf->{$prf->{score}//'f12'}; |
279
|
0
|
|
0
|
|
|
|
my $fmt = $opts{format} || "%.4f"; |
280
|
0
|
|
|
|
|
|
foreach (sort {$fscore->{$b} <=> $fscore->{$a}} keys %$fscore) { |
|
0
|
|
|
|
|
|
|
281
|
0
|
|
0
|
|
|
|
$fh->print("<tr>", (map {"<td>".htmlesc($_//0)."</td>"} |
282
|
|
|
|
|
|
|
$N, |
283
|
|
|
|
|
|
|
$f1, |
284
|
|
|
|
|
|
|
$f2->{$_}, |
285
|
|
|
|
|
|
|
$f12->{$_}, |
286
|
0
|
0
|
0
|
|
|
|
sprintf($fmt, $fscore->{$_}//'nan'), |
287
|
|
|
|
|
|
|
(defined($label) ? $label : qw()), |
288
|
|
|
|
|
|
|
split(/\t/,$_), |
289
|
|
|
|
|
|
|
), |
290
|
|
|
|
|
|
|
"</tr>\n"); |
291
|
|
|
|
|
|
|
} |
292
|
0
|
0
|
0
|
|
|
|
$fh->print("</tbody><table>\n") if ($opts{table}//1); |
293
|
0
|
0
|
0
|
|
|
|
$fh->print("</body></html>\n") if ($opts{body}//1); |
294
|
0
|
0
|
|
|
|
|
$fh->close() if (!ref($file)); |
295
|
0
|
|
|
|
|
|
return $prf; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
##============================================================================== |
300
|
|
|
|
|
|
|
## Compilation |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
## $prf = $prf->compile($func,%opts) |
303
|
|
|
|
|
|
|
## + compile for score-function $func, one of qw(f fm lf lfm mi1 mi3 milf ld ll); default='f' |
304
|
|
|
|
|
|
|
sub compile { |
305
|
0
|
|
|
0
|
1
|
|
my $prf = shift; |
306
|
0
|
|
|
|
|
|
my $func = shift; |
307
|
0
|
0
|
0
|
|
|
|
return $prf->compile_f(@_) if (!$func || $func =~ m{^(?:f(?:req(?:uency)?)?(?:12)?)$}i); |
308
|
0
|
0
|
|
|
|
|
return $prf->compile_fm(@_) if ($func =~ m{^(?:f(?:req(?:uency)?)?(?:-?p(?:er)?)?(?:-?m(?:(?:ill)?ion)?)(?:12)?)$}i); |
309
|
0
|
0
|
|
|
|
|
return $prf->compile_lf(@_) if ($func =~ m{^(?:l(?:og)?-?f(?:req(?:uency)?)?(?:12)?)$}i); |
310
|
0
|
0
|
|
|
|
|
return $prf->compile_lfm(@_) if ($func =~ m{^(?:l(?:og)?-?f(?:req(?:uency)?)?(?:-?p(?:er)?)?(?:-?m(?:(?:ill)?ion)?)(?:12)?)$}i); |
311
|
0
|
0
|
|
|
|
|
return $prf->compile_ld(@_) if ($func =~ m{^(?:ld|log-?dice)}i); |
312
|
0
|
0
|
|
|
|
|
return $prf->compile_ll(@_) if ($func =~ m{^(?:ll|log-?l(?:ikelihood)?)}i); |
313
|
0
|
0
|
|
|
|
|
return $prf->compile_milf(@_) if ($func =~ m{^(?:(?:lf)?mi(?:-?lf)?|mutual-?information-?(?:l(?:og)?)?-?f(?:req(?:uency)?)?)?$}i); |
314
|
0
|
0
|
|
|
|
|
return $prf->compile_mi1(@_) if ($func =~ m{^(?:mi1|mutual-?information-?1|pmi1?)$}i); |
315
|
0
|
0
|
|
|
|
|
return $prf->compile_mi3(@_) if ($func =~ m{^(?:mi3|mutual-?information-?3)$}i); |
316
|
0
|
|
|
|
|
|
$prf->logwarn("compile(): unknown score function '$func'"); |
317
|
0
|
|
|
|
|
|
return $prf->compile_f(@_); |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
## $prf = $prf->uncompile() |
321
|
|
|
|
|
|
|
## + un-compiles all scores for $prf |
322
|
|
|
|
|
|
|
sub uncompile { |
323
|
0
|
|
|
0
|
1
|
|
delete @{$_[0]}{$_[0]->scoreKeys,'score'}; |
|
0
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
return $_[0]; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
## $prf = $prf->compile_clean(%opts) |
328
|
|
|
|
|
|
|
## + bashes non-finite compiled score values to undef |
329
|
|
|
|
|
|
|
sub compile_clean { |
330
|
0
|
|
|
0
|
0
|
|
my $prf = shift; |
331
|
0
|
0
|
0
|
|
|
|
return $prf if (!$prf->{score} || !$prf->{$prf->{score}}); |
332
|
0
|
|
|
|
|
|
foreach (values %{$prf->{$prf->{score}}}) { |
|
0
|
|
|
|
|
|
|
333
|
0
|
0
|
|
|
|
|
$_ = undef if (isInf($_)); |
334
|
|
|
|
|
|
|
} |
335
|
0
|
|
|
|
|
|
return $prf; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
## $prf = $prf->compile_f() |
339
|
|
|
|
|
|
|
## + just sets $prf->{score} = 'f12' |
340
|
|
|
|
|
|
|
sub compile_f { |
341
|
0
|
|
|
0
|
1
|
|
$_[0]{score} = 'f12'; |
342
|
0
|
|
|
|
|
|
return $_[0]; |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
## $prf = $prf->compile_fm() |
346
|
|
|
|
|
|
|
## + computes frequency-per-million in $prf->{fm} |
347
|
|
|
|
|
|
|
## + sets $prf->{score}='fm' |
348
|
|
|
|
|
|
|
sub compile_fm { |
349
|
0
|
|
|
0
|
1
|
|
my $prf = shift; |
350
|
0
|
|
|
|
|
|
my $pf12 = $prf->{f12}; |
351
|
0
|
|
|
|
|
|
my $M = $prf->{N} / 1000000; |
352
|
0
|
|
|
|
|
|
my $fm = $prf->{fm} = {}; |
353
|
0
|
|
|
|
|
|
my ($i2,$f12); |
354
|
0
|
|
|
|
|
|
while (($i2,$f12)=each(%$pf12)) { |
355
|
0
|
|
|
|
|
|
$fm->{$i2} = $f12 / $M; |
356
|
|
|
|
|
|
|
} |
357
|
0
|
|
|
|
|
|
$prf->{score} = 'fm'; |
358
|
0
|
|
|
|
|
|
return $prf; |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
## $prf = $prf->compile_lf(%opts) |
362
|
|
|
|
|
|
|
## + computes log-frequency profile in $prf->{lf} |
363
|
|
|
|
|
|
|
## + sets $prf->{score}='lf' |
364
|
|
|
|
|
|
|
## + %opts: |
365
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
366
|
|
|
|
|
|
|
sub compile_lf { |
367
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
368
|
0
|
|
|
|
|
|
my $pf12 = $prf->{f12}; |
369
|
0
|
|
|
|
|
|
my $lf = $prf->{lf} = {}; |
370
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0.5; #0 |
|
|
|
0
|
|
|
|
|
371
|
0
|
|
|
|
|
|
my ($i2,$f12); |
372
|
0
|
|
|
|
|
|
while (($i2,$f12)=each(%$pf12)) { |
373
|
0
|
|
|
|
|
|
$lf->{$i2} = log2($f12+$eps); |
374
|
|
|
|
|
|
|
} |
375
|
0
|
|
|
|
|
|
$prf->{score} = 'lf'; |
376
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
## $prf = $prf->compile_lfm(%opts) |
380
|
|
|
|
|
|
|
## + computes log-öfrequency-per-million in $prf->{lfm} |
381
|
|
|
|
|
|
|
## + sets $prf->{score}='lfm' |
382
|
|
|
|
|
|
|
## + %opts: |
383
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
384
|
|
|
|
|
|
|
sub compile_lfm { |
385
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
386
|
0
|
|
|
|
|
|
my $pf12 = $prf->{f12}; |
387
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0; #0.5; |
|
|
|
0
|
|
|
|
|
388
|
0
|
|
|
|
|
|
my $logM = log2($prf->{N}+$eps) - log2(1000000+$eps); |
389
|
0
|
|
|
|
|
|
my $lfm = $prf->{lfm} = {}; |
390
|
0
|
|
|
|
|
|
my ($i2,$f12); |
391
|
0
|
|
|
|
|
|
while (($i2,$f12)=each(%$pf12)) { |
392
|
0
|
|
|
|
|
|
$lfm->{$i2} = log2($f12+$eps) - $logM; |
393
|
|
|
|
|
|
|
} |
394
|
0
|
|
|
|
|
|
$prf->{score} = 'lfm'; |
395
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
## $prf = $prf->compile_milf(%opts) |
399
|
|
|
|
|
|
|
## + computes MI*logF-profile in $prf->{milf} a la Rychly (2008) |
400
|
|
|
|
|
|
|
## + sets $prf->{score}='milf' |
401
|
|
|
|
|
|
|
## + %opts: |
402
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
403
|
|
|
|
|
|
|
BEGIN { |
404
|
1
|
|
|
1
|
|
710
|
*compile_mi = \&compile_milf; ##-- backwards-compatible alias |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
sub compile_milf { |
407
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
408
|
0
|
|
|
|
|
|
my ($N,$f1,$pf2,$pf12) = @$prf{qw(N f1 f2 f12)}; |
409
|
0
|
|
|
|
|
|
my $mi = $prf->{milf} = {}; |
410
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0; #0.5 |
|
|
|
0
|
|
|
|
|
411
|
0
|
|
|
|
|
|
my ($i2,$f2,$f12,$denom); |
412
|
0
|
|
|
|
|
|
while (($i2,$f2)=each(%$pf2)) { |
413
|
0
|
|
0
|
|
|
|
$f12 = ($pf12->{$i2} // 0) + $eps; |
414
|
0
|
|
|
|
|
|
$denom = (($f1+$eps)*($f2+$eps)); |
415
|
0
|
0
|
|
|
|
|
$mi->{$i2} = ( |
|
|
0
|
|
|
|
|
|
416
|
|
|
|
|
|
|
($f12 >= 0 ? log2($f12) : 0) |
417
|
|
|
|
|
|
|
* |
418
|
|
|
|
|
|
|
($denom |
419
|
|
|
|
|
|
|
? log2( (($f12+$eps)*($N+$eps)) / $denom ) |
420
|
|
|
|
|
|
|
: 0) |
421
|
|
|
|
|
|
|
); |
422
|
|
|
|
|
|
|
} |
423
|
0
|
|
|
|
|
|
$prf->{score} = 'milf'; |
424
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
## $prf = $prf->compile_mi1(%opts) |
428
|
|
|
|
|
|
|
## + computes raw poinwise-MI profile in $prf->{mi1} |
429
|
|
|
|
|
|
|
## + sets $prf->{score}='mi1' |
430
|
|
|
|
|
|
|
## + %opts: |
431
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
432
|
|
|
|
|
|
|
sub compile_mi1 { |
433
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
434
|
0
|
|
|
|
|
|
my ($N,$f1,$pf2,$pf12) = @$prf{qw(N f1 f2 f12)}; |
435
|
0
|
|
|
|
|
|
my $mi = $prf->{mi1} = {}; |
436
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0; #0.5; |
|
|
|
0
|
|
|
|
|
437
|
0
|
|
|
|
|
|
my ($i2,$f2,$f12,$denom); |
438
|
0
|
|
|
|
|
|
while (($i2,$f2)=each(%$pf2)) { |
439
|
0
|
|
0
|
|
|
|
$f12 = $pf12->{$i2} // 0; |
440
|
0
|
|
|
|
|
|
$denom = (($f1+$eps)*($f2+$eps)); |
441
|
0
|
0
|
|
|
|
|
$mi->{$i2} = ($denom > 0 |
442
|
|
|
|
|
|
|
? log2( (($f12+$eps)*($N+$eps)) / $denom ) |
443
|
|
|
|
|
|
|
: undef |
444
|
|
|
|
|
|
|
); |
445
|
|
|
|
|
|
|
} |
446
|
0
|
|
|
|
|
|
$prf->{score} = 'mi1'; |
447
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
448
|
|
|
|
|
|
|
} |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
## $prf = $prf->compile_mi3(%opts) |
452
|
|
|
|
|
|
|
## + computes MI^3 profile in $prf->{mi} a la Rychly (2008) |
453
|
|
|
|
|
|
|
## + sets $prf->{score}='mi3' |
454
|
|
|
|
|
|
|
## + %opts: |
455
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
456
|
|
|
|
|
|
|
sub compile_mi3 { |
457
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
458
|
0
|
|
|
|
|
|
my ($N,$f1,$pf2,$pf12) = @$prf{qw(N f1 f2 f12)}; |
459
|
0
|
|
|
|
|
|
my $mi3 = $prf->{mi3} = {}; |
460
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0; #0.5 |
|
|
|
0
|
|
|
|
|
461
|
0
|
|
|
|
|
|
my ($i2,$f2,$f12,$denom); |
462
|
0
|
|
|
|
|
|
while (($i2,$f2)=each(%$pf2)) { |
463
|
0
|
|
0
|
|
|
|
$f12 = $pf12->{$i2} // 0; |
464
|
0
|
|
|
|
|
|
$denom = (($f1+$eps)*($f2+$eps)); |
465
|
0
|
0
|
|
|
|
|
$mi3->{$i2} = ($denom |
466
|
|
|
|
|
|
|
? log2( (($f12+$eps)**3 * ($N+$eps)) / $denom ) |
467
|
|
|
|
|
|
|
: undef |
468
|
|
|
|
|
|
|
); |
469
|
|
|
|
|
|
|
} |
470
|
0
|
|
|
|
|
|
$prf->{score} = 'mi3'; |
471
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
## $prf = $prf->compile_ld(%opts) |
475
|
|
|
|
|
|
|
## + computes log-dice profile in $prf->{ld} a la Rychly (2008) |
476
|
|
|
|
|
|
|
## + sets $pf->{score}='ld' |
477
|
|
|
|
|
|
|
## + %opts: |
478
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
479
|
|
|
|
|
|
|
sub compile_ld { |
480
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
481
|
0
|
|
|
|
|
|
my ($N,$f1,$pf2,$pf12) = @$prf{qw(N f1 f2 f12)}; |
482
|
0
|
|
|
|
|
|
my $ld = $prf->{ld} = {}; |
483
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0; #0.5; |
|
|
|
0
|
|
|
|
|
484
|
0
|
|
|
|
|
|
my ($i2,$f2,$f12,$denom); |
485
|
0
|
|
|
|
|
|
while (($i2,$f2)=each(%$pf2)) { |
486
|
0
|
|
0
|
|
|
|
$f12 = $pf12->{$i2} // 0; |
487
|
0
|
|
|
|
|
|
$denom = ($f1+$eps) + ($f2+$eps); |
488
|
0
|
0
|
|
|
|
|
$ld->{$i2} = ($denom |
489
|
|
|
|
|
|
|
? (14 + log2( (2 * ($f12+$eps)) / $denom )) |
490
|
|
|
|
|
|
|
: undef); |
491
|
|
|
|
|
|
|
} |
492
|
0
|
|
|
|
|
|
$prf->{score} = 'ld'; |
493
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
494
|
|
|
|
|
|
|
} |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
## log0($x) : like log($x) but returns 0 for $x==0 |
497
|
|
|
|
|
|
|
sub log0 { |
498
|
1
|
|
|
1
|
|
15
|
no warnings 'uninitialized'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
106
|
|
499
|
0
|
0
|
|
0
|
0
|
|
return $_[0]>0 ? log($_[0]) : 0; |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
## log0d($num,$denom) : like log0($num/$denom), but returns 0 for $num==0 or $denom==0 |
503
|
|
|
|
|
|
|
sub log0d { |
504
|
1
|
|
|
1
|
|
9
|
no warnings 'uninitialized'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
1318
|
|
505
|
0
|
0
|
|
0
|
0
|
|
return $_[1]==0 ? 0 : log0($_[0]/$_[1]); |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
## $prf = $prf->compile_ll(%opts) |
509
|
|
|
|
|
|
|
## + computes 1-sided log-log-likelihood ratio in $prf->{ll} a la Evert (2008) |
510
|
|
|
|
|
|
|
## + sets $pf->{score}='ll' |
511
|
|
|
|
|
|
|
## + %opts: |
512
|
|
|
|
|
|
|
## eps => $eps #-- clobber $prf->{eps} |
513
|
|
|
|
|
|
|
sub compile_ll { |
514
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
515
|
0
|
|
|
|
|
|
my $ll = $prf->{ll} = {}; |
516
|
0
|
|
0
|
|
|
|
my $eps = $opts{eps} // $prf->{eps} // 0; #0.5; ##-- IGNORED here |
|
|
|
0
|
|
|
|
|
517
|
0
|
|
|
|
|
|
my ($N,$f1,$pf2,$pf12) = @$prf{qw(N f1 f2 f12)}; |
518
|
0
|
|
|
|
|
|
$N += 2*$eps; |
519
|
0
|
|
|
|
|
|
$f1 += $eps; |
520
|
0
|
|
|
|
|
|
my ($i2,$f2,$f12,$logl); |
521
|
0
|
|
|
|
|
|
my $llmin = 0; |
522
|
0
|
|
|
|
|
|
while (($i2,$f2)=each(%$pf2)) { |
523
|
0
|
|
0
|
|
|
|
$f12 = ($pf12->{$i2} // 0) + $eps; |
524
|
0
|
0
|
|
|
|
|
$logl = (##-- raw log-lambda |
525
|
|
|
|
|
|
|
$N<=0 ? 0 |
526
|
|
|
|
|
|
|
: ($f12*log0d($f12, ($f1*$f2/$N)) |
527
|
|
|
|
|
|
|
+($f1-$f12)*log0d(($f1-$f12), (($f1*($N-$f2)/$N))) |
528
|
|
|
|
|
|
|
+($f2-$f12)*log0d(($f2-$f12), (($N-$f1)*$f2/$N)) |
529
|
|
|
|
|
|
|
+($N-$f1-$f2+$f12)*log0d(($N-$f1-$f2+$f12), (($N-$f1)*($N-$f2)/$N)) |
530
|
|
|
|
|
|
|
) |
531
|
|
|
|
|
|
|
); |
532
|
0
|
0
|
0
|
|
|
|
$ll->{$i2} = (($N && $f12 < ($f1*$f2/$N) ? -1 : 1) ##-- one-sided log-likelihood a la Evert (2008): negative for dis-associations |
533
|
|
|
|
|
|
|
#* $logl ##-- raw log-lambda values over-emphasize strong collocates |
534
|
|
|
|
|
|
|
* log0(1+$logl) ##-- extra log() is better for scaling |
535
|
|
|
|
|
|
|
#* sqrt($logl) ##-- extra sqrt() for scaling |
536
|
|
|
|
|
|
|
#* ($logl**(1/3)) ##-- extra cube-root for scaling |
537
|
|
|
|
|
|
|
); |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
|
540
|
0
|
|
|
|
|
|
$prf->{score} = 'll'; |
541
|
0
|
|
|
|
|
|
return $prf->compile_clean(); |
542
|
|
|
|
|
|
|
} |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
##============================================================================== |
547
|
|
|
|
|
|
|
## Trimming |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
## \@keys = $prf->which(%opts) |
550
|
|
|
|
|
|
|
## + returns 'good' keys for trimming options %opts: |
551
|
|
|
|
|
|
|
## ( |
552
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- retain only items with $prf->{$prf->{score}}{$item} >= $cutoff |
553
|
|
|
|
|
|
|
## kbest => $kbest, ##-- retain only $kbest items |
554
|
|
|
|
|
|
|
## kbesta => $kbesta, ##-- retain only $kbest items (absolute value) |
555
|
|
|
|
|
|
|
## return => $which, ##-- either 'good' (default) or 'bad' |
556
|
|
|
|
|
|
|
## as => $as, ##-- 'hash' or 'array'; default='array' |
557
|
|
|
|
|
|
|
## ) |
558
|
|
|
|
|
|
|
sub which { |
559
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
##-- trim: scoring function |
562
|
0
|
0
|
0
|
|
|
|
my $score = $prf->{$prf->{score}//'f12'} |
563
|
|
|
|
|
|
|
or $prf->logconfess("trim(): no profile scores for '$prf->{score}'"); |
564
|
0
|
|
|
|
|
|
my $bad = {}; |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
##-- which: by user-specified cutoff |
567
|
0
|
0
|
0
|
|
|
|
if ((my $cutoff=$opts{cutoff}//'') ne '') { |
568
|
0
|
|
|
|
|
|
my ($key,$val); |
569
|
0
|
|
|
|
|
|
while (($key,$val) = each %$score) { |
570
|
0
|
0
|
|
|
|
|
$bad->{$key} = undef if ($val < $cutoff); |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
} |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
##-- which: k-best |
575
|
0
|
|
|
|
|
|
my $kbest; |
576
|
0
|
0
|
0
|
|
|
|
if (defined($kbest = $opts{kbest}) && $kbest > 0) { |
577
|
0
|
|
|
|
|
|
my @keys = sort {$score->{$b} <=> $score->{$a}} grep {!exists($bad->{$_})} keys %$score; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
578
|
0
|
0
|
|
|
|
|
if (@keys > $kbest) { |
579
|
0
|
|
|
|
|
|
splice(@keys, 0, $kbest); |
580
|
0
|
|
|
|
|
|
@$bad{@keys} = qw(); |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
##-- which: abs k-best |
585
|
0
|
|
|
|
|
|
my $kbesta; |
586
|
0
|
0
|
0
|
|
|
|
if (defined($kbesta = $opts{kbesta}) && $kbesta > 0) { |
587
|
0
|
|
|
|
|
|
my @keys = sort {abs($score->{$b}) <=> abs($score->{$a})} grep {!exists($bad->{$_})} keys %$score; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
588
|
0
|
0
|
|
|
|
|
if (@keys > $kbesta) { |
589
|
0
|
|
|
|
|
|
splice(@keys, 0, $kbesta); |
590
|
0
|
|
|
|
|
|
@$bad{@keys} = qw(); |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
##-- which: return |
595
|
0
|
0
|
0
|
|
|
|
if (($opts{return}//'') eq 'bad') { |
596
|
0
|
0
|
0
|
|
|
|
return lc($opts{as}//'array') eq 'hash' ? $bad : [keys %$bad]; |
597
|
|
|
|
|
|
|
} |
598
|
0
|
0
|
0
|
|
|
|
return lc($opts{as}//'array') eq 'hash' ? {map {exists($bad->{$_}) ? qw() : ($_=>undef)} keys %$score } : [grep {!exists($bad->{$_})} keys %$score]; |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
} |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
## $prf = $prf->trim(%opts) |
603
|
|
|
|
|
|
|
## + %opts: |
604
|
|
|
|
|
|
|
## ( |
605
|
|
|
|
|
|
|
## kbest => $kbest, ##-- retain only $kbest items (by score value) |
606
|
|
|
|
|
|
|
## kbesta => $kbesta, ##-- retain only $kbest items (by score absolute value) |
607
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- retain only items with $prf->{$prf->{score}}{$item} >= $cutoff |
608
|
|
|
|
|
|
|
## keep => $keep, ##-- retain keys @$keep (ARRAY) or keys(%$keep) (HASH) |
609
|
|
|
|
|
|
|
## drop => $drop, ##-- drop keys @$drop (ARRAY) or keys(%$drop) (HASH) |
610
|
|
|
|
|
|
|
## ) |
611
|
|
|
|
|
|
|
## + this COULD be factored out into s.t. like $prf->trim($prf->which(%opts)), but it's about 15% faster inline |
612
|
|
|
|
|
|
|
sub trim { |
613
|
0
|
|
|
0
|
1
|
|
my ($prf,%opts) = @_; |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
##-- trim: scoring function |
616
|
0
|
0
|
0
|
|
|
|
my $score = $prf->{$prf->{score}//'f12'} |
617
|
|
|
|
|
|
|
or $prf->logconfess("trim(): no profile scores for '$prf->{score}'"); |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
##-- trim: by user request: keep |
620
|
0
|
0
|
|
|
|
|
if (defined($opts{keep})) { |
621
|
0
|
0
|
|
|
|
|
my $keep = (UNIVERSAL::isa($opts{keep},'ARRAY') ? {map {($_=>undef)} @{$opts{keep}}} : $opts{keep}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
622
|
0
|
|
|
|
|
|
my @trim = grep {!exists($keep->{$_})} keys %$score; |
|
0
|
|
|
|
|
|
|
623
|
0
|
|
|
|
|
|
foreach (grep {defined($prf->{$_})} qw(f2 f12),$prf->scoreKeys) { |
|
0
|
|
|
|
|
|
|
624
|
0
|
|
|
|
|
|
delete @{$prf->{$_}}{@trim}; |
|
0
|
|
|
|
|
|
|
625
|
0
|
|
0
|
|
|
|
$_ //= 0 foreach (@{$prf->{$_}}{keys %$keep}); |
|
0
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
} |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
##-- trim: by user request: drop |
630
|
0
|
0
|
|
|
|
|
if (defined($opts{drop})) { |
631
|
0
|
0
|
|
|
|
|
my $drop = (UNIVERSAL::isa($opts{drop},'ARRAY') ? $opts{drop} : [keys %{$opts{drop}}]); |
|
0
|
|
|
|
|
|
|
632
|
0
|
|
|
|
|
|
delete @{$prf->{$_}}{@$drop} foreach (grep {defined($prf->{$_})} qw(f2 f12),$prf->scoreKeys); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
##-- trim: by user-specified cutoff |
636
|
0
|
0
|
0
|
|
|
|
if ((my $cutoff=$opts{cutoff}//'') ne '') { |
637
|
0
|
|
|
|
|
|
my @trim = qw(); |
638
|
0
|
|
|
|
|
|
my ($key,$val); |
639
|
0
|
|
|
|
|
|
while (($key,$val) = each %$score) { |
640
|
0
|
0
|
|
|
|
|
push(@trim,$key) if ($val < $cutoff); |
641
|
|
|
|
|
|
|
} |
642
|
0
|
|
|
|
|
|
delete @{$prf->{$_}}{@trim} foreach (grep {defined($prf->{$_})} qw(f2 f12),$prf->scoreKeys); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
} |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
##-- trim: k-best |
646
|
0
|
|
|
|
|
|
my $kbest; |
647
|
0
|
0
|
0
|
|
|
|
if (defined($kbest = $opts{kbest}) && $kbest > 0) { |
648
|
0
|
|
|
|
|
|
my @trim = sort {$score->{$b} <=> $score->{$a}} keys %$score; |
|
0
|
|
|
|
|
|
|
649
|
0
|
0
|
|
|
|
|
if (@trim > $kbest) { |
650
|
0
|
|
|
|
|
|
splice(@trim, 0, $kbest); |
651
|
0
|
|
|
|
|
|
delete @{$prf->{$_}}{@trim} foreach (grep {defined($prf->{$_})} qw(f2 f12),$prf->scoreKeys); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
##-- trim: abs k-best |
656
|
0
|
|
|
|
|
|
my $kbesta; |
657
|
0
|
0
|
0
|
|
|
|
if (defined($kbesta = $opts{kbesta}) && $kbesta > 0) { |
658
|
0
|
|
|
|
|
|
my @trim = sort {abs($score->{$b}) <=> abs($score->{$a})} keys %$score; |
|
0
|
|
|
|
|
|
|
659
|
0
|
0
|
|
|
|
|
if (@trim > $kbesta) { |
660
|
0
|
|
|
|
|
|
splice(@trim, 0, $kbesta); |
661
|
0
|
|
|
|
|
|
delete @{$prf->{$_}}{@trim} foreach (grep {defined($prf->{$_})} qw(f2 f12),$prf->scoreKeys); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
} |
663
|
|
|
|
|
|
|
} |
664
|
|
|
|
|
|
|
|
665
|
0
|
|
|
|
|
|
return $prf; |
666
|
|
|
|
|
|
|
} |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
##============================================================================== |
669
|
|
|
|
|
|
|
## Stringification |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
## $i2s = $prf->stringify_map( $obj) |
672
|
|
|
|
|
|
|
## $i2s = $prf->stringify_map(\@key2str) |
673
|
|
|
|
|
|
|
## $i2s = $prf->stringify_map(\&key2str) |
674
|
|
|
|
|
|
|
## $i2s = $prf->stringify_map(\%key2str) |
675
|
|
|
|
|
|
|
## + guts for stringify: get a map for stringification |
676
|
|
|
|
|
|
|
sub stringify_map { |
677
|
0
|
|
|
0
|
1
|
|
my ($prf,$i2s) = @_; |
678
|
1
|
|
|
1
|
|
12
|
no warnings 'numeric'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
816
|
|
679
|
0
|
0
|
|
|
|
|
if (UNIVERSAL::can($i2s,'i2s')) { |
|
|
0
|
|
|
|
|
|
680
|
0
|
|
0
|
|
|
|
$i2s = { map {($_=>$i2s->i2s($_))} sort {$a<=>$b} keys %{$prf->{$prf->{score}//'f12'}} }; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
} |
682
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($i2s,'CODE')) { |
683
|
0
|
|
0
|
|
|
|
$i2s = { map {($_=>$i2s->($_))} sort {$a<=>$b} keys %{$prf->{$prf->{score}//'f12'}} }; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
} |
685
|
0
|
|
|
|
|
|
return $i2s; |
686
|
|
|
|
|
|
|
} |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
## $prf = $prf->stringify( $obj) |
690
|
|
|
|
|
|
|
## $prf = $prf->stringify(\@key2str) |
691
|
|
|
|
|
|
|
## $prf = $prf->stringify(\&key2str) |
692
|
|
|
|
|
|
|
## $prf = $prf->stringify(\%key2str) |
693
|
|
|
|
|
|
|
## + stringifies profile (destructive) via $obj->i2s($key2), $key2str->($i2) or $key2str->{$i2} |
694
|
|
|
|
|
|
|
sub stringify { |
695
|
0
|
|
|
0
|
1
|
|
my ($prf,$i2s) = @_; |
696
|
0
|
|
|
|
|
|
$i2s = $prf->stringify_map($i2s); |
697
|
0
|
0
|
|
|
|
|
if (UNIVERSAL::isa($i2s,'HASH')) { |
|
|
0
|
|
|
|
|
|
698
|
0
|
|
|
|
|
|
foreach (grep {defined $prf->{$_}} qw(f2 f12),$prf->scoreKeys) { |
|
0
|
|
|
|
|
|
|
699
|
0
|
|
|
|
|
|
my $sh = {}; |
700
|
0
|
|
|
|
|
|
@$sh{@$i2s{keys %{$prf->{$_}}}} = values %{$prf->{$_}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
701
|
0
|
|
|
|
|
|
$prf->{$_} = $sh; |
702
|
|
|
|
|
|
|
} |
703
|
0
|
|
|
|
|
|
return $prf; |
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($i2s,'ARRAY')) { |
706
|
0
|
|
|
|
|
|
foreach (grep {defined $prf->{$_}} qw(f2 f12),$prf->scoreKeys) { |
|
0
|
|
|
|
|
|
|
707
|
0
|
|
|
|
|
|
my $sh = {}; |
708
|
0
|
|
|
|
|
|
@$sh{@$i2s[keys %{$prf->{$_}}]} = values %{$prf->{$_}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
709
|
0
|
|
|
|
|
|
$prf->{$_} = $sh; |
710
|
|
|
|
|
|
|
} |
711
|
0
|
|
|
|
|
|
return $prf; |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
|
714
|
0
|
|
0
|
|
|
|
$prf->logconfess("stringify(): don't know how to stringify via '", ($i2s//'undef'). "'"); |
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
##============================================================================== |
718
|
|
|
|
|
|
|
## Algebraic operations |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
## $prf = $prf->_add($prf2,%opts) |
721
|
|
|
|
|
|
|
## + adds $prf2 frequency data to $prf (destructive) |
722
|
|
|
|
|
|
|
## + implicitly un-compiles $prf |
723
|
|
|
|
|
|
|
## + %opts: |
724
|
|
|
|
|
|
|
## N => $bool, ##-- whether to add N values (default:true) |
725
|
|
|
|
|
|
|
## f1 => $bool, ##-- whether to add f1 values (default:true) |
726
|
|
|
|
|
|
|
sub _add { |
727
|
0
|
|
|
0
|
|
|
my ($pa,$pb,%opts) = @_; |
728
|
0
|
0
|
0
|
|
|
|
$pa->{N} += $pb->{N} if (!exists($opts{N}) || $opts{N}); |
729
|
0
|
0
|
0
|
|
|
|
$pa->{f1} += $pb->{f1} if (!exists($opts{f1}) || $opts{f1}); |
730
|
0
|
|
|
|
|
|
my ($af2,$af12) = @$pa{qw(f2 f12)}; |
731
|
0
|
|
|
|
|
|
my ($bf2,$bf12) = @$pb{qw(f2 f12)}; |
732
|
0
|
|
|
|
|
|
foreach (keys %$bf12) { |
733
|
0
|
|
0
|
|
|
|
$af2->{$_} += ($bf2->{$_} // 0); |
734
|
0
|
|
0
|
|
|
|
$af12->{$_} += ($bf12->{$_} // 0); |
735
|
|
|
|
|
|
|
} |
736
|
0
|
|
|
|
|
|
return $pa->uncompile(); |
737
|
|
|
|
|
|
|
} |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
## $prf3 = $prf1->add($prf2,%opts) |
740
|
|
|
|
|
|
|
## + returns sum of $prf1 and $prf2 frequency data (destructive) |
741
|
|
|
|
|
|
|
## + see _add() method for %opts |
742
|
|
|
|
|
|
|
sub add { |
743
|
0
|
|
|
0
|
1
|
|
return $_[0]->clone->_add(@_[1..$#_]); |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
## $psum = $CLASS_OR_OBJECT->_sum(\@profiles,%opts) |
747
|
|
|
|
|
|
|
## + returns a profile representing sum of \@profiles, passing %opts to _add() |
748
|
|
|
|
|
|
|
## + if called as a class method and \@profiles contains only 1 element, that element is returned |
749
|
|
|
|
|
|
|
## + otherwise, \@profiles are added to the (new) object |
750
|
|
|
|
|
|
|
sub _sum { |
751
|
0
|
|
|
0
|
|
|
my ($that,$profiles,%opts) = @_; |
752
|
0
|
0
|
0
|
|
|
|
return $profiles->[0] if (!ref($that) && @$profiles==1); |
753
|
0
|
0
|
|
|
|
|
my $psum = ref($that) ? $that : $that->new(); |
754
|
0
|
|
|
|
|
|
$psum->_add($_,%opts) foreach (@$profiles); |
755
|
0
|
|
|
|
|
|
return $psum; |
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
## $psum = $CLASS_OR_OBJECT->sum(\@profiles,%opts) |
759
|
|
|
|
|
|
|
## + returns a new profile representing sum of \@profiles |
760
|
|
|
|
|
|
|
sub sum { |
761
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
762
|
0
|
|
0
|
|
|
|
return (ref($that)||$that)->new->_sum(@_); |
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
## $diff = $prf1->diff($prf2,%opts) |
766
|
|
|
|
|
|
|
## + wraps DiaColloDB::Profile::Diff->new($prf1,$prf2,%opts) |
767
|
|
|
|
|
|
|
## + %opts: |
768
|
|
|
|
|
|
|
## N => $bool, ##-- whether to subtract N values (default:true) |
769
|
|
|
|
|
|
|
## f1 => $bool, ##-- whether to subtract f1 values (default:true) |
770
|
|
|
|
|
|
|
## f2 => $bool, ##-- whether to subtract f2 values (default:true) |
771
|
|
|
|
|
|
|
## f12 => $bool, ##-- whether to subtract f12 values (default:true) |
772
|
|
|
|
|
|
|
## score => $bool, ##-- whether to subtract score values (default:true) |
773
|
|
|
|
|
|
|
sub diff { |
774
|
0
|
|
|
0
|
1
|
|
return DiaColloDB::Profile::Diff->new(@_); |
775
|
|
|
|
|
|
|
} |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
##============================================================================== |
779
|
|
|
|
|
|
|
## Footer |
780
|
|
|
|
|
|
|
1; |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
__END__ |