line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Profile::Diff.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, profile diffs |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
package DiaColloDB::Profile::Diff; |
8
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Utils qw(:math :html); |
|
1
|
|
|
|
|
39
|
|
|
1
|
|
|
|
|
37
|
|
9
|
1
|
|
|
1
|
|
272
|
use DiaColloDB::Profile; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
26
|
|
10
|
1
|
|
|
1
|
|
5
|
use IO::File; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
11
|
1
|
|
|
1
|
|
188
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
2008
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
##============================================================================== |
15
|
|
|
|
|
|
|
## Globals & Constants |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Profile); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
##============================================================================== |
20
|
|
|
|
|
|
|
## Constructors etc. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## $dprf = CLASS_OR_OBJECT->new(%args) |
23
|
|
|
|
|
|
|
## $dprf = CLASS_OR_OBJECT->new($prf1,$prf2,%args) |
24
|
|
|
|
|
|
|
## + %args, object structure: |
25
|
|
|
|
|
|
|
## ( |
26
|
|
|
|
|
|
|
## ##-- DiaColloDB::Profile::Diff |
27
|
|
|
|
|
|
|
## prf1 => $prf1, ##-- 1st operand |
28
|
|
|
|
|
|
|
## prf2 => $prf2, ##-- 2nd operand |
29
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff binary operation (default='abs-diff'); known values: |
30
|
|
|
|
|
|
|
## ## abs-diff # $score=$a-$b ; aliases=qw(absolute-difference abs-difference abs-diff adiff adifference a-) ; select=kbesta ; default |
31
|
|
|
|
|
|
|
## ## diff # $score=$a-$b ; aliases=qw(difference diff d minus -) |
32
|
|
|
|
|
|
|
## ## sum # $score=$a+$b ; aliases=qw(sum add plus +) |
33
|
|
|
|
|
|
|
## ## min # $score=min($a,$b) |
34
|
|
|
|
|
|
|
## ## max # $score=max($a,$b) |
35
|
|
|
|
|
|
|
## ## avg # $score=avg($a,$b) ; aliases=qw(average avg mean) |
36
|
|
|
|
|
|
|
## ## havg # $score=harmonic_avg($a,$b) ; aliases=qw(harmonic-average harmonic-mean havg hmean ha h) |
37
|
|
|
|
|
|
|
## ## gavg # $score=geometric_avg($a,$b) ; aliases=qw(geometric-average geometric-mean gavg gmean ga g) |
38
|
|
|
|
|
|
|
## ## lavg # $score=log_avg($a,$b) ; aliases=qw(logarithmic-average logarithmic-mean log-average log-mean lavg lmean la l) |
39
|
|
|
|
|
|
|
## ##-- DiaColloDB::Profile keys |
40
|
|
|
|
|
|
|
## label => $label, ##-- string label (used by Multi; undef for none(default)) |
41
|
|
|
|
|
|
|
## #N => $N, ##-- OVERRIDE:unused: total marginal relation frequency |
42
|
|
|
|
|
|
|
## #f1 => $f1, ##-- OVERRIDE:unused: total marginal frequency of target word(s) |
43
|
|
|
|
|
|
|
## #f2 => \%f2, ##-- OVERRIDE:unused: total marginal frequency of collocates: ($i2=>$f2, ...) |
44
|
|
|
|
|
|
|
## #f12 => \%f12, ##-- OVERRIDE:unused: collocation frequencies, %f12 = ($i2=>$f12, ...) |
45
|
|
|
|
|
|
|
## # |
46
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=undef: no smoothing) |
47
|
|
|
|
|
|
|
## score => $func, ##-- selected scoring function ('f12', 'mi', or 'ld') |
48
|
|
|
|
|
|
|
## mi => \%mi12, ##-- DIFFERENCE: score: mutual information * logFreq a la Wortprofil; requires compile_mi() |
49
|
|
|
|
|
|
|
## ld => \%ld12, ##-- DIFFERENCE: score: log-dice a la Wortprofil; requires compile_ld() |
50
|
|
|
|
|
|
|
## fm => \%fm12, ##-- DIFFERENCE: score: frequency per million; requires compile_fm() |
51
|
|
|
|
|
|
|
## ) |
52
|
|
|
|
|
|
|
sub new { |
53
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
54
|
0
|
0
|
0
|
|
|
|
my $prf1 = !defined($_[0]) || UNIVERSAL::isa(ref($_[0]),'DiaColloDB::Profile') ? shift : undef; |
55
|
0
|
0
|
0
|
|
|
|
my $prf2 = !defined($_[0]) || UNIVERSAL::isa(ref($_[0]),'DiaColloDB::Profile') ? shift : undef; |
56
|
0
|
|
|
|
|
|
my %opts = @_; |
57
|
0
|
|
|
|
|
|
my $dprf = $that->SUPER::new( |
58
|
|
|
|
|
|
|
prf1=>$prf1, |
59
|
|
|
|
|
|
|
prf2=>$prf2, |
60
|
|
|
|
|
|
|
diff=>'adiff', |
61
|
|
|
|
|
|
|
%opts, |
62
|
|
|
|
|
|
|
); |
63
|
0
|
|
|
|
|
|
delete @$dprf{grep {!defined($opts{$_})} qw(N f1 f2 f12)}; |
|
0
|
|
|
|
|
|
|
64
|
0
|
0
|
0
|
|
|
|
return $dprf->populate() if ($dprf->{prf1} && $dprf->{prf2}); |
65
|
0
|
|
|
|
|
|
return $dprf; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
## $dprf2 = $dprf->clone() |
69
|
|
|
|
|
|
|
## $dprf2 = $dprf->clone($keep_compiled) |
70
|
|
|
|
|
|
|
## + clones %$dprf |
71
|
|
|
|
|
|
|
## + if $keep_score is true, compiled data is cloned too |
72
|
|
|
|
|
|
|
sub clone { |
73
|
0
|
|
|
0
|
1
|
|
my ($dprf,$force) = @_; |
74
|
|
|
|
|
|
|
return bless({ |
75
|
|
|
|
|
|
|
label=>$dprf->{label}, |
76
|
|
|
|
|
|
|
diff=>$dprf->{diff}, |
77
|
|
|
|
|
|
|
(defined($dprf->{prf1}) ? $dprf->{prf1}->clone($force) : qw()), |
78
|
0
|
0
|
|
|
|
|
(defined($dprf->{prf2}) ? $dprf->{prf2}->clone($force) : qw()), |
|
|
0
|
|
|
|
|
|
79
|
|
|
|
|
|
|
}, ref($dprf)); |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
##============================================================================== |
83
|
|
|
|
|
|
|
## Basic Access |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
## ($prf1,$prf2) = $dprf->operands(); |
86
|
|
|
|
|
|
|
sub operands { |
87
|
0
|
|
|
0
|
1
|
|
return @{$_[0]}{qw(prf1 prf2)}; |
|
0
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
## $bool = $dprf->empty() |
91
|
|
|
|
|
|
|
## + returns true iff both operands are empty |
92
|
|
|
|
|
|
|
sub empty { |
93
|
0
|
|
|
0
|
1
|
|
my $dp = shift; |
94
|
0
|
|
0
|
|
|
|
return (!$dp->{prf1} || $dp->{prf1}->empty) && (!$dp->{prf2} || $dp->{prf2}->empty); |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
##============================================================================== |
98
|
|
|
|
|
|
|
## I/O |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
101
|
|
|
|
|
|
|
## I/O: JSON |
102
|
|
|
|
|
|
|
## + mostly INHERITED from DiaCollocDB::Persistent |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
## $obj = $CLASS_OR_OBJECT->loadJsonData( $data,%opts) |
105
|
|
|
|
|
|
|
## + guts for loadJsonString(), loadJsonFile() |
106
|
|
|
|
|
|
|
sub loadJsonData { |
107
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
108
|
0
|
|
|
|
|
|
my $dprf = $that->DiaColloDB::Persistent::loadJsonData(@_); |
109
|
0
|
|
|
|
|
|
bless($_,'DiaColloDB::Profile') foreach (grep {defined($_)} @$dprf{qw(prf1 prf2)}); |
|
0
|
|
|
|
|
|
|
110
|
0
|
|
|
|
|
|
return $dprf; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
114
|
|
|
|
|
|
|
## I/O: Text |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
## undef = $CLASS_OR_OBJECT->saveTextHeader($fh, hlabel=>$hlabel, titles=>\@titles) |
117
|
|
|
|
|
|
|
sub saveTextHeader { |
118
|
0
|
|
|
0
|
1
|
|
my ($that,$fh,%opts) = @_; |
119
|
|
|
|
|
|
|
my @fields = ( |
120
|
0
|
|
|
|
|
|
(map {("${_}a","${_}b")} qw(N f1 f2 f12 score)), |
121
|
|
|
|
|
|
|
qw(diff), |
122
|
|
|
|
|
|
|
(defined($opts{hlabel}) ? $opts{hlabel} : qw()), |
123
|
0
|
0
|
0
|
|
|
|
@{$opts{titles} // (ref($that) ? $that->{titles} : undef) // [qw(item2)]}, |
|
0
|
0
|
0
|
|
|
|
|
124
|
|
|
|
|
|
|
); |
125
|
0
|
|
|
|
|
|
$fh->print(join("\t", map {"#".($_+1).":$fields[$_]"} (0..$#fields)), "\n"); |
|
0
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
## $bool = $prf->saveTextFh($fh, %opts) |
129
|
|
|
|
|
|
|
## + %opts: |
130
|
|
|
|
|
|
|
## ( |
131
|
|
|
|
|
|
|
## label => $label, ##-- override $prf->{label} (used by Profile::Multi), no tab-separators required |
132
|
|
|
|
|
|
|
## format => $fmt, ##-- printf score formatting (default="%.4f") |
133
|
|
|
|
|
|
|
## header => $bool, ##-- include header-row? (default=1) |
134
|
|
|
|
|
|
|
## hlabel => $hlabel, ##-- prefix header item-cells with $hlabel (used by Profile::Multi) |
135
|
|
|
|
|
|
|
## ) |
136
|
|
|
|
|
|
|
## + format (flat, TAB-separated): Na Nb F1a F1b F2a F2b F12a F12b SCOREa SCOREb SCOREdiff LABEL ITEM2 |
137
|
|
|
|
|
|
|
sub saveTextFh { |
138
|
0
|
|
|
0
|
1
|
|
my ($dprf,$fh,%opts) = @_; |
139
|
0
|
|
|
|
|
|
binmode($fh,':utf8'); |
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
my ($pa,$pb,$fscore) = @$dprf{qw(prf1 prf2 score)}; |
142
|
0
|
|
0
|
|
|
|
$fscore //= 'f12'; |
143
|
0
|
|
|
|
|
|
my ($Na,$f1a,$f2a,$f12a,$scorea) = @$pa{qw(N f1 f2 f12),$fscore}; |
144
|
0
|
|
|
|
|
|
my ($Nb,$f1b,$f2b,$f12b,$scoreb) = @$pb{qw(N f1 f2 f12),$fscore}; |
145
|
0
|
|
|
|
|
|
my $scored = $dprf->{$fscore}; |
146
|
0
|
0
|
|
|
|
|
my $label = exists($opts{label}) ? $opts{label} : $dprf->{label}; |
147
|
0
|
|
0
|
|
|
|
my $fmt = $opts{fmt} || '%f'; |
148
|
0
|
0
|
0
|
|
|
|
$dprf->saveTextHeader($fh,%opts) if ($opts{header}//1); |
149
|
|
|
|
|
|
|
|
150
|
0
|
|
|
|
|
|
foreach (sort {$scored->{$b} <=> $scored->{$a}} keys %$scored) { |
|
0
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
$fh->print(join("\t", |
152
|
0
|
|
0
|
|
|
|
map {$_//0} |
153
|
|
|
|
|
|
|
$Na, $Nb, |
154
|
|
|
|
|
|
|
$f1a,$f1b, |
155
|
|
|
|
|
|
|
$f2a->{$_}, $f2b->{$_}, |
156
|
|
|
|
|
|
|
$f12a->{$_}, $f2b->{$_}, |
157
|
|
|
|
|
|
|
sprintf($fmt,$scorea->{$_}//'nan'), |
158
|
|
|
|
|
|
|
sprintf($fmt,$scoreb->{$_}//'nan'), |
159
|
0
|
0
|
0
|
|
|
|
sprintf($fmt,$scored->{$_}//'nan'), |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
160
|
|
|
|
|
|
|
(defined($label) ? $label : qw()), |
161
|
|
|
|
|
|
|
$_), |
162
|
|
|
|
|
|
|
"\n"); |
163
|
|
|
|
|
|
|
} |
164
|
0
|
|
|
|
|
|
return $dprf; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
168
|
|
|
|
|
|
|
## I/O: HTML |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
## $bool = $prf->saveHtmlFile($filename_or_handle, %opts) |
171
|
|
|
|
|
|
|
## + %opts: |
172
|
|
|
|
|
|
|
## ( |
173
|
|
|
|
|
|
|
## table => $bool, ##-- include <table>..</table> ? (default=1) |
174
|
|
|
|
|
|
|
## body => $bool, ##-- include <html><body>..</html></body> ? (default=1) |
175
|
|
|
|
|
|
|
## header => $bool, ##-- include header-row? (default=1) |
176
|
|
|
|
|
|
|
## verbose => $bool, ##-- include verbose output? (default=0) |
177
|
|
|
|
|
|
|
## hlabel => $hlabel, ##-- prefix header item-cells with $hlabel (used by Profile::Multi), no '<th>..</th>' required |
178
|
|
|
|
|
|
|
## label => $label, ##-- prefix item-cells with $label (used by Profile::Multi), no '<td>..</td>' required |
179
|
|
|
|
|
|
|
## format => $fmt, ##-- printf score formatting (default="%.4f") |
180
|
|
|
|
|
|
|
## ) |
181
|
|
|
|
|
|
|
## + saves rows of the format "SCOREa SCOREb DIFF PREFIX? ITEM2" |
182
|
|
|
|
|
|
|
sub saveHtmlFile { |
183
|
0
|
|
|
0
|
1
|
|
my ($dprf,$file,%opts) = @_; |
184
|
0
|
0
|
|
|
|
|
my $fh = ref($file) ? $file : IO::File->new(">$file"); |
185
|
0
|
0
|
|
|
|
|
$dprf->logconfess("saveHtmlFile(): failed to open '$file': $!") if (!ref($fh)); |
186
|
0
|
|
|
|
|
|
binmode($fh,':utf8'); |
187
|
|
|
|
|
|
|
|
188
|
0
|
0
|
0
|
|
|
|
$fh->print("<html><body>\n") if ($opts{body}//1); |
189
|
0
|
0
|
0
|
|
|
|
$fh->print("<table><tbody>\n") if ($opts{table}//1); |
190
|
|
|
|
|
|
|
$fh->print("<tr>",( |
191
|
0
|
|
|
|
|
|
map {"<th>".htmlesc($_)."</th>"} |
192
|
0
|
|
|
|
|
|
($opts{verbose} ? (map {("${_}a","${_}b")} qw(N f1 f2 f12)) : qw()), |
193
|
|
|
|
|
|
|
qw(ascore bscore diff), |
194
|
|
|
|
|
|
|
(defined($opts{hlabel}) ? $opts{hlabel} : qw()), |
195
|
0
|
|
0
|
|
|
|
@{$dprf->{titles}//[qw(item2)]}, |
196
|
|
|
|
|
|
|
), |
197
|
|
|
|
|
|
|
"</tr>\n" |
198
|
0
|
0
|
0
|
|
|
|
) if ($opts{header}//1); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
200
|
0
|
|
|
|
|
|
my ($pa,$pb,$fscore) = @$dprf{qw(prf1 prf2 score)}; |
201
|
0
|
|
0
|
|
|
|
$fscore //= 'f12'; |
202
|
0
|
|
|
|
|
|
my $scorea = $pa->{$fscore}; |
203
|
0
|
|
|
|
|
|
my $scoreb = $pb->{$fscore}; |
204
|
0
|
|
|
|
|
|
my $scored = $dprf->{$fscore}; |
205
|
0
|
|
0
|
|
|
|
my $fmt = $opts{format} || "%.4f"; |
206
|
0
|
0
|
|
|
|
|
my $label = exists($opts{label}) ? $opts{label} : $dprf->{label}; |
207
|
0
|
|
|
|
|
|
foreach (sort {$scored->{$b} <=> $scored->{$a}} keys %$scored) { |
|
0
|
|
|
|
|
|
|
208
|
0
|
|
|
|
|
|
$fh->print("<tr>", (map {"<td>".htmlesc($_)."</td>"} |
209
|
|
|
|
|
|
|
($opts{verbose} |
210
|
0
|
|
0
|
|
|
|
? (map {($_//0)} |
211
|
|
|
|
|
|
|
$pa->{N}, $pb->{N}, |
212
|
|
|
|
|
|
|
$pa->{f1}, $pb->{f1}, |
213
|
|
|
|
|
|
|
$pa->{f2}{$_}, $pb->{f2}{$_}, |
214
|
|
|
|
|
|
|
$pa->{f12}{$_}, $pb->{f12}{$_}, |
215
|
|
|
|
|
|
|
) |
216
|
|
|
|
|
|
|
: qw()), |
217
|
|
|
|
|
|
|
sprintf($fmt,$scorea->{$_}//'nan'), |
218
|
|
|
|
|
|
|
sprintf($fmt,$scoreb->{$_}//'nan'), |
219
|
0
|
0
|
0
|
|
|
|
sprintf($fmt,$scored->{$_}//'nan'), |
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
220
|
|
|
|
|
|
|
(defined($label) ? $label : qw()), |
221
|
|
|
|
|
|
|
split(/\t/,$_)), |
222
|
|
|
|
|
|
|
"</tr>\n"); |
223
|
|
|
|
|
|
|
} |
224
|
0
|
0
|
0
|
|
|
|
$fh->print("</tbody><table>\n") if ($opts{table}//1); |
225
|
0
|
0
|
0
|
|
|
|
$fh->print("</body></html>\n") if ($opts{body}//1); |
226
|
0
|
0
|
|
|
|
|
$fh->close() if (!ref($file)); |
227
|
0
|
|
|
|
|
|
return $dprf; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
##============================================================================== |
232
|
|
|
|
|
|
|
## Compilation |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
235
|
|
|
|
|
|
|
## Compilation: diff-ops |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
## %DIFFOPS : ($opAlias => $opName, ...) : canonical diff-operation names |
238
|
|
|
|
|
|
|
our %DIFFOPS = |
239
|
|
|
|
|
|
|
( |
240
|
|
|
|
|
|
|
(map {($_=>'diff')} qw(difference diff d minus -)), |
241
|
|
|
|
|
|
|
(map {($_=>'adiff')} qw(absolute-difference abs-difference abs-diff adiff adifference a- DEFAULT)), |
242
|
|
|
|
|
|
|
(map {($_=>'sum')} qw(add plus sum +)), |
243
|
|
|
|
|
|
|
(map {($_=>'min')} qw(minimum min)), |
244
|
|
|
|
|
|
|
(map {($_=>'max')} qw(maximum max)), |
245
|
|
|
|
|
|
|
(map {($_=>'avg')} qw(average avg mean)), |
246
|
|
|
|
|
|
|
(map {($_=>'havg')} qw(harmonic-average harmonic-avg harmonic harm haverage havg ha h)), |
247
|
|
|
|
|
|
|
(map {($_=>'gavg')} qw(geometric-average geometric-mean geometric geom geo gavg gmean ga g)), |
248
|
|
|
|
|
|
|
(map {($_=>'lavg')} qw(logarithmic-average logarithmic-mean logarithmic log-average log-mean log lavg lmean la l)), |
249
|
|
|
|
|
|
|
); |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
## $opname = $dprf->diffop() |
252
|
|
|
|
|
|
|
## $opname = $CLASS_OR_OBJECT->diffop($opNameOrAlias) |
253
|
|
|
|
|
|
|
## + returns canonical diff operation-name for $opNameOrAlias |
254
|
|
|
|
|
|
|
sub diffop { |
255
|
0
|
|
|
0
|
1
|
|
my ($that,$op) = @_; |
256
|
0
|
0
|
0
|
|
|
|
$op //= $that->{diff} if (ref($that)); |
257
|
0
|
0
|
0
|
|
|
|
return (defined($op) ? $DIFFOPS{$op} : undef) // $op // $DIFFOPS{DEFAULT}; |
|
|
|
0
|
|
|
|
|
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
## \&FUNC = $dprf->diffsub() |
261
|
|
|
|
|
|
|
## \&FUNC = $CLASS_OR_OBJECT->diffsub($opNameOrAlias) |
262
|
|
|
|
|
|
|
## + gets low-level binary diff operation for diff-operation $opNameOrAlias (default=$dprf->{diff}) |
263
|
|
|
|
|
|
|
sub diffsub { |
264
|
0
|
|
|
0
|
1
|
|
my ($that,$opname) = @_; |
265
|
0
|
0
|
|
|
|
|
return $opname if (UNIVERSAL::isa($opname,'CODE')); ##-- code-ref |
266
|
0
|
|
|
|
|
|
my $op = $that->diffop($opname); |
267
|
0
|
|
|
|
|
|
my $sub = $that->can("diffop_$op"); |
268
|
0
|
0
|
|
|
|
|
return $sub if (defined($sub)); |
269
|
0
|
|
|
|
|
|
$that->logwarn("unknown low-level diff operation '$op' defaults to '$DIFFOPS{DEFAULT}'"); |
270
|
0
|
|
|
|
|
|
return \&diffop_diff; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
## $how = $dprf->diffpretrim() |
274
|
|
|
|
|
|
|
## $how = $CLASS_OR_OBJECT->diffpretrim($opNameOrAlias) |
275
|
|
|
|
|
|
|
## + returns if and how diff should pre-trim operand profiles: one of: |
276
|
|
|
|
|
|
|
## 0 : don't pre-trim |
277
|
|
|
|
|
|
|
## 'restrict' : intersect defined collocates |
278
|
|
|
|
|
|
|
## 'kbest' : union of k-best collocates |
279
|
|
|
|
|
|
|
sub diffpretrim { |
280
|
0
|
|
|
0
|
1
|
|
my ($that,$op) = @_; |
281
|
0
|
|
|
|
|
|
$op = $that->diffop($op); |
282
|
0
|
0
|
|
|
|
|
if ($op =~ /^min|avg/) { |
|
|
0
|
|
|
|
|
|
283
|
0
|
|
|
|
|
|
return 'restrict'; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
elsif ($op =~ m/^a?diff|max/) { |
286
|
0
|
|
|
|
|
|
return 'kbest'; |
287
|
|
|
|
|
|
|
} |
288
|
0
|
|
|
|
|
|
return 0; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
## $selector = $dprf->diffkbest() |
292
|
|
|
|
|
|
|
## $selector = $CLASS_OR_OBJECT->diffkbest($opNameOrAlias) |
293
|
|
|
|
|
|
|
## + returns 'kbest' selector appropriate for which() or trim() methods |
294
|
|
|
|
|
|
|
sub diffkbest { |
295
|
0
|
|
|
0
|
1
|
|
my ($that,$op) = @_; |
296
|
0
|
0
|
|
|
|
|
return $that->diffop($op) eq 'adiff' ? 'kbesta' : 'kbest'; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
300
|
1
|
|
|
1
|
|
1450
|
BEGIN { *diffop_adiff = \&diffop_diff; } |
301
|
0
|
|
|
0
|
1
|
|
sub diffop_diff { return $_[0]-$_[1]; } |
302
|
0
|
|
|
0
|
1
|
|
sub diffop_sum { return $_[0]+$_[1]; } |
303
|
0
|
0
|
|
0
|
1
|
|
sub diffop_min { return $_[0]<$_[1] ? $_[0] : $_[1]; } |
304
|
0
|
0
|
|
0
|
1
|
|
sub diffop_max { return $_[0]>$_[1] ? $_[0] : $_[1]; } |
305
|
0
|
|
|
0
|
1
|
|
sub diffop_avg { return ($_[0]+$_[1])/2.0; } |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
#sub diffop_havg { return $_[0]<=0 || $_[1]<=0 ? 0 : 2.0/(1.0/$_[0] + 1.0/$_[1]); } |
308
|
|
|
|
|
|
|
##-- |
309
|
|
|
|
|
|
|
#our $havg_eps = 0.1; |
310
|
|
|
|
|
|
|
#sub diffop_havg { return 2.0/(1.0/($_[0]+$havg_eps) + 1.0/($_[1]+$havg_eps)) - $havg_eps; } |
311
|
|
|
|
|
|
|
##-- |
312
|
0
|
0
|
0
|
0
|
0
|
|
sub diffop_havg0 { return $_[0]<=0 || $_[1]<=0 ? 0 : (2*$_[0]*$_[1])/($_[0]+$_[1]); } |
313
|
0
|
|
|
0
|
1
|
|
sub diffop_havg { return diffop_avg(diffop_havg0(@_),diffop_avg(@_)); } |
314
|
|
|
|
|
|
|
|
315
|
0
|
0
|
|
0
|
0
|
|
sub nthRoot { return ($_[0]<0 ? -1 : 1) * abs($_[0])**(1/$_[1]); } |
316
|
|
|
|
|
|
|
#sub diffop_gavg { return nthRoot($_[0]*$_[1], 2); } |
317
|
|
|
|
|
|
|
##-- |
318
|
0
|
|
|
0
|
0
|
|
sub diffop_gavg0 { return nthRoot($_[0]*$_[1], 2); } |
319
|
0
|
|
|
0
|
1
|
|
sub diffop_gavg { return diffop_avg(diffop_gavg0(@_),diffop_avg(@_)); } |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub diffop_lavg { |
323
|
0
|
0
|
|
0
|
1
|
|
my ($x,$y) = $_[0]<$_[1] ? @_[0,1] : @_[1,0]; |
324
|
0
|
0
|
|
|
|
|
my $delta = $x<=1 ? (1-$x) : 0; |
325
|
0
|
|
|
|
|
|
return exp( log(($x+$delta)*($y+$delta))/2.0 ) - $delta; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
330
|
|
|
|
|
|
|
## Compilation: guts |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
## $dprf = $dprf->populate() |
333
|
|
|
|
|
|
|
## $dprf = $dprf->populate($prf1,$prf2) |
334
|
|
|
|
|
|
|
## + populates diff-profile by applying the selected diff-operation on aligned operand scores |
335
|
|
|
|
|
|
|
sub populate { |
336
|
0
|
|
|
0
|
1
|
|
my ($dprf,$pa,$pb) = @_; |
337
|
0
|
|
0
|
|
|
|
$pa //= $dprf->{prf1}; |
338
|
0
|
|
0
|
|
|
|
$pb //= $dprf->{prf2}; |
339
|
0
|
0
|
0
|
|
|
|
$pa = $pb->shadow(1) if (!$pa && $pb); |
340
|
0
|
0
|
0
|
|
|
|
$pb = $pa->shadow(1) if ( $pa && !$pb); |
341
|
0
|
|
|
|
|
|
@$dprf{qw(prf1 prf2)} = ($pa,$pb); |
342
|
0
|
|
0
|
|
|
|
$dprf->{label} //= $pa->label() ."-" . $pb->label(); |
343
|
|
|
|
|
|
|
|
344
|
0
|
|
0
|
|
|
|
my $scoref = $dprf->{score} = $dprf->{score} // $pa->{score} // $pb->{score} // 'f12'; |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
345
|
0
|
|
|
|
|
|
my ($af2,$af12,$ascore) = @$pa{qw(f2 f12),$scoref}; |
346
|
0
|
|
|
|
|
|
my ($bf2,$bf12,$bscore) = @$pb{qw(f2 f12),$scoref}; |
347
|
0
|
|
0
|
|
|
|
my $dscore = $dprf->{$scoref} = ($dprf->{$scoref} // {}); |
348
|
0
|
|
|
|
|
|
my $diffsub = $dprf->diffsub(); |
349
|
0
|
0
|
|
|
|
|
$dprf->logconfess("populate(): no {$scoref} key for \$pa") if (!$ascore); |
350
|
0
|
0
|
|
|
|
|
$dprf->logconfess("populate(): no {$scoref} key for \$pb") if (!$bscore); |
351
|
0
|
|
|
|
|
|
foreach (keys %$bscore) { |
352
|
0
|
|
0
|
|
|
|
$af2->{$_} //= 0; |
353
|
0
|
|
0
|
|
|
|
$af12->{$_} //= 0; |
354
|
0
|
|
0
|
|
|
|
$ascore->{$_} //= 0; |
355
|
0
|
|
0
|
|
|
|
$dscore->{$_} = $diffsub->(($ascore->{$_}//0), ($bscore->{$_}//0)); |
|
|
|
0
|
|
|
|
|
356
|
|
|
|
|
|
|
} |
357
|
0
|
|
|
|
|
|
return $dprf; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
## $dprf = $dprf->compile($func,%opts) |
362
|
|
|
|
|
|
|
## + compile for score-function $func, one of qw(f fm mi ld); default='f' |
363
|
|
|
|
|
|
|
sub compile { |
364
|
0
|
|
|
0
|
1
|
|
my ($dprf,$func) = (shift,shift); |
365
|
|
|
|
|
|
|
$dprf->logconfess("compile(): cannot compile without operand profiles") |
366
|
0
|
0
|
0
|
|
|
|
if (!$dprf->{prf1} || !$dprf->{prf2}); |
367
|
0
|
0
|
|
|
|
|
$dprf->{prf1}->compile($func,@_) or return undef; |
368
|
0
|
0
|
|
|
|
|
$dprf->{prf2}->compile($func,@_) or return undef; |
369
|
0
|
|
|
|
|
|
$dprf->{score} = $dprf->{prf1}{score}; |
370
|
0
|
|
|
|
|
|
return $dprf->populate(); |
371
|
|
|
|
|
|
|
} |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
## $dprf = $dprf->uncompile() |
374
|
|
|
|
|
|
|
## + un-compiles all scores for $dprd |
375
|
|
|
|
|
|
|
sub uncompile { |
376
|
0
|
|
|
0
|
1
|
|
my $dprf = shift; |
377
|
0
|
0
|
|
|
|
|
$dprf->{prf1}->uncompile() if ($dprf->{prf1}); |
378
|
0
|
0
|
|
|
|
|
$dprf->{prf2}->uncompile() if ($dprf->{prf2}); |
379
|
0
|
|
|
|
|
|
return $dprf->SUPER::uncompile(); |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
##============================================================================== |
383
|
|
|
|
|
|
|
## Trimming |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
## \@keys = $prf->which(%opts) |
386
|
|
|
|
|
|
|
## + returns 'good' keys for trimming options %opts: |
387
|
|
|
|
|
|
|
## ( |
388
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- retain only items with $prf->{$prf->{score}}{$item} >= $cutoff |
389
|
|
|
|
|
|
|
## kbest => $kbest, ##-- retain only $kbest items |
390
|
|
|
|
|
|
|
## kbesta => $kbesta, ##-- retain only $kbest items (absolute value) |
391
|
|
|
|
|
|
|
## return => $which, ##-- either 'good' (default) or 'bad' |
392
|
|
|
|
|
|
|
## as => $as, ##-- 'hash' or 'array'; default='array' |
393
|
|
|
|
|
|
|
## ) |
394
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Profile |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
## $dprf = $dprf->trim(%opts) |
397
|
|
|
|
|
|
|
## + %opts: |
398
|
|
|
|
|
|
|
## ( |
399
|
|
|
|
|
|
|
## kbest => $kbest, ##-- retain only $kbest items (by score value) |
400
|
|
|
|
|
|
|
## kbesta => $kbesta, ##-- retain only $kbest items (by score absolute value) |
401
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- retain only items with $prf->{$prf->{score}}{$item} >= $cutoff |
402
|
|
|
|
|
|
|
## keep => $keep, ##-- retain keys @$keep (ARRAY) or keys(%$keep) (HASH) |
403
|
|
|
|
|
|
|
## drop => $drop, ##-- drop keys @$drop (ARRAY) or keys(%$drop) (HASH) |
404
|
|
|
|
|
|
|
## ) |
405
|
|
|
|
|
|
|
sub trim { |
406
|
0
|
|
|
0
|
1
|
|
my ($dprf,%opts) = @_; |
407
|
0
|
|
|
|
|
|
my ($pa,$pb) = @$dprf{qw(prf1 prf2)}; |
408
|
|
|
|
|
|
|
|
409
|
0
|
0
|
0
|
|
|
|
if ($opts{keep} || $opts{drop}) { |
410
|
|
|
|
|
|
|
##-- explicit keep request |
411
|
0
|
0
|
|
|
|
|
$dprf->populate() if (!$dprf->{score}); |
412
|
0
|
0
|
|
|
|
|
$dprf->SUPER::trim(%opts) or return undef; |
413
|
|
|
|
|
|
|
} |
414
|
|
|
|
|
|
|
else { |
415
|
|
|
|
|
|
|
##-- heuristic (pre-)trimming |
416
|
0
|
|
|
|
|
|
$dprf->pretrim($pa,$pb,%opts); |
417
|
0
|
|
|
|
|
|
$dprf->populate(); |
418
|
0
|
|
|
|
|
|
$dprf->SUPER::trim(%opts); |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
##-- trim operand profiles |
422
|
0
|
|
0
|
|
|
|
my $keep = $dprf->{$dprf->{score}//'f12'}; |
423
|
0
|
0
|
0
|
|
|
|
$pa->trim(keep=>$keep) or return undef if ($pa); |
424
|
0
|
0
|
0
|
|
|
|
$pb->trim(keep=>$keep) or return undef if ($pb); |
425
|
|
|
|
|
|
|
|
426
|
0
|
|
|
|
|
|
return $dprf; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
## ($pa,$pb) = $CLASS_OR_OBJECT->pretrim($pa,$pb,%opts) |
430
|
|
|
|
|
|
|
## + perform pre-trimming on aligned profile pair ($pa,$pb) |
431
|
|
|
|
|
|
|
sub pretrim { |
432
|
0
|
|
|
0
|
1
|
|
my ($that,$pa,$pb,%opts) = @_; |
433
|
0
|
|
|
|
|
|
my $pretrim = $that->diffpretrim($opts{diff}); |
434
|
|
|
|
|
|
|
|
435
|
0
|
0
|
0
|
|
|
|
if ($pretrim eq 'kbest') { |
|
|
0
|
0
|
|
|
|
|
436
|
|
|
|
|
|
|
##-- pre-trim: union of k-best collocates |
437
|
0
|
0
|
|
|
|
|
my %keep = map {($_=>undef)} (($pa ? @{$pa->which(%opts)} : qw()), ($pb ? @{$pb->which(%opts)} : qw())); |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
438
|
0
|
0
|
|
|
|
|
$pa->trim(keep=>\%keep) if ($pa); |
439
|
0
|
0
|
|
|
|
|
$pb->trim(keep=>\%keep) if ($pb); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
elsif ($pretrim eq 'restrict' && $pa && $pb) { |
442
|
|
|
|
|
|
|
my @drop = ( |
443
|
0
|
|
|
|
|
|
(grep {!exists $pa->{f12}{$_}} keys %{$pb->{f12}}), |
|
0
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
|
(grep {!exists $pb->{f12}{$_}} keys %{$pa->{f12}}), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
); |
446
|
0
|
|
|
|
|
|
$pa->trim(drop=>\@drop); |
447
|
0
|
|
|
|
|
|
$pb->trim(drop=>\@drop); |
448
|
|
|
|
|
|
|
} |
449
|
0
|
|
|
|
|
|
return ($pa,$pb); |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
##============================================================================== |
453
|
|
|
|
|
|
|
## Stringification |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
## $dprf = $dprf->stringify( $obj) |
456
|
|
|
|
|
|
|
## $dprf = $dprf->stringify(\@key2str) |
457
|
|
|
|
|
|
|
## $dprf = $dprf->stringify(\&key2str) |
458
|
|
|
|
|
|
|
## $dprf = $dprf->stringify(\%key2str) |
459
|
|
|
|
|
|
|
## + stringifies profile (destructive) via $obj->i2s($key2), $key2str->($i2) or $key2str->{$i2} |
460
|
|
|
|
|
|
|
sub stringify { |
461
|
0
|
|
|
0
|
1
|
|
my ($dprf,$i2s) = @_; |
462
|
0
|
|
|
|
|
|
$i2s = $dprf->stringify_map($i2s); |
463
|
0
|
|
0
|
|
|
|
$_->stringify($i2s) or return undef foreach (grep {defined($_)} $dprf->operands); |
|
0
|
|
|
|
|
|
|
464
|
0
|
|
|
|
|
|
return $dprf->SUPER::stringify($i2s); |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
##============================================================================== |
468
|
|
|
|
|
|
|
## Binary operations |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
## $dprf = $dprf->_add($dprf2,%opts) |
471
|
|
|
|
|
|
|
## + adds $dprf2 operand frequency data to $dprf operands (destructive) |
472
|
|
|
|
|
|
|
## + implicitly un-compiles $dprf |
473
|
|
|
|
|
|
|
## + %opts: |
474
|
|
|
|
|
|
|
## N => $bool, ##-- whether to add N values (default:true) |
475
|
|
|
|
|
|
|
## f1 => $bool, ##-- whether to add f1 values (default:true) |
476
|
|
|
|
|
|
|
sub _add { |
477
|
0
|
|
|
0
|
|
|
my ($dpa,$dpb,%opts) = @_; |
478
|
0
|
0
|
0
|
|
|
|
$dpa->{prf1}->_add($dpb->{prf1}) if ($dpa->{prf1} && $dpb->{prf1}); |
479
|
0
|
0
|
0
|
|
|
|
$dpa->{prf2}->_add($dpb->{prf2}) if ($dpa->{prf2} && $dpb->{prf2}); |
480
|
0
|
|
|
|
|
|
return $dpa->uncompile(); |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
## $dprf3 = $dprf1->add($dprf2,%opts) |
484
|
|
|
|
|
|
|
## + returns sum of $dprf1 and $dprf2 operatnd frequency data (destructive) |
485
|
|
|
|
|
|
|
## + see _add() method for %opts |
486
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Profile |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
## $diff = $prf1->diff($prf2,%opts) |
489
|
|
|
|
|
|
|
## + returns score-diff of $prf1 and $prf2 frequency data (destructive) |
490
|
|
|
|
|
|
|
## + %opts: see _diff() method |
491
|
|
|
|
|
|
|
## + INHERITED but probably useless |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
##============================================================================== |
495
|
|
|
|
|
|
|
## Footer |
496
|
|
|
|
|
|
|
1; |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
__END__ |