| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Profile::Diff.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, profile diffs |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
package DiaColloDB::Profile::Diff; |
|
8
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Utils qw(:math :html); |
|
|
1
|
|
|
|
|
72
|
|
|
|
1
|
|
|
|
|
37
|
|
|
9
|
1
|
|
|
1
|
|
272
|
use DiaColloDB::Profile; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
49
|
|
|
10
|
1
|
|
|
1
|
|
8
|
use IO::File; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
40
|
|
|
11
|
1
|
|
|
1
|
|
212
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
2215
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
##============================================================================== |
|
15
|
|
|
|
|
|
|
## Globals & Constants |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Profile); |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
##============================================================================== |
|
20
|
|
|
|
|
|
|
## Constructors etc. |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## $dprf = CLASS_OR_OBJECT->new(%args) |
|
23
|
|
|
|
|
|
|
## $dprf = CLASS_OR_OBJECT->new($prf1,$prf2,%args) |
|
24
|
|
|
|
|
|
|
## + %args, object structure: |
|
25
|
|
|
|
|
|
|
## ( |
|
26
|
|
|
|
|
|
|
## ##-- DiaColloDB::Profile::Diff |
|
27
|
|
|
|
|
|
|
## prf1 => $prf1, ##-- 1st operand |
|
28
|
|
|
|
|
|
|
## prf2 => $prf2, ##-- 2nd operand |
|
29
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff binary operation (default='abs-diff'); known values: |
|
30
|
|
|
|
|
|
|
## ## abs-diff # $score=$a-$b ; aliases=qw(absolute-difference abs-difference abs-diff adiff adifference a-) ; select=kbesta ; default |
|
31
|
|
|
|
|
|
|
## ## diff # $score=$a-$b ; aliases=qw(difference diff d minus -) |
|
32
|
|
|
|
|
|
|
## ## sum # $score=$a+$b ; aliases=qw(sum add plus +) |
|
33
|
|
|
|
|
|
|
## ## min # $score=min($a,$b) |
|
34
|
|
|
|
|
|
|
## ## max # $score=max($a,$b) |
|
35
|
|
|
|
|
|
|
## ## avg # $score=avg($a,$b) ; aliases=qw(average avg mean) |
|
36
|
|
|
|
|
|
|
## ## havg # $score=harmonic_avg($a,$b) ; aliases=qw(harmonic-average harmonic-mean havg hmean ha h) |
|
37
|
|
|
|
|
|
|
## ## gavg # $score=geometric_avg($a,$b) ; aliases=qw(geometric-average geometric-mean gavg gmean ga g) |
|
38
|
|
|
|
|
|
|
## ## lavg # $score=log_avg($a,$b) ; aliases=qw(logarithmic-average logarithmic-mean log-average log-mean lavg lmean la l) |
|
39
|
|
|
|
|
|
|
## ##-- DiaColloDB::Profile keys |
|
40
|
|
|
|
|
|
|
## label => $label, ##-- string label (used by Multi; undef for none(default)) |
|
41
|
|
|
|
|
|
|
## #N => $N, ##-- OVERRIDE:unused: total marginal relation frequency |
|
42
|
|
|
|
|
|
|
## #f1 => $f1, ##-- OVERRIDE:unused: total marginal frequency of target word(s) |
|
43
|
|
|
|
|
|
|
## #f2 => \%f2, ##-- OVERRIDE:unused: total marginal frequency of collocates: ($i2=>$f2, ...) |
|
44
|
|
|
|
|
|
|
## #f12 => \%f12, ##-- OVERRIDE:unused: collocation frequencies, %f12 = ($i2=>$f12, ...) |
|
45
|
|
|
|
|
|
|
## # |
|
46
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=undef: no smoothing) |
|
47
|
|
|
|
|
|
|
## score => $func, ##-- selected scoring function ('f12', 'mi', or 'ld') |
|
48
|
|
|
|
|
|
|
## mi => \%mi12, ##-- DIFFERENCE: score: mutual information * logFreq a la Wortprofil; requires compile_mi() |
|
49
|
|
|
|
|
|
|
## ld => \%ld12, ##-- DIFFERENCE: score: log-dice a la Wortprofil; requires compile_ld() |
|
50
|
|
|
|
|
|
|
## fm => \%fm12, ##-- DIFFERENCE: score: frequency per million; requires compile_fm() |
|
51
|
|
|
|
|
|
|
## ) |
|
52
|
|
|
|
|
|
|
sub new { |
|
53
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
54
|
0
|
0
|
0
|
|
|
|
my $prf1 = !defined($_[0]) || UNIVERSAL::isa(ref($_[0]),'DiaColloDB::Profile') ? shift : undef; |
|
55
|
0
|
0
|
0
|
|
|
|
my $prf2 = !defined($_[0]) || UNIVERSAL::isa(ref($_[0]),'DiaColloDB::Profile') ? shift : undef; |
|
56
|
0
|
|
|
|
|
|
my %opts = @_; |
|
57
|
0
|
|
|
|
|
|
my $dprf = $that->SUPER::new( |
|
58
|
|
|
|
|
|
|
prf1=>$prf1, |
|
59
|
|
|
|
|
|
|
prf2=>$prf2, |
|
60
|
|
|
|
|
|
|
diff=>'adiff', |
|
61
|
|
|
|
|
|
|
%opts, |
|
62
|
|
|
|
|
|
|
); |
|
63
|
0
|
|
|
|
|
|
delete @$dprf{grep {!defined($opts{$_})} qw(N f1 f2 f12)}; |
|
|
0
|
|
|
|
|
|
|
|
64
|
0
|
0
|
0
|
|
|
|
return $dprf->populate() if ($dprf->{prf1} && $dprf->{prf2}); |
|
65
|
0
|
|
|
|
|
|
return $dprf; |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
## $dprf2 = $dprf->clone() |
|
69
|
|
|
|
|
|
|
## $dprf2 = $dprf->clone($keep_compiled) |
|
70
|
|
|
|
|
|
|
## + clones %$dprf |
|
71
|
|
|
|
|
|
|
## + if $keep_score is true, compiled data is cloned too |
|
72
|
|
|
|
|
|
|
sub clone { |
|
73
|
0
|
|
|
0
|
1
|
|
my ($dprf,$force) = @_; |
|
74
|
|
|
|
|
|
|
return bless({ |
|
75
|
|
|
|
|
|
|
label=>$dprf->{label}, |
|
76
|
|
|
|
|
|
|
diff=>$dprf->{diff}, |
|
77
|
|
|
|
|
|
|
(defined($dprf->{prf1}) ? $dprf->{prf1}->clone($force) : qw()), |
|
78
|
0
|
0
|
|
|
|
|
(defined($dprf->{prf2}) ? $dprf->{prf2}->clone($force) : qw()), |
|
|
|
0
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
}, ref($dprf)); |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
##============================================================================== |
|
83
|
|
|
|
|
|
|
## Basic Access |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
## ($prf1,$prf2) = $dprf->operands(); |
|
86
|
|
|
|
|
|
|
sub operands { |
|
87
|
0
|
|
|
0
|
1
|
|
return @{$_[0]}{qw(prf1 prf2)}; |
|
|
0
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
## $bool = $dprf->empty() |
|
91
|
|
|
|
|
|
|
## + returns true iff both operands are empty |
|
92
|
|
|
|
|
|
|
sub empty { |
|
93
|
0
|
|
|
0
|
1
|
|
my $dp = shift; |
|
94
|
0
|
|
0
|
|
|
|
return (!$dp->{prf1} || $dp->{prf1}->empty) && (!$dp->{prf2} || $dp->{prf2}->empty); |
|
95
|
|
|
|
|
|
|
} |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
##============================================================================== |
|
98
|
|
|
|
|
|
|
## I/O |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
101
|
|
|
|
|
|
|
## I/O: JSON |
|
102
|
|
|
|
|
|
|
## + mostly INHERITED from DiaCollocDB::Persistent |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
## $obj = $CLASS_OR_OBJECT->loadJsonData( $data,%opts) |
|
105
|
|
|
|
|
|
|
## + guts for loadJsonString(), loadJsonFile() |
|
106
|
|
|
|
|
|
|
sub loadJsonData { |
|
107
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
108
|
0
|
|
|
|
|
|
my $dprf = $that->DiaColloDB::Persistent::loadJsonData(@_); |
|
109
|
0
|
|
|
|
|
|
bless($_,'DiaColloDB::Profile') foreach (grep {defined($_)} @$dprf{qw(prf1 prf2)}); |
|
|
0
|
|
|
|
|
|
|
|
110
|
0
|
|
|
|
|
|
return $dprf; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
114
|
|
|
|
|
|
|
## I/O: Text |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
## undef = $CLASS_OR_OBJECT->saveTextHeader($fh, hlabel=>$hlabel, titles=>\@titles) |
|
117
|
|
|
|
|
|
|
sub saveTextHeader { |
|
118
|
0
|
|
|
0
|
1
|
|
my ($that,$fh,%opts) = @_; |
|
119
|
|
|
|
|
|
|
my @fields = ( |
|
120
|
0
|
|
|
|
|
|
(map {("${_}a","${_}b")} qw(N f1 f2 f12 score)), |
|
121
|
|
|
|
|
|
|
qw(diff), |
|
122
|
|
|
|
|
|
|
(defined($opts{hlabel}) ? $opts{hlabel} : qw()), |
|
123
|
0
|
0
|
0
|
|
|
|
@{$opts{titles} // (ref($that) ? $that->{titles} : undef) // [qw(item2)]}, |
|
|
0
|
0
|
0
|
|
|
|
|
|
124
|
|
|
|
|
|
|
); |
|
125
|
0
|
|
|
|
|
|
$fh->print(join("\t", map {"#".($_+1).":$fields[$_]"} (0..$#fields)), "\n"); |
|
|
0
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
## $bool = $prf->saveTextFh($fh, %opts) |
|
129
|
|
|
|
|
|
|
## + %opts: |
|
130
|
|
|
|
|
|
|
## ( |
|
131
|
|
|
|
|
|
|
## label => $label, ##-- override $prf->{label} (used by Profile::Multi), no tab-separators required |
|
132
|
|
|
|
|
|
|
## format => $fmt, ##-- printf score formatting (default="%.4f") |
|
133
|
|
|
|
|
|
|
## header => $bool, ##-- include header-row? (default=1) |
|
134
|
|
|
|
|
|
|
## hlabel => $hlabel, ##-- prefix header item-cells with $hlabel (used by Profile::Multi) |
|
135
|
|
|
|
|
|
|
## ) |
|
136
|
|
|
|
|
|
|
## + format (flat, TAB-separated): Na Nb F1a F1b F2a F2b F12a F12b SCOREa SCOREb SCOREdiff LABEL ITEM2 |
|
137
|
|
|
|
|
|
|
sub saveTextFh { |
|
138
|
0
|
|
|
0
|
1
|
|
my ($dprf,$fh,%opts) = @_; |
|
139
|
0
|
|
|
|
|
|
binmode($fh,':utf8'); |
|
140
|
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
my ($pa,$pb,$fscore) = @$dprf{qw(prf1 prf2 score)}; |
|
142
|
0
|
|
0
|
|
|
|
$fscore //= 'f12'; |
|
143
|
0
|
|
|
|
|
|
my ($Na,$f1a,$f2a,$f12a,$scorea) = @$pa{qw(N f1 f2 f12),$fscore}; |
|
144
|
0
|
|
|
|
|
|
my ($Nb,$f1b,$f2b,$f12b,$scoreb) = @$pb{qw(N f1 f2 f12),$fscore}; |
|
145
|
0
|
|
|
|
|
|
my $scored = $dprf->{$fscore}; |
|
146
|
0
|
0
|
|
|
|
|
my $label = exists($opts{label}) ? $opts{label} : $dprf->{label}; |
|
147
|
0
|
|
0
|
|
|
|
my $fmt = $opts{fmt} || '%f'; |
|
148
|
0
|
0
|
0
|
|
|
|
$dprf->saveTextHeader($fh,%opts) if ($opts{header}//1); |
|
149
|
|
|
|
|
|
|
|
|
150
|
0
|
|
|
|
|
|
foreach (sort {$scored->{$b} <=> $scored->{$a}} keys %$scored) { |
|
|
0
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
$fh->print(join("\t", |
|
152
|
0
|
|
0
|
|
|
|
map {$_//0} |
|
153
|
|
|
|
|
|
|
$Na, $Nb, |
|
154
|
|
|
|
|
|
|
$f1a,$f1b, |
|
155
|
|
|
|
|
|
|
$f2a->{$_}, $f2b->{$_}, |
|
156
|
|
|
|
|
|
|
$f12a->{$_}, $f2b->{$_}, |
|
157
|
|
|
|
|
|
|
sprintf($fmt,$scorea->{$_}//'nan'), |
|
158
|
|
|
|
|
|
|
sprintf($fmt,$scoreb->{$_}//'nan'), |
|
159
|
0
|
0
|
0
|
|
|
|
sprintf($fmt,$scored->{$_}//'nan'), |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
160
|
|
|
|
|
|
|
(defined($label) ? $label : qw()), |
|
161
|
|
|
|
|
|
|
$_), |
|
162
|
|
|
|
|
|
|
"\n"); |
|
163
|
|
|
|
|
|
|
} |
|
164
|
0
|
|
|
|
|
|
return $dprf; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
168
|
|
|
|
|
|
|
## I/O: HTML |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
## $bool = $prf->saveHtmlFile($filename_or_handle, %opts) |
|
171
|
|
|
|
|
|
|
## + %opts: |
|
172
|
|
|
|
|
|
|
## ( |
|
173
|
|
|
|
|
|
|
## table => $bool, ##-- include <table>..</table> ? (default=1) |
|
174
|
|
|
|
|
|
|
## body => $bool, ##-- include <html><body>..</html></body> ? (default=1) |
|
175
|
|
|
|
|
|
|
## header => $bool, ##-- include header-row? (default=1) |
|
176
|
|
|
|
|
|
|
## verbose => $bool, ##-- include verbose output? (default=0) |
|
177
|
|
|
|
|
|
|
## hlabel => $hlabel, ##-- prefix header item-cells with $hlabel (used by Profile::Multi), no '<th>..</th>' required |
|
178
|
|
|
|
|
|
|
## label => $label, ##-- prefix item-cells with $label (used by Profile::Multi), no '<td>..</td>' required |
|
179
|
|
|
|
|
|
|
## format => $fmt, ##-- printf score formatting (default="%.4f") |
|
180
|
|
|
|
|
|
|
## ) |
|
181
|
|
|
|
|
|
|
## + saves rows of the format "SCOREa SCOREb DIFF PREFIX? ITEM2" |
|
182
|
|
|
|
|
|
|
sub saveHtmlFile { |
|
183
|
0
|
|
|
0
|
1
|
|
my ($dprf,$file,%opts) = @_; |
|
184
|
0
|
0
|
|
|
|
|
my $fh = ref($file) ? $file : IO::File->new(">$file"); |
|
185
|
0
|
0
|
|
|
|
|
$dprf->logconfess("saveHtmlFile(): failed to open '$file': $!") if (!ref($fh)); |
|
186
|
0
|
|
|
|
|
|
binmode($fh,':utf8'); |
|
187
|
|
|
|
|
|
|
|
|
188
|
0
|
0
|
0
|
|
|
|
$fh->print("<html><body>\n") if ($opts{body}//1); |
|
189
|
0
|
0
|
0
|
|
|
|
$fh->print("<table><tbody>\n") if ($opts{table}//1); |
|
190
|
|
|
|
|
|
|
$fh->print("<tr>",( |
|
191
|
0
|
|
|
|
|
|
map {"<th>".htmlesc($_)."</th>"} |
|
192
|
0
|
|
|
|
|
|
($opts{verbose} ? (map {("${_}a","${_}b")} qw(N f1 f2 f12)) : qw()), |
|
193
|
|
|
|
|
|
|
qw(ascore bscore diff), |
|
194
|
|
|
|
|
|
|
(defined($opts{hlabel}) ? $opts{hlabel} : qw()), |
|
195
|
0
|
|
0
|
|
|
|
@{$dprf->{titles}//[qw(item2)]}, |
|
196
|
|
|
|
|
|
|
), |
|
197
|
|
|
|
|
|
|
"</tr>\n" |
|
198
|
0
|
0
|
0
|
|
|
|
) if ($opts{header}//1); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
|
200
|
0
|
|
|
|
|
|
my ($pa,$pb,$fscore) = @$dprf{qw(prf1 prf2 score)}; |
|
201
|
0
|
|
0
|
|
|
|
$fscore //= 'f12'; |
|
202
|
0
|
|
|
|
|
|
my $scorea = $pa->{$fscore}; |
|
203
|
0
|
|
|
|
|
|
my $scoreb = $pb->{$fscore}; |
|
204
|
0
|
|
|
|
|
|
my $scored = $dprf->{$fscore}; |
|
205
|
0
|
|
0
|
|
|
|
my $fmt = $opts{format} || "%.4f"; |
|
206
|
0
|
0
|
|
|
|
|
my $label = exists($opts{label}) ? $opts{label} : $dprf->{label}; |
|
207
|
0
|
|
|
|
|
|
foreach (sort {$scored->{$b} <=> $scored->{$a}} keys %$scored) { |
|
|
0
|
|
|
|
|
|
|
|
208
|
0
|
|
|
|
|
|
$fh->print("<tr>", (map {"<td>".htmlesc($_)."</td>"} |
|
209
|
|
|
|
|
|
|
($opts{verbose} |
|
210
|
0
|
|
0
|
|
|
|
? (map {($_//0)} |
|
211
|
|
|
|
|
|
|
$pa->{N}, $pb->{N}, |
|
212
|
|
|
|
|
|
|
$pa->{f1}, $pb->{f1}, |
|
213
|
|
|
|
|
|
|
$pa->{f2}{$_}, $pb->{f2}{$_}, |
|
214
|
|
|
|
|
|
|
$pa->{f12}{$_}, $pb->{f12}{$_}, |
|
215
|
|
|
|
|
|
|
) |
|
216
|
|
|
|
|
|
|
: qw()), |
|
217
|
|
|
|
|
|
|
sprintf($fmt,$scorea->{$_}//'nan'), |
|
218
|
|
|
|
|
|
|
sprintf($fmt,$scoreb->{$_}//'nan'), |
|
219
|
0
|
0
|
0
|
|
|
|
sprintf($fmt,$scored->{$_}//'nan'), |
|
|
|
0
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
220
|
|
|
|
|
|
|
(defined($label) ? $label : qw()), |
|
221
|
|
|
|
|
|
|
split(/\t/,$_)), |
|
222
|
|
|
|
|
|
|
"</tr>\n"); |
|
223
|
|
|
|
|
|
|
} |
|
224
|
0
|
0
|
0
|
|
|
|
$fh->print("</tbody><table>\n") if ($opts{table}//1); |
|
225
|
0
|
0
|
0
|
|
|
|
$fh->print("</body></html>\n") if ($opts{body}//1); |
|
226
|
0
|
0
|
|
|
|
|
$fh->close() if (!ref($file)); |
|
227
|
0
|
|
|
|
|
|
return $dprf; |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
##============================================================================== |
|
232
|
|
|
|
|
|
|
## Compilation |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
235
|
|
|
|
|
|
|
## Compilation: diff-ops |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
## %DIFFOPS : ($opAlias => $opName, ...) : canonical diff-operation names |
|
238
|
|
|
|
|
|
|
our %DIFFOPS = |
|
239
|
|
|
|
|
|
|
( |
|
240
|
|
|
|
|
|
|
(map {($_=>'diff')} qw(difference diff d minus -)), |
|
241
|
|
|
|
|
|
|
(map {($_=>'adiff')} qw(absolute-difference abs-difference abs-diff adiff adifference a- DEFAULT)), |
|
242
|
|
|
|
|
|
|
(map {($_=>'sum')} qw(add plus sum +)), |
|
243
|
|
|
|
|
|
|
(map {($_=>'min')} qw(minimum min)), |
|
244
|
|
|
|
|
|
|
(map {($_=>'max')} qw(maximum max)), |
|
245
|
|
|
|
|
|
|
(map {($_=>'avg')} qw(average avg mean)), |
|
246
|
|
|
|
|
|
|
(map {($_=>'havg')} qw(harmonic-average harmonic-avg harmonic harm haverage havg ha h)), |
|
247
|
|
|
|
|
|
|
(map {($_=>'gavg')} qw(geometric-average geometric-mean geometric geom geo gavg gmean ga g)), |
|
248
|
|
|
|
|
|
|
(map {($_=>'lavg')} qw(logarithmic-average logarithmic-mean logarithmic log-average log-mean log lavg lmean la l)), |
|
249
|
|
|
|
|
|
|
); |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
## $opname = $dprf->diffop() |
|
252
|
|
|
|
|
|
|
## $opname = $CLASS_OR_OBJECT->diffop($opNameOrAlias) |
|
253
|
|
|
|
|
|
|
## + returns canonical diff operation-name for $opNameOrAlias |
|
254
|
|
|
|
|
|
|
sub diffop { |
|
255
|
0
|
|
|
0
|
1
|
|
my ($that,$op) = @_; |
|
256
|
0
|
0
|
0
|
|
|
|
$op //= $that->{diff} if (ref($that)); |
|
257
|
0
|
0
|
0
|
|
|
|
return (defined($op) ? $DIFFOPS{$op} : undef) // $op // $DIFFOPS{DEFAULT}; |
|
|
|
|
0
|
|
|
|
|
|
258
|
|
|
|
|
|
|
} |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
## \&FUNC = $dprf->diffsub() |
|
261
|
|
|
|
|
|
|
## \&FUNC = $CLASS_OR_OBJECT->diffsub($opNameOrAlias) |
|
262
|
|
|
|
|
|
|
## + gets low-level binary diff operation for diff-operation $opNameOrAlias (default=$dprf->{diff}) |
|
263
|
|
|
|
|
|
|
sub diffsub { |
|
264
|
0
|
|
|
0
|
1
|
|
my ($that,$opname) = @_; |
|
265
|
0
|
0
|
|
|
|
|
return $opname if (UNIVERSAL::isa($opname,'CODE')); ##-- code-ref |
|
266
|
0
|
|
|
|
|
|
my $op = $that->diffop($opname); |
|
267
|
0
|
|
|
|
|
|
my $sub = $that->can("diffop_$op"); |
|
268
|
0
|
0
|
|
|
|
|
return $sub if (defined($sub)); |
|
269
|
0
|
|
|
|
|
|
$that->logwarn("unknown low-level diff operation '$op' defaults to '$DIFFOPS{DEFAULT}'"); |
|
270
|
0
|
|
|
|
|
|
return \&diffop_diff; |
|
271
|
|
|
|
|
|
|
} |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
## $how = $dprf->diffpretrim() |
|
274
|
|
|
|
|
|
|
## $how = $CLASS_OR_OBJECT->diffpretrim($opNameOrAlias) |
|
275
|
|
|
|
|
|
|
## + returns if and how diff should pre-trim operand profiles: one of: |
|
276
|
|
|
|
|
|
|
## 0 : don't pre-trim |
|
277
|
|
|
|
|
|
|
## 'restrict' : intersect defined collocates |
|
278
|
|
|
|
|
|
|
## 'kbest' : union of k-best collocates |
|
279
|
|
|
|
|
|
|
sub diffpretrim { |
|
280
|
0
|
|
|
0
|
1
|
|
my ($that,$op) = @_; |
|
281
|
0
|
|
|
|
|
|
$op = $that->diffop($op); |
|
282
|
0
|
0
|
|
|
|
|
if ($op =~ /^min|avg/) { |
|
|
|
0
|
|
|
|
|
|
|
283
|
0
|
|
|
|
|
|
return 'restrict'; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
elsif ($op =~ m/^a?diff|max/) { |
|
286
|
0
|
|
|
|
|
|
return 'kbest'; |
|
287
|
|
|
|
|
|
|
} |
|
288
|
0
|
|
|
|
|
|
return 0; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
## $selector = $dprf->diffkbest() |
|
292
|
|
|
|
|
|
|
## $selector = $CLASS_OR_OBJECT->diffkbest($opNameOrAlias) |
|
293
|
|
|
|
|
|
|
## + returns 'kbest' selector appropriate for which() or trim() methods |
|
294
|
|
|
|
|
|
|
sub diffkbest { |
|
295
|
0
|
|
|
0
|
1
|
|
my ($that,$op) = @_; |
|
296
|
0
|
0
|
|
|
|
|
return $that->diffop($op) eq 'adiff' ? 'kbesta' : 'kbest'; |
|
297
|
|
|
|
|
|
|
} |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
|
300
|
1
|
|
|
1
|
|
1568
|
BEGIN { *diffop_adiff = \&diffop_diff; } |
|
301
|
0
|
|
|
0
|
1
|
|
sub diffop_diff { return $_[0]-$_[1]; } |
|
302
|
0
|
|
|
0
|
1
|
|
sub diffop_sum { return $_[0]+$_[1]; } |
|
303
|
0
|
0
|
|
0
|
1
|
|
sub diffop_min { return $_[0]<$_[1] ? $_[0] : $_[1]; } |
|
304
|
0
|
0
|
|
0
|
1
|
|
sub diffop_max { return $_[0]>$_[1] ? $_[0] : $_[1]; } |
|
305
|
0
|
|
|
0
|
1
|
|
sub diffop_avg { return ($_[0]+$_[1])/2.0; } |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
#sub diffop_havg { return $_[0]<=0 || $_[1]<=0 ? 0 : 2.0/(1.0/$_[0] + 1.0/$_[1]); } |
|
308
|
|
|
|
|
|
|
##-- |
|
309
|
|
|
|
|
|
|
#our $havg_eps = 0.1; |
|
310
|
|
|
|
|
|
|
#sub diffop_havg { return 2.0/(1.0/($_[0]+$havg_eps) + 1.0/($_[1]+$havg_eps)) - $havg_eps; } |
|
311
|
|
|
|
|
|
|
##-- |
|
312
|
0
|
0
|
0
|
0
|
0
|
|
sub diffop_havg0 { return $_[0]<=0 || $_[1]<=0 ? 0 : (2*$_[0]*$_[1])/($_[0]+$_[1]); } |
|
313
|
0
|
|
|
0
|
1
|
|
sub diffop_havg { return diffop_avg(diffop_havg0(@_),diffop_avg(@_)); } |
|
314
|
|
|
|
|
|
|
|
|
315
|
0
|
0
|
|
0
|
0
|
|
sub nthRoot { return ($_[0]<0 ? -1 : 1) * abs($_[0])**(1/$_[1]); } |
|
316
|
|
|
|
|
|
|
#sub diffop_gavg { return nthRoot($_[0]*$_[1], 2); } |
|
317
|
|
|
|
|
|
|
##-- |
|
318
|
0
|
|
|
0
|
0
|
|
sub diffop_gavg0 { return nthRoot($_[0]*$_[1], 2); } |
|
319
|
0
|
|
|
0
|
1
|
|
sub diffop_gavg { return diffop_avg(diffop_gavg0(@_),diffop_avg(@_)); } |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub diffop_lavg { |
|
323
|
0
|
0
|
|
0
|
1
|
|
my ($x,$y) = $_[0]<$_[1] ? @_[0,1] : @_[1,0]; |
|
324
|
0
|
0
|
|
|
|
|
my $delta = $x<=1 ? (1-$x) : 0; |
|
325
|
0
|
|
|
|
|
|
return exp( log(($x+$delta)*($y+$delta))/2.0 ) - $delta; |
|
326
|
|
|
|
|
|
|
} |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
330
|
|
|
|
|
|
|
## Compilation: guts |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
## $dprf = $dprf->populate() |
|
333
|
|
|
|
|
|
|
## $dprf = $dprf->populate($prf1,$prf2) |
|
334
|
|
|
|
|
|
|
## + populates diff-profile by applying the selected diff-operation on aligned operand scores |
|
335
|
|
|
|
|
|
|
sub populate { |
|
336
|
0
|
|
|
0
|
1
|
|
my ($dprf,$pa,$pb) = @_; |
|
337
|
0
|
|
0
|
|
|
|
$pa //= $dprf->{prf1}; |
|
338
|
0
|
|
0
|
|
|
|
$pb //= $dprf->{prf2}; |
|
339
|
0
|
0
|
0
|
|
|
|
$pa = $pb->shadow(1) if (!$pa && $pb); |
|
340
|
0
|
0
|
0
|
|
|
|
$pb = $pa->shadow(1) if ( $pa && !$pb); |
|
341
|
0
|
|
|
|
|
|
@$dprf{qw(prf1 prf2)} = ($pa,$pb); |
|
342
|
0
|
|
0
|
|
|
|
$dprf->{label} //= $pa->label() ."-" . $pb->label(); |
|
343
|
|
|
|
|
|
|
|
|
344
|
0
|
|
0
|
|
|
|
my $scoref = $dprf->{score} = $dprf->{score} // $pa->{score} // $pb->{score} // 'f12'; |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
345
|
0
|
|
|
|
|
|
my ($af2,$af12,$ascore) = @$pa{qw(f2 f12),$scoref}; |
|
346
|
0
|
|
|
|
|
|
my ($bf2,$bf12,$bscore) = @$pb{qw(f2 f12),$scoref}; |
|
347
|
0
|
|
0
|
|
|
|
my $dscore = $dprf->{$scoref} = ($dprf->{$scoref} // {}); |
|
348
|
0
|
|
|
|
|
|
my $diffsub = $dprf->diffsub(); |
|
349
|
0
|
0
|
|
|
|
|
$dprf->logconfess("populate(): no {$scoref} key for \$pa") if (!$ascore); |
|
350
|
0
|
0
|
|
|
|
|
$dprf->logconfess("populate(): no {$scoref} key for \$pb") if (!$bscore); |
|
351
|
0
|
|
|
|
|
|
foreach (keys %$bscore) { |
|
352
|
0
|
|
0
|
|
|
|
$af2->{$_} //= 0; |
|
353
|
0
|
|
0
|
|
|
|
$af12->{$_} //= 0; |
|
354
|
0
|
|
0
|
|
|
|
$ascore->{$_} //= 0; |
|
355
|
0
|
|
0
|
|
|
|
$dscore->{$_} = $diffsub->(($ascore->{$_}//0), ($bscore->{$_}//0)); |
|
|
|
|
0
|
|
|
|
|
|
356
|
|
|
|
|
|
|
} |
|
357
|
0
|
|
|
|
|
|
return $dprf; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
## $dprf = $dprf->compile($func,%opts) |
|
362
|
|
|
|
|
|
|
## + compile for score-function $func, one of qw(f fm mi ld); default='f' |
|
363
|
|
|
|
|
|
|
sub compile { |
|
364
|
0
|
|
|
0
|
1
|
|
my ($dprf,$func) = (shift,shift); |
|
365
|
|
|
|
|
|
|
$dprf->logconfess("compile(): cannot compile without operand profiles") |
|
366
|
0
|
0
|
0
|
|
|
|
if (!$dprf->{prf1} || !$dprf->{prf2}); |
|
367
|
0
|
0
|
|
|
|
|
$dprf->{prf1}->compile($func,@_) or return undef; |
|
368
|
0
|
0
|
|
|
|
|
$dprf->{prf2}->compile($func,@_) or return undef; |
|
369
|
0
|
|
|
|
|
|
$dprf->{score} = $dprf->{prf1}{score}; |
|
370
|
0
|
|
|
|
|
|
return $dprf->populate(); |
|
371
|
|
|
|
|
|
|
} |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
## $dprf = $dprf->uncompile() |
|
374
|
|
|
|
|
|
|
## + un-compiles all scores for $dprd |
|
375
|
|
|
|
|
|
|
sub uncompile { |
|
376
|
0
|
|
|
0
|
1
|
|
my $dprf = shift; |
|
377
|
0
|
0
|
|
|
|
|
$dprf->{prf1}->uncompile() if ($dprf->{prf1}); |
|
378
|
0
|
0
|
|
|
|
|
$dprf->{prf2}->uncompile() if ($dprf->{prf2}); |
|
379
|
0
|
|
|
|
|
|
return $dprf->SUPER::uncompile(); |
|
380
|
|
|
|
|
|
|
} |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
##============================================================================== |
|
383
|
|
|
|
|
|
|
## Trimming |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
## \@keys = $prf->which(%opts) |
|
386
|
|
|
|
|
|
|
## + returns 'good' keys for trimming options %opts: |
|
387
|
|
|
|
|
|
|
## ( |
|
388
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- retain only items with $prf->{$prf->{score}}{$item} >= $cutoff |
|
389
|
|
|
|
|
|
|
## kbest => $kbest, ##-- retain only $kbest items |
|
390
|
|
|
|
|
|
|
## kbesta => $kbesta, ##-- retain only $kbest items (absolute value) |
|
391
|
|
|
|
|
|
|
## return => $which, ##-- either 'good' (default) or 'bad' |
|
392
|
|
|
|
|
|
|
## as => $as, ##-- 'hash' or 'array'; default='array' |
|
393
|
|
|
|
|
|
|
## ) |
|
394
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Profile |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
## $dprf = $dprf->trim(%opts) |
|
397
|
|
|
|
|
|
|
## + %opts: |
|
398
|
|
|
|
|
|
|
## ( |
|
399
|
|
|
|
|
|
|
## kbest => $kbest, ##-- retain only $kbest items (by score value) |
|
400
|
|
|
|
|
|
|
## kbesta => $kbesta, ##-- retain only $kbest items (by score absolute value) |
|
401
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- retain only items with $prf->{$prf->{score}}{$item} >= $cutoff |
|
402
|
|
|
|
|
|
|
## keep => $keep, ##-- retain keys @$keep (ARRAY) or keys(%$keep) (HASH) |
|
403
|
|
|
|
|
|
|
## drop => $drop, ##-- drop keys @$drop (ARRAY) or keys(%$drop) (HASH) |
|
404
|
|
|
|
|
|
|
## ) |
|
405
|
|
|
|
|
|
|
sub trim { |
|
406
|
0
|
|
|
0
|
1
|
|
my ($dprf,%opts) = @_; |
|
407
|
0
|
|
|
|
|
|
my ($pa,$pb) = @$dprf{qw(prf1 prf2)}; |
|
408
|
|
|
|
|
|
|
|
|
409
|
0
|
0
|
0
|
|
|
|
if ($opts{keep} || $opts{drop}) { |
|
410
|
|
|
|
|
|
|
##-- explicit keep request |
|
411
|
0
|
0
|
|
|
|
|
$dprf->populate() if (!$dprf->{score}); |
|
412
|
0
|
0
|
|
|
|
|
$dprf->SUPER::trim(%opts) or return undef; |
|
413
|
|
|
|
|
|
|
} |
|
414
|
|
|
|
|
|
|
else { |
|
415
|
|
|
|
|
|
|
##-- heuristic (pre-)trimming |
|
416
|
0
|
|
|
|
|
|
$dprf->pretrim($pa,$pb,%opts); |
|
417
|
0
|
|
|
|
|
|
$dprf->populate(); |
|
418
|
0
|
|
|
|
|
|
$dprf->SUPER::trim(%opts); |
|
419
|
|
|
|
|
|
|
} |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
##-- trim operand profiles |
|
422
|
0
|
|
0
|
|
|
|
my $keep = $dprf->{$dprf->{score}//'f12'}; |
|
423
|
0
|
0
|
0
|
|
|
|
$pa->trim(keep=>$keep) or return undef if ($pa); |
|
424
|
0
|
0
|
0
|
|
|
|
$pb->trim(keep=>$keep) or return undef if ($pb); |
|
425
|
|
|
|
|
|
|
|
|
426
|
0
|
|
|
|
|
|
return $dprf; |
|
427
|
|
|
|
|
|
|
} |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
## ($pa,$pb) = $CLASS_OR_OBJECT->pretrim($pa,$pb,%opts) |
|
430
|
|
|
|
|
|
|
## + perform pre-trimming on aligned profile pair ($pa,$pb) |
|
431
|
|
|
|
|
|
|
sub pretrim { |
|
432
|
0
|
|
|
0
|
1
|
|
my ($that,$pa,$pb,%opts) = @_; |
|
433
|
0
|
|
|
|
|
|
my $pretrim = $that->diffpretrim($opts{diff}); |
|
434
|
|
|
|
|
|
|
|
|
435
|
0
|
0
|
0
|
|
|
|
if ($pretrim eq 'kbest') { |
|
|
|
0
|
0
|
|
|
|
|
|
436
|
|
|
|
|
|
|
##-- pre-trim: union of k-best collocates |
|
437
|
0
|
0
|
|
|
|
|
my %keep = map {($_=>undef)} (($pa ? @{$pa->which(%opts)} : qw()), ($pb ? @{$pb->which(%opts)} : qw())); |
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
438
|
0
|
0
|
|
|
|
|
$pa->trim(keep=>\%keep) if ($pa); |
|
439
|
0
|
0
|
|
|
|
|
$pb->trim(keep=>\%keep) if ($pb); |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
elsif ($pretrim eq 'restrict' && $pa && $pb) { |
|
442
|
|
|
|
|
|
|
my @drop = ( |
|
443
|
0
|
|
|
|
|
|
(grep {!exists $pa->{f12}{$_}} keys %{$pb->{f12}}), |
|
|
0
|
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
|
(grep {!exists $pb->{f12}{$_}} keys %{$pa->{f12}}), |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
); |
|
446
|
0
|
|
|
|
|
|
$pa->trim(drop=>\@drop); |
|
447
|
0
|
|
|
|
|
|
$pb->trim(drop=>\@drop); |
|
448
|
|
|
|
|
|
|
} |
|
449
|
0
|
|
|
|
|
|
return ($pa,$pb); |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
##============================================================================== |
|
453
|
|
|
|
|
|
|
## Stringification |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
## $dprf = $dprf->stringify( $obj) |
|
456
|
|
|
|
|
|
|
## $dprf = $dprf->stringify(\@key2str) |
|
457
|
|
|
|
|
|
|
## $dprf = $dprf->stringify(\&key2str) |
|
458
|
|
|
|
|
|
|
## $dprf = $dprf->stringify(\%key2str) |
|
459
|
|
|
|
|
|
|
## + stringifies profile (destructive) via $obj->i2s($key2), $key2str->($i2) or $key2str->{$i2} |
|
460
|
|
|
|
|
|
|
sub stringify { |
|
461
|
0
|
|
|
0
|
1
|
|
my ($dprf,$i2s) = @_; |
|
462
|
0
|
|
|
|
|
|
$i2s = $dprf->stringify_map($i2s); |
|
463
|
0
|
|
0
|
|
|
|
$_->stringify($i2s) or return undef foreach (grep {defined($_)} $dprf->operands); |
|
|
0
|
|
|
|
|
|
|
|
464
|
0
|
|
|
|
|
|
return $dprf->SUPER::stringify($i2s); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
##============================================================================== |
|
468
|
|
|
|
|
|
|
## Binary operations |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
## $dprf = $dprf->_add($dprf2,%opts) |
|
471
|
|
|
|
|
|
|
## + adds $dprf2 operand frequency data to $dprf operands (destructive) |
|
472
|
|
|
|
|
|
|
## + implicitly un-compiles $dprf |
|
473
|
|
|
|
|
|
|
## + %opts: |
|
474
|
|
|
|
|
|
|
## N => $bool, ##-- whether to add N values (default:true) |
|
475
|
|
|
|
|
|
|
## f1 => $bool, ##-- whether to add f1 values (default:true) |
|
476
|
|
|
|
|
|
|
sub _add { |
|
477
|
0
|
|
|
0
|
|
|
my ($dpa,$dpb,%opts) = @_; |
|
478
|
0
|
0
|
0
|
|
|
|
$dpa->{prf1}->_add($dpb->{prf1}) if ($dpa->{prf1} && $dpb->{prf1}); |
|
479
|
0
|
0
|
0
|
|
|
|
$dpa->{prf2}->_add($dpb->{prf2}) if ($dpa->{prf2} && $dpb->{prf2}); |
|
480
|
0
|
|
|
|
|
|
return $dpa->uncompile(); |
|
481
|
|
|
|
|
|
|
} |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
## $dprf3 = $dprf1->add($dprf2,%opts) |
|
484
|
|
|
|
|
|
|
## + returns sum of $dprf1 and $dprf2 operatnd frequency data (destructive) |
|
485
|
|
|
|
|
|
|
## + see _add() method for %opts |
|
486
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Profile |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
## $diff = $prf1->diff($prf2,%opts) |
|
489
|
|
|
|
|
|
|
## + returns score-diff of $prf1 and $prf2 frequency data (destructive) |
|
490
|
|
|
|
|
|
|
## + %opts: see _diff() method |
|
491
|
|
|
|
|
|
|
## + INHERITED but probably useless |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
##============================================================================== |
|
495
|
|
|
|
|
|
|
## Footer |
|
496
|
|
|
|
|
|
|
1; |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
__END__ |