line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Compat::v0_09::Relation.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, relation API (abstract & utilities) |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Relation; |
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Compat; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
8
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Relation; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
51
|
|
9
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Utils qw(:si); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
37
|
|
10
|
1
|
|
|
1
|
|
236
|
use Algorithm::BinarySearch::Vec qw(:api); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
39
|
|
11
|
1
|
|
|
1
|
|
297
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
114
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
##============================================================================== |
14
|
|
|
|
|
|
|
## Globals & Constants |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Relation DiaColloDB::Compat); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##============================================================================== |
19
|
|
|
|
|
|
|
## Constructors etc. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
## $rel = CLASS_OR_OBJECT->new(%args) |
22
|
|
|
|
|
|
|
## + %args, object structure: see subclases |
23
|
|
|
|
|
|
|
sub new { |
24
|
0
|
|
|
0
|
1
|
|
my ($that,%args) = @_; |
25
|
0
|
|
0
|
|
|
|
return bless({ %args }, ref($that)||$that); |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
##============================================================================== |
29
|
|
|
|
|
|
|
## Relation API: create |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->create($coldb, $tokdat_file, %opts) |
32
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
33
|
|
|
|
|
|
|
## a tt-style text file containing 1 token-id perl line with optional blank lines |
34
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
35
|
|
|
|
|
|
|
## + DISABLED |
36
|
|
|
|
|
|
|
BEGIN { |
37
|
1
|
|
|
1
|
|
7
|
*create = DiaColloDB::Compat->nocompat('create'); |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
##============================================================================== |
41
|
|
|
|
|
|
|
## Relation API: union |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
44
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
45
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$argrel,\@xi2u],...) |
46
|
|
|
|
|
|
|
## of relation-objects $argrel and tuple-id maps \@xi2u for $rel |
47
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
48
|
|
|
|
|
|
|
## + implicitly flushes the new index |
49
|
|
|
|
|
|
|
## + DISABLED |
50
|
|
|
|
|
|
|
BEGIN { |
51
|
1
|
|
|
1
|
|
6
|
*union = DiaColloDB::Compat->nocompat('union'); |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
##============================================================================== |
55
|
|
|
|
|
|
|
## Relation API: info |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
58
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
59
|
|
|
|
|
|
|
sub dbinfo { |
60
|
0
|
|
|
0
|
0
|
|
my $rel = shift; |
61
|
0
|
|
|
|
|
|
my $info = { class=>ref($rel) }; |
62
|
0
|
0
|
|
|
|
|
if ($rel->can('du')) { |
63
|
0
|
|
|
|
|
|
$info->{du_b} = $rel->du(); |
64
|
0
|
|
|
|
|
|
$info->{du_h} = si_str($info->{du_b}); |
65
|
|
|
|
|
|
|
} |
66
|
0
|
|
|
|
|
|
return $info; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
##============================================================================== |
71
|
|
|
|
|
|
|
## Relation API: profiling & comparison: top-level |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
74
|
|
|
|
|
|
|
## Relation API: profile |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
## $mprf = $rel->profile($coldb, %opts) |
77
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
78
|
|
|
|
|
|
|
## + %opts: |
79
|
|
|
|
|
|
|
## ( |
80
|
|
|
|
|
|
|
## ##-- selection parameters |
81
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
82
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
83
|
|
|
|
|
|
|
## ## |
84
|
|
|
|
|
|
|
## ##-- aggregation parameters |
85
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
86
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
87
|
|
|
|
|
|
|
## ## |
88
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
89
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
90
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
91
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
92
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
93
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
94
|
|
|
|
|
|
|
## ## |
95
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
96
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
97
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (default=don't) |
98
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
99
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs subclass only) |
100
|
|
|
|
|
|
|
## ) |
101
|
|
|
|
|
|
|
## + default implementation |
102
|
|
|
|
|
|
|
## - calls $rel->subprofile1() for every requested date-slice, then |
103
|
|
|
|
|
|
|
## - calls $rel->subprofile2() to compute item2 frequencies, and finally |
104
|
|
|
|
|
|
|
## - collects the result in a DiaColloDB::Profile::Multi object |
105
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::profile() |
106
|
|
|
|
|
|
|
sub profile { |
107
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
##-- sanity check(s) |
110
|
0
|
0
|
0
|
|
|
|
$reldb->logconfess("profile(): incompatible DB class ".ref($coldb).", v".($coldb->{version}//'???')." for directory $coldb->{dbdir}") |
111
|
|
|
|
|
|
|
if (!$coldb->isa('DiaColloDB::Compat::v0_09::DiaColloDB')); |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
##-- common variables |
114
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
##-- variables: by attribute |
117
|
0
|
|
|
|
|
|
my $groupby= $coldb->groupby($opts{groupby}); |
118
|
0
|
|
|
|
|
|
my $attrs = $coldb->attrs(); |
119
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData($attrs); |
120
|
0
|
|
|
|
|
|
my $a2data = {map {($_->{a}=>$_)} @$adata}; |
|
0
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
|
my $areqs = $coldb->parseRequest($opts{query}, logas=>'query', default=>$attrs->[0]); |
122
|
0
|
|
|
|
|
|
foreach (@$areqs) { |
123
|
0
|
|
|
|
|
|
$a2data->{$_->[0]}{req} = $_->[1]; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
##-- sanity check(s) |
127
|
0
|
0
|
|
|
|
|
if (!@$areqs) { |
128
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no target attributes specified (supported attributes: ".join(' ',@{$coldb->attrs}).")"); |
|
0
|
|
|
|
|
|
|
129
|
0
|
|
|
|
|
|
return undef; |
130
|
|
|
|
|
|
|
} |
131
|
0
|
0
|
|
|
|
|
if (!@{$groupby->{attrs}}) { |
|
0
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): cannot profile with empty groupby clause"); |
133
|
0
|
|
|
|
|
|
return undef; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
##-- prepare: get target IDs (by attribute) |
137
|
0
|
|
|
|
|
|
my ($ac); |
138
|
0
|
|
0
|
|
|
|
foreach $ac (grep {($_->{req}//'') ne ''} @$adata) { |
|
0
|
|
|
|
|
|
|
139
|
0
|
|
|
|
|
|
$ac->{reqids} = $coldb->enumIds($ac->{enum},$ac->{req},logLevel=>$logProfile,logPrefix=>"profile(): get target $ac->{a}-values"); |
140
|
0
|
0
|
|
|
|
|
if (!@{$ac->{reqids}}) { |
|
0
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no $ac->{a}-attribute values match user query '$ac->{req}'"); |
142
|
0
|
|
|
|
|
|
return undef; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
##-- prepare: get tuple-ids (by attribute) |
147
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get target tuple IDs"); |
148
|
0
|
|
|
|
|
|
my $xivec = undef; |
149
|
0
|
|
|
|
|
|
my $nbits = undef; |
150
|
0
|
|
|
|
|
|
my $pack_xv = undef; |
151
|
0
|
|
|
|
|
|
my $test_xv = undef; ##-- test value via vec() |
152
|
0
|
|
|
|
|
|
foreach $ac (grep {$_->{reqids}} @$adata) { |
|
0
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
##-- sanity checks |
154
|
0
|
|
0
|
|
|
|
$nbits //= $ac->{a2x}{len_i}*8; |
155
|
0
|
|
0
|
|
|
|
$pack_xv //= "$ac->{a2x}{pack_i}*"; |
156
|
0
|
0
|
|
|
|
|
vec($test_xv='',0,$nbits) = 0x12345678 if (!defined($test_xv)); |
157
|
|
|
|
|
|
|
$reldb->logconfess("profile(): multimap pack-size mismatch: nbits($ac->{a2x}{base}.*) != $nbits") |
158
|
0
|
0
|
|
|
|
|
if ($ac->{a2x}{len_i} != $nbits/8); |
159
|
|
|
|
|
|
|
$reldb->logconfess("profile(): multimap pack-template '$ac->{a2x}{pack_i}' for $ac->{a2x}{base}.* is not big-endian") |
160
|
0
|
0
|
|
|
|
|
if (pack($ac->{a2x}{pack_i},0x12345678) ne $test_xv); |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
##-- target set construction |
163
|
0
|
|
|
|
|
|
my $axiset = ''; |
164
|
0
|
|
|
|
|
|
$axiset = vunion($axiset, $ac->{a2x}->fetchraw($_), $nbits) foreach (@{$ac->{reqids}}); |
|
0
|
|
|
|
|
|
|
165
|
0
|
0
|
|
|
|
|
$xivec = defined($xivec) ? vintersect($xivec, $axiset, $nbits) : $axiset; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
##-- check maxExpand |
169
|
0
|
|
0
|
|
|
|
$nbits //= packsize($coldb->{pack_id}); |
170
|
0
|
0
|
|
|
|
|
my $nxis = $xivec ? length($xivec)/($nbits/8) : 0; |
171
|
0
|
0
|
0
|
|
|
|
if ($coldb->{maxExpand}>0 && $nxis > $coldb->{maxExpand}) { |
172
|
0
|
|
|
|
|
|
$reldb->logwarn("profile(): Warning: target set exceeds max expansion size ($nxis > $coldb->{maxExpand}): truncating"); |
173
|
0
|
|
|
|
|
|
substr($xivec, -($nxis - $coldb->{maxExpand})*($nbits/8)) = ''; |
174
|
|
|
|
|
|
|
} |
175
|
0
|
0
|
|
|
|
|
my $xis = [$xivec ? unpack($pack_xv, $xivec) : qw()]; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
##-- prepare: parse and filter tuples |
178
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): parse and filter target tuples (date=$opts{date}, slice=$opts{slice}, fill=$opts{fill})"); |
179
|
0
|
|
|
|
|
|
my $d2xis = $coldb->xidsByDate($xis, @opts{qw(date slice fill)}); |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
##-- profile: get relation profiles (by date-slice, pass 1: f12) |
182
|
0
|
|
0
|
|
|
|
my $onepass = $opts{onepass} || ($reldb->can('subprofile2') eq \&subprofile2); |
183
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): ".($onepass ? 'single-pass' : 'pass-1')); |
184
|
0
|
|
|
|
|
|
my %d2prf = qw(); |
185
|
0
|
|
|
|
|
|
my @slices = sort {$a<=>$b} keys %$d2xis; |
|
0
|
|
|
|
|
|
|
186
|
0
|
|
|
|
|
|
my ($d,$prf); |
187
|
0
|
|
|
|
|
|
foreach $d (@slices) { |
188
|
0
|
|
|
|
|
|
$prf = $reldb->subprofile1($d2xis->{$d}, groupby=>$groupby->{xi2g}, coldb=>$coldb, onepass=>$onepass, opts=>\%opts); |
189
|
0
|
|
|
|
|
|
$prf->{label} = $d; |
190
|
0
|
|
|
|
|
|
$prf->{titles} = $groupby->{titles}; |
191
|
0
|
|
|
|
|
|
$d2prf{$d} = $prf; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
##-- profile: complete slice-wise profiles (pass 2: f2) |
195
|
0
|
0
|
|
|
|
|
if (!$onepass) { |
196
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): pass-2"); |
197
|
0
|
|
|
|
|
|
$reldb->subprofile2(\%d2prf, %opts, coldb=>$coldb, groupby=>$groupby, a2data=>$a2data, opts=>\%opts); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
##-- compile & collect: multi-profile |
201
|
0
|
|
|
|
|
|
foreach $prf (values %d2prf) { |
202
|
0
|
|
|
|
|
|
$prf->compile($opts{score}, eps=>$opts{eps}); |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
my $mp = DiaColloDB::Profile::Multi->new(profiles=>[@d2prf{@slices}], |
205
|
|
|
|
|
|
|
titles=>$groupby->{titles}, |
206
|
0
|
|
|
|
|
|
qinfo =>$reldb->qinfo($coldb, %opts, qreqs=>$areqs, gbreq=>$groupby), |
207
|
|
|
|
|
|
|
); |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
##-- trim and stringify |
210
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): trim and stringify"); |
211
|
0
|
|
|
|
|
|
$mp->trim(%opts, empty=>!$opts{fill}); |
212
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
213
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
214
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2s}); |
215
|
|
|
|
|
|
|
} else { |
216
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2txt}); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
##-- return |
221
|
0
|
|
|
|
|
|
return $mp; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
226
|
|
|
|
|
|
|
## Relation API: comparison (diff) |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
## $mpdiff = $rel->compare($coldb, %opts) |
229
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
230
|
|
|
|
|
|
|
## + %opts: |
231
|
|
|
|
|
|
|
## ( |
232
|
|
|
|
|
|
|
## ##-- selection parameters |
233
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
234
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
235
|
|
|
|
|
|
|
## ## |
236
|
|
|
|
|
|
|
## ##-- aggregation parameters |
237
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
238
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
239
|
|
|
|
|
|
|
## ## |
240
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
241
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
242
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
243
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
244
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
245
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
246
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (adiff|diff|sum|min|max|avg|havg); default='adiff' |
247
|
|
|
|
|
|
|
## ## |
248
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
249
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
250
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (override stringification; default=don't) |
251
|
|
|
|
|
|
|
## ## |
252
|
|
|
|
|
|
|
## ##-- sublcass abstraction parameters |
253
|
|
|
|
|
|
|
## _gbparse => $bool, ##-- if true (default), 'groupby' clause will be parsed only once, using $coldb->groupby() method |
254
|
|
|
|
|
|
|
## _abkeys => \@abkeys, ##-- additional key-suffixes KEY s.t. (KEY=>VAL) gets passed to profile() calls if e.g. (aKEY=>VAL) is in %opts |
255
|
|
|
|
|
|
|
## ) |
256
|
|
|
|
|
|
|
## + default implementation wraps profile() method |
257
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::compare() |
258
|
|
|
|
|
|
|
sub compare { |
259
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
##-- common variables |
262
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
263
|
0
|
|
0
|
|
|
|
my $groupby = $opts{groupby} || [@{$coldb->attrs}]; |
264
|
0
|
0
|
0
|
|
|
|
$groupby = $coldb->groupby($groupby) if ($opts{_gbparse}//1); |
265
|
0
|
0
|
0
|
|
|
|
my %aopts = map {exists($opts{"a$_"}) ? ($_=>$opts{"a$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
266
|
0
|
0
|
0
|
|
|
|
my %bopts = map {exists($opts{"b$_"}) ? ($_=>$opts{"b$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
267
|
0
|
|
|
|
|
|
my %popts = (kbest=>-1,cutoff=>'',global=>0,strings=>0,packed=>1,fill=>1, groupby=>$groupby); |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
##-- get profiles to compare |
270
|
0
|
0
|
|
|
|
|
my $mpa = $reldb->profile($coldb,%opts, %aopts,%popts) or return undef; |
271
|
0
|
0
|
|
|
|
|
my $mpb = $reldb->profile($coldb,%opts, %bopts,%popts) or return undef; |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
##-- alignment and trimming |
274
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "compare(): align and trim (".($opts{global} ? 'global' : 'local').")"); |
275
|
0
|
|
|
|
|
|
my $ppairs = DiaColloDB::Profile::MultiDiff->align($mpa,$mpb); |
276
|
0
|
|
|
|
|
|
DiaColloDB::Profile::MultiDiff->trimPairs($ppairs, %opts); |
277
|
0
|
|
|
|
|
|
my $diff = DiaColloDB::Profile::MultiDiff->new($mpa,$mpb, titles=>$mpa->{titles}, diff=>$opts{diff}); |
278
|
0
|
0
|
|
|
|
|
$diff->trim( DiaColloDB::Profile::Diff->diffkbest($opts{diff})=>$opts{kbest} ) if (!$opts{global}); |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
##-- finalize: stringify |
281
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
282
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
283
|
0
|
|
|
|
|
|
$diff->stringify($groupby->{g2s}); |
284
|
|
|
|
|
|
|
} else { |
285
|
0
|
|
|
|
|
|
$diff->stringify($groupby->{g2txt}); |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
0
|
|
|
|
|
|
return $diff; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
## $mpdiff = $rel->diff($coldb, %opts) |
293
|
|
|
|
|
|
|
## + alias for compare() |
294
|
|
|
|
|
|
|
sub diff { |
295
|
0
|
|
|
0
|
1
|
|
my $rel = shift; |
296
|
0
|
|
|
|
|
|
return $rel->compare(@_); |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
##============================================================================== |
301
|
|
|
|
|
|
|
## Relation API: default: subprofile1() |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
## $prf = $rel->subprofile1(\@xids, %opts) |
304
|
|
|
|
|
|
|
## + get joint frequency profile for @xids (db must be opened) |
305
|
|
|
|
|
|
|
## + %opts: |
306
|
|
|
|
|
|
|
## groupby => \&gbsub, ##-- key-extractor $key2_or_undef = $gbsub->($i2) |
307
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
308
|
|
|
|
|
|
|
## opts => \%opts, ##-- pass-through for options to top-level profile() method |
309
|
|
|
|
|
|
|
sub subprofile1 { |
310
|
0
|
|
|
0
|
1
|
|
my ($rel,$ids,%opts) = @_; |
311
|
0
|
|
|
|
|
|
$rel->logconfess("subprofile(): abstract method called"); |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf, %opts) |
315
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
316
|
|
|
|
|
|
|
## + %opts: |
317
|
|
|
|
|
|
|
## groupby => \%gbreq, ##-- parsed groupby object |
318
|
|
|
|
|
|
|
## a2data => \%a2data, ##-- maps indexed attributes to associated datastructures |
319
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
320
|
|
|
|
|
|
|
## opts => \%opts, ##-- pass-through for options to top-level profile() method |
321
|
|
|
|
|
|
|
## + default implementation just returns \%slice2prf |
322
|
|
|
|
|
|
|
sub subprofile2 { |
323
|
|
|
|
|
|
|
#my ($rel,$slice2prf,%opts) = @_; |
324
|
0
|
|
|
0
|
1
|
|
return $_[1]; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
##============================================================================== |
328
|
|
|
|
|
|
|
## Relation API: default: qinfo() |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
331
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
332
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
333
|
|
|
|
|
|
|
## ( |
334
|
|
|
|
|
|
|
## qreqs => \@areqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
335
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
336
|
|
|
|
|
|
|
## ) |
337
|
|
|
|
|
|
|
## + returned hash \%qinfo should have keys: |
338
|
|
|
|
|
|
|
## ( |
339
|
|
|
|
|
|
|
## fcoef => $fcoef, ##-- frequency coefficient (2*$coldb->{dmax} for CoFreqs) |
340
|
|
|
|
|
|
|
## qtemplate => $qtemplate, ##-- query template with __W1.I1__ rsp __W2.I2__ replacing groupby fields |
341
|
|
|
|
|
|
|
## ) |
342
|
|
|
|
|
|
|
sub qinfo { |
343
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
344
|
0
|
|
|
|
|
|
$rel->logconfess("qinfo(): abstract method called"); |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
## (\@q1strs,\@q2strs,\@qxstrs,\@fstrs) = $rel->qinfoData($coldb,%opts) |
348
|
|
|
|
|
|
|
## + parses @opts{qw(qreqs gbreq)} into conditions on w1, w2 and metadata filters (for ddc linkup) |
349
|
|
|
|
|
|
|
## + call this from subclass qinfo() methods |
350
|
|
|
|
|
|
|
sub qinfoData { |
351
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
352
|
0
|
|
|
|
|
|
my (@q1strs,@q2strs,@qxstrs,@fstrs,$q,$q2); |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
##-- query clause |
355
|
0
|
|
|
|
|
|
foreach (@{$opts{qreqs}}) { |
|
0
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
|
$q = $coldb->attrQuery(@$_); |
357
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($q,'DDC::Any::CQFilter')) { |
|
|
0
|
|
|
|
|
|
358
|
0
|
|
|
|
|
|
push(@fstrs, $q->toString); |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
elsif (defined($q) && !UNIVERSAL::isa($q,'DDC::Any::CQTokAny')) { |
361
|
0
|
|
|
|
|
|
push(@q1strs, $q->toString); |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
##-- groupby clause |
366
|
0
|
|
|
|
|
|
my $xi=1; |
367
|
0
|
|
|
|
|
|
foreach (@{$opts{gbreq}{areqs}}) { |
|
0
|
|
|
|
|
|
|
368
|
0
|
0
|
|
|
|
|
if ($_->[0] =~ /^doc\.(.*)/) { |
369
|
0
|
|
|
|
|
|
push(@fstrs, DDC::Any::CQFHasField->new($1,"__W2.${xi}__")->toString); |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
else { |
372
|
0
|
|
|
|
|
|
push(@q2strs, DDC::Any::CQTokExact->new($_->[0],"__W2.${xi}__")->toString); |
373
|
|
|
|
|
|
|
} |
374
|
0
|
|
|
|
|
|
++$xi; |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
##-- common restrictions (trunk/2015-10-28: these are too expensive for large corpora (timeouts): ignore 'em |
378
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$p=/$coldb->{pgood}/)) if ($coldb->{pgood}); |
379
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$=!/$coldb->{pbad}/)) if ($coldb->{pbad}); |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
##-- utf8 |
382
|
0
|
|
|
|
|
|
foreach (@q1strs,@q2strs,@qxstrs,@fstrs) { |
383
|
0
|
0
|
|
|
|
|
utf8::decode($_) if (!utf8::is_utf8($_)); |
384
|
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
|
|
386
|
0
|
|
|
|
|
|
return (\@q1strs,\@q2strs,\@qxstrs,\@fstrs); |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
##============================================================================== |
391
|
|
|
|
|
|
|
## Footer |
392
|
|
|
|
|
|
|
1; |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
__END__ |