| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Compat::v0_09::Relation.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, relation API (abstract & utilities) |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Relation; |
|
7
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Compat; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
24
|
|
|
8
|
1
|
|
|
1
|
|
4
|
use DiaColloDB::Relation; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
35
|
|
|
9
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Utils qw(:si); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
31
|
|
|
10
|
1
|
|
|
1
|
|
173
|
use Algorithm::BinarySearch::Vec qw(:api); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
29
|
|
|
11
|
1
|
|
|
1
|
|
230
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
100
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
##============================================================================== |
|
14
|
|
|
|
|
|
|
## Globals & Constants |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Relation DiaColloDB::Compat); |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##============================================================================== |
|
19
|
|
|
|
|
|
|
## Constructors etc. |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
## $rel = CLASS_OR_OBJECT->new(%args) |
|
22
|
|
|
|
|
|
|
## + %args, object structure: see subclases |
|
23
|
|
|
|
|
|
|
sub new { |
|
24
|
0
|
|
|
0
|
1
|
|
my ($that,%args) = @_; |
|
25
|
0
|
|
0
|
|
|
|
return bless({ %args }, ref($that)||$that); |
|
26
|
|
|
|
|
|
|
} |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
##============================================================================== |
|
29
|
|
|
|
|
|
|
## Relation API: create |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->create($coldb, $tokdat_file, %opts) |
|
32
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
|
33
|
|
|
|
|
|
|
## a tt-style text file containing 1 token-id perl line with optional blank lines |
|
34
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
|
35
|
|
|
|
|
|
|
## + DISABLED |
|
36
|
|
|
|
|
|
|
BEGIN { |
|
37
|
1
|
|
|
1
|
|
8
|
*create = DiaColloDB::Compat->nocompat('create'); |
|
38
|
|
|
|
|
|
|
} |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
##============================================================================== |
|
41
|
|
|
|
|
|
|
## Relation API: union |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
|
44
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
|
45
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$argrel,\@xi2u],...) |
|
46
|
|
|
|
|
|
|
## of relation-objects $argrel and tuple-id maps \@xi2u for $rel |
|
47
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
|
48
|
|
|
|
|
|
|
## + implicitly flushes the new index |
|
49
|
|
|
|
|
|
|
## + DISABLED |
|
50
|
|
|
|
|
|
|
BEGIN { |
|
51
|
1
|
|
|
1
|
|
6
|
*union = DiaColloDB::Compat->nocompat('union'); |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
##============================================================================== |
|
55
|
|
|
|
|
|
|
## Relation API: info |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
|
58
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
|
59
|
|
|
|
|
|
|
sub dbinfo { |
|
60
|
0
|
|
|
0
|
0
|
|
my $rel = shift; |
|
61
|
0
|
|
|
|
|
|
my $info = { class=>ref($rel) }; |
|
62
|
0
|
0
|
|
|
|
|
if ($rel->can('du')) { |
|
63
|
0
|
|
|
|
|
|
$info->{du_b} = $rel->du(); |
|
64
|
0
|
|
|
|
|
|
$info->{du_h} = si_str($info->{du_b}); |
|
65
|
|
|
|
|
|
|
} |
|
66
|
0
|
|
|
|
|
|
return $info; |
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
##============================================================================== |
|
71
|
|
|
|
|
|
|
## Relation API: profiling & comparison: top-level |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
74
|
|
|
|
|
|
|
## Relation API: profile |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
## $mprf = $rel->profile($coldb, %opts) |
|
77
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
|
78
|
|
|
|
|
|
|
## + %opts: |
|
79
|
|
|
|
|
|
|
## ( |
|
80
|
|
|
|
|
|
|
## ##-- selection parameters |
|
81
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
|
82
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
|
83
|
|
|
|
|
|
|
## ## |
|
84
|
|
|
|
|
|
|
## ##-- aggregation parameters |
|
85
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
|
86
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
|
87
|
|
|
|
|
|
|
## ## |
|
88
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
|
89
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
|
90
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
|
91
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
|
92
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
|
93
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
|
94
|
|
|
|
|
|
|
## ## |
|
95
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
|
96
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
|
97
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (default=don't) |
|
98
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
|
99
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs subclass only) |
|
100
|
|
|
|
|
|
|
## ) |
|
101
|
|
|
|
|
|
|
## + default implementation |
|
102
|
|
|
|
|
|
|
## - calls $rel->subprofile1() for every requested date-slice, then |
|
103
|
|
|
|
|
|
|
## - calls $rel->subprofile2() to compute item2 frequencies, and finally |
|
104
|
|
|
|
|
|
|
## - collects the result in a DiaColloDB::Profile::Multi object |
|
105
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::profile() |
|
106
|
|
|
|
|
|
|
sub profile { |
|
107
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
##-- sanity check(s) |
|
110
|
0
|
0
|
0
|
|
|
|
$reldb->logconfess("profile(): incompatible DB class ".ref($coldb).", v".($coldb->{version}//'???')." for directory $coldb->{dbdir}") |
|
111
|
|
|
|
|
|
|
if (!$coldb->isa('DiaColloDB::Compat::v0_09::DiaColloDB')); |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
##-- common variables |
|
114
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
##-- variables: by attribute |
|
117
|
0
|
|
|
|
|
|
my $groupby= $coldb->groupby($opts{groupby}); |
|
118
|
0
|
|
|
|
|
|
my $attrs = $coldb->attrs(); |
|
119
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData($attrs); |
|
120
|
0
|
|
|
|
|
|
my $a2data = {map {($_->{a}=>$_)} @$adata}; |
|
|
0
|
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
|
my $areqs = $coldb->parseRequest($opts{query}, logas=>'query', default=>$attrs->[0]); |
|
122
|
0
|
|
|
|
|
|
foreach (@$areqs) { |
|
123
|
0
|
|
|
|
|
|
$a2data->{$_->[0]}{req} = $_->[1]; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
##-- sanity check(s) |
|
127
|
0
|
0
|
|
|
|
|
if (!@$areqs) { |
|
128
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no target attributes specified (supported attributes: ".join(' ',@{$coldb->attrs}).")"); |
|
|
0
|
|
|
|
|
|
|
|
129
|
0
|
|
|
|
|
|
return undef; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
0
|
0
|
|
|
|
|
if (!@{$groupby->{attrs}}) { |
|
|
0
|
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): cannot profile with empty groupby clause"); |
|
133
|
0
|
|
|
|
|
|
return undef; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
##-- prepare: get target IDs (by attribute) |
|
137
|
0
|
|
|
|
|
|
my ($ac); |
|
138
|
0
|
|
0
|
|
|
|
foreach $ac (grep {($_->{req}//'') ne ''} @$adata) { |
|
|
0
|
|
|
|
|
|
|
|
139
|
0
|
|
|
|
|
|
$ac->{reqids} = $coldb->enumIds($ac->{enum},$ac->{req},logLevel=>$logProfile,logPrefix=>"profile(): get target $ac->{a}-values"); |
|
140
|
0
|
0
|
|
|
|
|
if (!@{$ac->{reqids}}) { |
|
|
0
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no $ac->{a}-attribute values match user query '$ac->{req}'"); |
|
142
|
0
|
|
|
|
|
|
return undef; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
##-- prepare: get tuple-ids (by attribute) |
|
147
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get target tuple IDs"); |
|
148
|
0
|
|
|
|
|
|
my $xivec = undef; |
|
149
|
0
|
|
|
|
|
|
my $nbits = undef; |
|
150
|
0
|
|
|
|
|
|
my $pack_xv = undef; |
|
151
|
0
|
|
|
|
|
|
my $test_xv = undef; ##-- test value via vec() |
|
152
|
0
|
|
|
|
|
|
foreach $ac (grep {$_->{reqids}} @$adata) { |
|
|
0
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
##-- sanity checks |
|
154
|
0
|
|
0
|
|
|
|
$nbits //= $ac->{a2x}{len_i}*8; |
|
155
|
0
|
|
0
|
|
|
|
$pack_xv //= "$ac->{a2x}{pack_i}*"; |
|
156
|
0
|
0
|
|
|
|
|
vec($test_xv='',0,$nbits) = 0x12345678 if (!defined($test_xv)); |
|
157
|
|
|
|
|
|
|
$reldb->logconfess("profile(): multimap pack-size mismatch: nbits($ac->{a2x}{base}.*) != $nbits") |
|
158
|
0
|
0
|
|
|
|
|
if ($ac->{a2x}{len_i} != $nbits/8); |
|
159
|
|
|
|
|
|
|
$reldb->logconfess("profile(): multimap pack-template '$ac->{a2x}{pack_i}' for $ac->{a2x}{base}.* is not big-endian") |
|
160
|
0
|
0
|
|
|
|
|
if (pack($ac->{a2x}{pack_i},0x12345678) ne $test_xv); |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
##-- target set construction |
|
163
|
0
|
|
|
|
|
|
my $axiset = ''; |
|
164
|
0
|
|
|
|
|
|
$axiset = vunion($axiset, $ac->{a2x}->fetchraw($_), $nbits) foreach (@{$ac->{reqids}}); |
|
|
0
|
|
|
|
|
|
|
|
165
|
0
|
0
|
|
|
|
|
$xivec = defined($xivec) ? vintersect($xivec, $axiset, $nbits) : $axiset; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
##-- check maxExpand |
|
169
|
0
|
|
0
|
|
|
|
$nbits //= packsize($coldb->{pack_id}); |
|
170
|
0
|
0
|
|
|
|
|
my $nxis = $xivec ? length($xivec)/($nbits/8) : 0; |
|
171
|
0
|
0
|
0
|
|
|
|
if ($coldb->{maxExpand}>0 && $nxis > $coldb->{maxExpand}) { |
|
172
|
0
|
|
|
|
|
|
$reldb->logwarn("profile(): Warning: target set exceeds max expansion size ($nxis > $coldb->{maxExpand}): truncating"); |
|
173
|
0
|
|
|
|
|
|
substr($xivec, -($nxis - $coldb->{maxExpand})*($nbits/8)) = ''; |
|
174
|
|
|
|
|
|
|
} |
|
175
|
0
|
0
|
|
|
|
|
my $xis = [$xivec ? unpack($pack_xv, $xivec) : qw()]; |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
##-- prepare: parse and filter tuples |
|
178
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): parse and filter target tuples (date=$opts{date}, slice=$opts{slice}, fill=$opts{fill})"); |
|
179
|
0
|
|
|
|
|
|
my $d2xis = $coldb->xidsByDate($xis, @opts{qw(date slice fill)}); |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
##-- profile: get relation profiles (by date-slice, pass 1: f12) |
|
182
|
0
|
|
0
|
|
|
|
my $onepass = $opts{onepass} || ($reldb->can('subprofile2') eq \&subprofile2); |
|
183
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): ".($onepass ? 'single-pass' : 'pass-1')); |
|
184
|
0
|
|
|
|
|
|
my %d2prf = qw(); |
|
185
|
0
|
|
|
|
|
|
my @slices = sort {$a<=>$b} keys %$d2xis; |
|
|
0
|
|
|
|
|
|
|
|
186
|
0
|
|
|
|
|
|
my ($d,$prf); |
|
187
|
0
|
|
|
|
|
|
foreach $d (@slices) { |
|
188
|
0
|
|
|
|
|
|
$prf = $reldb->subprofile1($d2xis->{$d}, groupby=>$groupby->{xi2g}, coldb=>$coldb, onepass=>$onepass, opts=>\%opts); |
|
189
|
0
|
|
|
|
|
|
$prf->{label} = $d; |
|
190
|
0
|
|
|
|
|
|
$prf->{titles} = $groupby->{titles}; |
|
191
|
0
|
|
|
|
|
|
$d2prf{$d} = $prf; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
##-- profile: complete slice-wise profiles (pass 2: f2) |
|
195
|
0
|
0
|
|
|
|
|
if (!$onepass) { |
|
196
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): pass-2"); |
|
197
|
0
|
|
|
|
|
|
$reldb->subprofile2(\%d2prf, %opts, coldb=>$coldb, groupby=>$groupby, a2data=>$a2data, opts=>\%opts); |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
##-- compile & collect: multi-profile |
|
201
|
0
|
|
|
|
|
|
foreach $prf (values %d2prf) { |
|
202
|
0
|
|
|
|
|
|
$prf->compile($opts{score}, eps=>$opts{eps}); |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
my $mp = DiaColloDB::Profile::Multi->new(profiles=>[@d2prf{@slices}], |
|
205
|
|
|
|
|
|
|
titles=>$groupby->{titles}, |
|
206
|
0
|
|
|
|
|
|
qinfo =>$reldb->qinfo($coldb, %opts, qreqs=>$areqs, gbreq=>$groupby), |
|
207
|
|
|
|
|
|
|
); |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
##-- trim and stringify |
|
210
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): trim and stringify"); |
|
211
|
0
|
|
|
|
|
|
$mp->trim(%opts, empty=>!$opts{fill}); |
|
212
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
|
213
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
|
214
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2s}); |
|
215
|
|
|
|
|
|
|
} else { |
|
216
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2txt}); |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
} |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
##-- return |
|
221
|
0
|
|
|
|
|
|
return $mp; |
|
222
|
|
|
|
|
|
|
} |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
226
|
|
|
|
|
|
|
## Relation API: comparison (diff) |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
## $mpdiff = $rel->compare($coldb, %opts) |
|
229
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
|
230
|
|
|
|
|
|
|
## + %opts: |
|
231
|
|
|
|
|
|
|
## ( |
|
232
|
|
|
|
|
|
|
## ##-- selection parameters |
|
233
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
|
234
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
|
235
|
|
|
|
|
|
|
## ## |
|
236
|
|
|
|
|
|
|
## ##-- aggregation parameters |
|
237
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
|
238
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
|
239
|
|
|
|
|
|
|
## ## |
|
240
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
|
241
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
|
242
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
|
243
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
|
244
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
|
245
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
|
246
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (adiff|diff|sum|min|max|avg|havg); default='adiff' |
|
247
|
|
|
|
|
|
|
## ## |
|
248
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
|
249
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
|
250
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (override stringification; default=don't) |
|
251
|
|
|
|
|
|
|
## ## |
|
252
|
|
|
|
|
|
|
## ##-- sublcass abstraction parameters |
|
253
|
|
|
|
|
|
|
## _gbparse => $bool, ##-- if true (default), 'groupby' clause will be parsed only once, using $coldb->groupby() method |
|
254
|
|
|
|
|
|
|
## _abkeys => \@abkeys, ##-- additional key-suffixes KEY s.t. (KEY=>VAL) gets passed to profile() calls if e.g. (aKEY=>VAL) is in %opts |
|
255
|
|
|
|
|
|
|
## ) |
|
256
|
|
|
|
|
|
|
## + default implementation wraps profile() method |
|
257
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::compare() |
|
258
|
|
|
|
|
|
|
sub compare { |
|
259
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
##-- common variables |
|
262
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
|
263
|
0
|
|
0
|
|
|
|
my $groupby = $opts{groupby} || [@{$coldb->attrs}]; |
|
264
|
0
|
0
|
0
|
|
|
|
$groupby = $coldb->groupby($groupby) if ($opts{_gbparse}//1); |
|
265
|
0
|
0
|
0
|
|
|
|
my %aopts = map {exists($opts{"a$_"}) ? ($_=>$opts{"a$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
266
|
0
|
0
|
0
|
|
|
|
my %bopts = map {exists($opts{"b$_"}) ? ($_=>$opts{"b$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
267
|
0
|
|
|
|
|
|
my %popts = (kbest=>-1,cutoff=>'',global=>0,strings=>0,packed=>1,fill=>1, groupby=>$groupby); |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
##-- get profiles to compare |
|
270
|
0
|
0
|
|
|
|
|
my $mpa = $reldb->profile($coldb,%opts, %aopts,%popts) or return undef; |
|
271
|
0
|
0
|
|
|
|
|
my $mpb = $reldb->profile($coldb,%opts, %bopts,%popts) or return undef; |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
##-- alignment and trimming |
|
274
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "compare(): align and trim (".($opts{global} ? 'global' : 'local').")"); |
|
275
|
0
|
|
|
|
|
|
my $ppairs = DiaColloDB::Profile::MultiDiff->align($mpa,$mpb); |
|
276
|
0
|
|
|
|
|
|
DiaColloDB::Profile::MultiDiff->trimPairs($ppairs, %opts); |
|
277
|
0
|
|
|
|
|
|
my $diff = DiaColloDB::Profile::MultiDiff->new($mpa,$mpb, titles=>$mpa->{titles}, diff=>$opts{diff}); |
|
278
|
0
|
0
|
|
|
|
|
$diff->trim( DiaColloDB::Profile::Diff->diffkbest($opts{diff})=>$opts{kbest} ) if (!$opts{global}); |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
##-- finalize: stringify |
|
281
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
|
282
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
|
283
|
0
|
|
|
|
|
|
$diff->stringify($groupby->{g2s}); |
|
284
|
|
|
|
|
|
|
} else { |
|
285
|
0
|
|
|
|
|
|
$diff->stringify($groupby->{g2txt}); |
|
286
|
|
|
|
|
|
|
} |
|
287
|
|
|
|
|
|
|
} |
|
288
|
|
|
|
|
|
|
|
|
289
|
0
|
|
|
|
|
|
return $diff; |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
## $mpdiff = $rel->diff($coldb, %opts) |
|
293
|
|
|
|
|
|
|
## + alias for compare() |
|
294
|
|
|
|
|
|
|
sub diff { |
|
295
|
0
|
|
|
0
|
1
|
|
my $rel = shift; |
|
296
|
0
|
|
|
|
|
|
return $rel->compare(@_); |
|
297
|
|
|
|
|
|
|
} |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
##============================================================================== |
|
301
|
|
|
|
|
|
|
## Relation API: default: subprofile1() |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
## $prf = $rel->subprofile1(\@xids, %opts) |
|
304
|
|
|
|
|
|
|
## + get joint frequency profile for @xids (db must be opened) |
|
305
|
|
|
|
|
|
|
## + %opts: |
|
306
|
|
|
|
|
|
|
## groupby => \&gbsub, ##-- key-extractor $key2_or_undef = $gbsub->($i2) |
|
307
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
|
308
|
|
|
|
|
|
|
## opts => \%opts, ##-- pass-through for options to top-level profile() method |
|
309
|
|
|
|
|
|
|
sub subprofile1 { |
|
310
|
0
|
|
|
0
|
1
|
|
my ($rel,$ids,%opts) = @_; |
|
311
|
0
|
|
|
|
|
|
$rel->logconfess("subprofile(): abstract method called"); |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf, %opts) |
|
315
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
|
316
|
|
|
|
|
|
|
## + %opts: |
|
317
|
|
|
|
|
|
|
## groupby => \%gbreq, ##-- parsed groupby object |
|
318
|
|
|
|
|
|
|
## a2data => \%a2data, ##-- maps indexed attributes to associated datastructures |
|
319
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
|
320
|
|
|
|
|
|
|
## opts => \%opts, ##-- pass-through for options to top-level profile() method |
|
321
|
|
|
|
|
|
|
## + default implementation just returns \%slice2prf |
|
322
|
|
|
|
|
|
|
sub subprofile2 { |
|
323
|
|
|
|
|
|
|
#my ($rel,$slice2prf,%opts) = @_; |
|
324
|
0
|
|
|
0
|
1
|
|
return $_[1]; |
|
325
|
|
|
|
|
|
|
} |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
##============================================================================== |
|
328
|
|
|
|
|
|
|
## Relation API: default: qinfo() |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
|
331
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
|
332
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
|
333
|
|
|
|
|
|
|
## ( |
|
334
|
|
|
|
|
|
|
## qreqs => \@areqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
|
335
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
|
336
|
|
|
|
|
|
|
## ) |
|
337
|
|
|
|
|
|
|
## + returned hash \%qinfo should have keys: |
|
338
|
|
|
|
|
|
|
## ( |
|
339
|
|
|
|
|
|
|
## fcoef => $fcoef, ##-- frequency coefficient (2*$coldb->{dmax} for CoFreqs) |
|
340
|
|
|
|
|
|
|
## qtemplate => $qtemplate, ##-- query template with __W1.I1__ rsp __W2.I2__ replacing groupby fields |
|
341
|
|
|
|
|
|
|
## ) |
|
342
|
|
|
|
|
|
|
sub qinfo { |
|
343
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
|
344
|
0
|
|
|
|
|
|
$rel->logconfess("qinfo(): abstract method called"); |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
## (\@q1strs,\@q2strs,\@qxstrs,\@fstrs) = $rel->qinfoData($coldb,%opts) |
|
348
|
|
|
|
|
|
|
## + parses @opts{qw(qreqs gbreq)} into conditions on w1, w2 and metadata filters (for ddc linkup) |
|
349
|
|
|
|
|
|
|
## + call this from subclass qinfo() methods |
|
350
|
|
|
|
|
|
|
sub qinfoData { |
|
351
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
|
352
|
0
|
|
|
|
|
|
my (@q1strs,@q2strs,@qxstrs,@fstrs,$q,$q2); |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
##-- query clause |
|
355
|
0
|
|
|
|
|
|
foreach (@{$opts{qreqs}}) { |
|
|
0
|
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
|
$q = $coldb->attrQuery(@$_); |
|
357
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($q,'DDC::Any::CQFilter')) { |
|
|
|
0
|
|
|
|
|
|
|
358
|
0
|
|
|
|
|
|
push(@fstrs, $q->toString); |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
elsif (defined($q) && !UNIVERSAL::isa($q,'DDC::Any::CQTokAny')) { |
|
361
|
0
|
|
|
|
|
|
push(@q1strs, $q->toString); |
|
362
|
|
|
|
|
|
|
} |
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
##-- groupby clause |
|
366
|
0
|
|
|
|
|
|
my $xi=1; |
|
367
|
0
|
|
|
|
|
|
foreach (@{$opts{gbreq}{areqs}}) { |
|
|
0
|
|
|
|
|
|
|
|
368
|
0
|
0
|
|
|
|
|
if ($_->[0] =~ /^doc\.(.*)/) { |
|
369
|
0
|
|
|
|
|
|
push(@fstrs, DDC::Any::CQFHasField->new($1,"__W2.${xi}__")->toString); |
|
370
|
|
|
|
|
|
|
} |
|
371
|
|
|
|
|
|
|
else { |
|
372
|
0
|
|
|
|
|
|
push(@q2strs, DDC::Any::CQTokExact->new($_->[0],"__W2.${xi}__")->toString); |
|
373
|
|
|
|
|
|
|
} |
|
374
|
0
|
|
|
|
|
|
++$xi; |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
##-- common restrictions (trunk/2015-10-28: these are too expensive for large corpora (timeouts): ignore 'em |
|
378
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$p=/$coldb->{pgood}/)) if ($coldb->{pgood}); |
|
379
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$=!/$coldb->{pbad}/)) if ($coldb->{pbad}); |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
##-- utf8 |
|
382
|
0
|
|
|
|
|
|
foreach (@q1strs,@q2strs,@qxstrs,@fstrs) { |
|
383
|
0
|
0
|
|
|
|
|
utf8::decode($_) if (!utf8::is_utf8($_)); |
|
384
|
|
|
|
|
|
|
} |
|
385
|
|
|
|
|
|
|
|
|
386
|
0
|
|
|
|
|
|
return (\@q1strs,\@q2strs,\@qxstrs,\@fstrs); |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
##============================================================================== |
|
391
|
|
|
|
|
|
|
## Footer |
|
392
|
|
|
|
|
|
|
1; |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
__END__ |