line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Relation.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, relation API (abstract & utilities) |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Relation; |
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Persistent; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
32
|
|
8
|
1
|
|
|
1
|
|
545
|
use DiaColloDB::Profile; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
9
|
1
|
|
|
1
|
|
487
|
use DiaColloDB::Profile::Multi; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
43
|
|
10
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Utils qw(:si :pack :math); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
11
|
1
|
|
|
1
|
|
897
|
use Algorithm::BinarySearch::Vec qw(:api); |
|
1
|
|
|
|
|
5755
|
|
|
1
|
|
|
|
|
55
|
|
12
|
1
|
|
|
1
|
|
288
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
2746
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
##============================================================================== |
15
|
|
|
|
|
|
|
## Globals & Constants |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Persistent); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
##============================================================================== |
20
|
|
|
|
|
|
|
## Constructors etc. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## $rel = CLASS_OR_OBJECT->new(%args) |
23
|
|
|
|
|
|
|
## + %args, object structure: see subclases |
24
|
|
|
|
|
|
|
sub new { |
25
|
0
|
|
|
0
|
1
|
|
my ($that,%args) = @_; |
26
|
0
|
|
0
|
|
|
|
return bless({ %args }, ref($that)||$that); |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
##============================================================================== |
30
|
|
|
|
|
|
|
## Relation API: create |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->create($coldb, $tokdat_file, %opts) |
33
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
34
|
|
|
|
|
|
|
## a tt-style text file containing with lines of the form: |
35
|
|
|
|
|
|
|
## TID DATE ##-- single token |
36
|
|
|
|
|
|
|
## "\n" ##-- blank line --> EOS |
37
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
38
|
|
|
|
|
|
|
sub create { |
39
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,$datfile,%opts) = @_; |
40
|
0
|
|
|
|
|
|
$rel->logconfess($coldb->{error}="create(): abstract method called"); |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
##============================================================================== |
44
|
|
|
|
|
|
|
## Relation API: union |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
47
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
48
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$argrel,\@ti2u],...) |
49
|
|
|
|
|
|
|
## of relation-objects $argrel and tuple-id maps \@ti2u for $rel |
50
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
51
|
|
|
|
|
|
|
## + implicitly flushes the new index |
52
|
|
|
|
|
|
|
sub union { |
53
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb, $pairs,%opts) = @_; |
54
|
0
|
|
|
|
|
|
$rel->logconfess($coldb->{error}="union(): abstract method called"); |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
##============================================================================== |
58
|
|
|
|
|
|
|
## Relation API: info |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
61
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
62
|
|
|
|
|
|
|
sub dbinfo { |
63
|
0
|
|
|
0
|
0
|
|
my $rel = shift; |
64
|
0
|
|
|
|
|
|
my $info = { class=>ref($rel) }; |
65
|
0
|
0
|
|
|
|
|
if ($rel->can('du')) { |
66
|
0
|
|
|
|
|
|
$info->{du_b} = $rel->du(); |
67
|
0
|
|
|
|
|
|
$info->{du_h} = si_str($info->{du_b}); |
68
|
|
|
|
|
|
|
} |
69
|
0
|
|
|
|
|
|
return $info; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
##============================================================================== |
74
|
|
|
|
|
|
|
## Relation API: profiling & comparison: top-level |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
77
|
|
|
|
|
|
|
## Relation API: profile |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
## $mprf = $rel->profile($coldb, %opts) |
80
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
81
|
|
|
|
|
|
|
## + %opts: |
82
|
|
|
|
|
|
|
## ( |
83
|
|
|
|
|
|
|
## ##-- selection parameters |
84
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
85
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
86
|
|
|
|
|
|
|
## ## |
87
|
|
|
|
|
|
|
## ##-- aggregation parameters |
88
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
89
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
90
|
|
|
|
|
|
|
## ## |
91
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
92
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
93
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
94
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
95
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
96
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
97
|
|
|
|
|
|
|
## extend => \%label2gkeys, ##-- maps slice-labels to selected (packed) group-keys, for extend() method |
98
|
|
|
|
|
|
|
## ## |
99
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
100
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
101
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (default=don't) |
102
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
103
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs subclass only, >= v0.09.001) |
104
|
|
|
|
|
|
|
## ) |
105
|
|
|
|
|
|
|
## + default implementation |
106
|
|
|
|
|
|
|
## - parses request and extracts target tuple-ids |
107
|
|
|
|
|
|
|
## - calls $rel->subprofile1() to compute slice-wise joint frequency profiles (f12) |
108
|
|
|
|
|
|
|
## - calls $rel->subprofile2() to compute independent collocate frequencies (f2), and finally |
109
|
|
|
|
|
|
|
## - collects the result in a DiaColloDB::Profile::Multi object |
110
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::profile() |
111
|
|
|
|
|
|
|
sub profile { |
112
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
##-- common variables |
115
|
0
|
|
|
|
|
|
$opts{coldb} = $coldb; ##-- pass-down to subprofile() methods |
116
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
##-- variables: by attribute |
119
|
0
|
|
|
|
|
|
my $groupby= $opts{groupby} = $coldb->groupby($opts{groupby}); |
120
|
0
|
|
|
|
|
|
my $attrs = $coldb->attrs(); |
121
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData($attrs); |
122
|
0
|
|
|
|
|
|
my $a2data = $opts{a2data} = {map {($_->{a}=>$_)} @$adata}; |
|
0
|
|
|
|
|
|
|
123
|
0
|
|
|
|
|
|
my $areqs = $coldb->parseRequest($opts{query}, logas=>'query', default=>$attrs->[0], qref=>\$opts{qobj}); |
124
|
0
|
|
|
|
|
|
foreach (@$areqs) { |
125
|
0
|
|
|
|
|
|
$a2data->{$_->[0]}{req} = $_->[1]; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
##-- sanity check(s) |
129
|
0
|
0
|
|
|
|
|
if (!@$areqs) { |
130
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no target attributes specified (supported attributes: ".join(' ',@{$coldb->attrs}).")"); |
|
0
|
|
|
|
|
|
|
131
|
0
|
|
|
|
|
|
return undef; |
132
|
|
|
|
|
|
|
} |
133
|
0
|
0
|
|
|
|
|
if (!@{$groupby->{attrs}}) { |
|
0
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): cannot profile with empty groupby clause"); |
135
|
0
|
|
|
|
|
|
return undef; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
##-- prepare: get target IDs (by attribute) |
139
|
0
|
|
|
|
|
|
my ($ac); |
140
|
0
|
|
0
|
|
|
|
foreach $ac (grep {($_->{req}//'') ne ''} @$adata) { |
|
0
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
$ac->{reqids} = $coldb->enumIds($ac->{enum},$ac->{req},logLevel=>$logProfile,logPrefix=>"profile(): get target $ac->{a}-values"); |
142
|
0
|
0
|
|
|
|
|
if (!@{$ac->{reqids}}) { |
|
0
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no $ac->{a}-attribute values match user query '$ac->{req}'"); |
144
|
0
|
|
|
|
|
|
return undef; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
##-- prepare: get tuple-ids (by attribute) |
149
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get target tuple IDs"); |
150
|
0
|
|
|
|
|
|
my $tivec = undef; |
151
|
0
|
|
|
|
|
|
my $nbits = undef; |
152
|
0
|
|
|
|
|
|
my $pack_tv = undef; |
153
|
0
|
|
|
|
|
|
my $test_tv = undef; ##-- test value via vec() |
154
|
0
|
|
|
|
|
|
foreach $ac (grep {$_->{reqids}} @$adata) { |
|
0
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
##-- sanity checks |
156
|
0
|
|
0
|
|
|
|
$nbits //= $ac->{a2t}{len_i}*8; |
157
|
0
|
|
0
|
|
|
|
$pack_tv //= "$ac->{a2t}{pack_i}*"; |
158
|
0
|
0
|
|
|
|
|
vec($test_tv='',0,$nbits) = 0x12345678 if (!defined($test_tv)); |
159
|
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): multimap pack-size mismatch: nbits($ac->{a2t}{base}.*) != $nbits") |
160
|
0
|
0
|
|
|
|
|
if ($ac->{a2t}{len_i} != $nbits/8); |
161
|
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): multimap pack-template '$ac->{a2t}{pack_i}' for $ac->{a2t}{base}.* is not big-endian") |
162
|
0
|
0
|
|
|
|
|
if (pack($ac->{a2t}{pack_i},0x12345678) ne $test_tv); |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
##-- target set construction |
165
|
0
|
|
|
|
|
|
my $atiset = ''; |
166
|
0
|
|
|
|
|
|
$atiset = vunion($atiset, $ac->{a2t}->fetchraw($_), $nbits) foreach (@{$ac->{reqids}}); |
|
0
|
|
|
|
|
|
|
167
|
0
|
0
|
|
|
|
|
$tivec = defined($tivec) ? vintersect($tivec, $atiset, $nbits) : $atiset; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
##-- check maxExpand |
171
|
0
|
|
0
|
|
|
|
$nbits //= packsize($coldb->{pack_id}); |
172
|
0
|
0
|
|
|
|
|
my $ntis = $tivec ? length($tivec)/($nbits/8) : 0; |
173
|
0
|
0
|
0
|
|
|
|
if ($coldb->{maxExpand}>0 && $ntis > $coldb->{maxExpand}) { |
174
|
0
|
|
|
|
|
|
$reldb->logwarn("profile(): Warning: target set exceeds max expansion size ($ntis > $coldb->{maxExpand}): truncating"); |
175
|
0
|
|
|
|
|
|
substr($tivec, -($ntis - $coldb->{maxExpand})*($nbits/8)) = ''; |
176
|
|
|
|
|
|
|
} |
177
|
0
|
0
|
|
|
|
|
my $tis = [$tivec ? unpack($pack_tv, $tivec) : qw()]; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
##-- parse date request (no filtering here) |
180
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): parse date request (date=$opts{date}, slice=$opts{slice}, fill=$opts{fill})"); |
181
|
0
|
|
|
|
|
|
my $dreq = $opts{dreq} = $coldb->parseDateRequest(@opts{qw(date slice fill)}); |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
##-- profile: get relation profiles (by date-slice, pass 1: f12) |
184
|
0
|
|
0
|
|
|
|
my $onepass = $opts{onepass} || ($reldb->can('subprofile2') eq \&subprofile2); |
185
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): ".($onepass ? 'single-pass' : 'pass-1')); |
186
|
0
|
|
|
|
|
|
my $s2prf = $reldb->subprofile1($tis, \%opts); |
187
|
0
|
|
|
|
|
|
foreach (keys %$s2prf) { |
188
|
0
|
|
|
|
|
|
@{$s2prf->{$_}}{qw(label titles)} = ($_,$groupby->{titles}); |
|
0
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
##-- profile/extend: insert extension keys |
192
|
0
|
|
|
|
|
|
my $extend = $opts{extend}; |
193
|
0
|
0
|
|
|
|
|
if ($extend) { |
194
|
0
|
|
|
|
|
|
my ($slice,$prf,$sxkeys); |
195
|
0
|
|
|
|
|
|
while (($slice,$prf) = each %$s2prf) { |
196
|
0
|
|
0
|
|
|
|
$sxkeys = $extend->{$slice}//{}; |
197
|
0
|
|
0
|
|
|
|
$prf->{f12}{$_} //= 0 foreach (keys %$sxkeys); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
##-- profile: complete slice-wise profiles (pass 2: f2) |
202
|
0
|
0
|
0
|
|
|
|
if (!$onepass || !$opts{onepass}) { |
203
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): pass-2"); |
204
|
0
|
|
|
|
|
|
$reldb->subprofile2($s2prf, \%opts); |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
##-- compile & collect: multi-profile |
208
|
0
|
|
|
|
|
|
foreach (values %$s2prf) { |
209
|
0
|
|
|
|
|
|
$_->compile($opts{score}, eps=>$opts{eps}); |
210
|
|
|
|
|
|
|
} |
211
|
0
|
|
|
|
|
|
my $mp = DiaColloDB::Profile::Multi->new(profiles=>[@$s2prf{sort {$a<=>$b} keys %$s2prf}], |
212
|
|
|
|
|
|
|
titles=>$groupby->{titles}, |
213
|
0
|
|
|
|
|
|
qinfo =>$reldb->qinfo($coldb, %opts, qreqs=>$areqs, gbreq=>$groupby), |
214
|
|
|
|
|
|
|
); |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
##-- trim and stringify |
217
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): trim and stringify"); |
218
|
0
|
|
|
|
|
|
$mp->trim(%opts, empty=>!$opts{fill}); |
219
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
220
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
221
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2s}); |
222
|
|
|
|
|
|
|
} else { |
223
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2txt}); |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
##-- return |
228
|
0
|
|
|
|
|
|
return $mp; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
232
|
|
|
|
|
|
|
## Relation API: extend (pass-2 for multi-clients) |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
## $mprf = $rel->extend($coldb, %opts) |
235
|
|
|
|
|
|
|
## + extend f12 and f2 frequencies for \%slice2keys = $opts{slice2keys} |
236
|
|
|
|
|
|
|
## + calls $rel->profile($coldb, %opts,extend=>\%slice2keys_packed) |
237
|
|
|
|
|
|
|
## + returns a DiaColloDB::Profile::Multi containing the appropriate f12 and f2 entries |
238
|
|
|
|
|
|
|
sub extend { |
239
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
##-- common variables |
242
|
0
|
|
|
|
|
|
$opts{coldb} = $coldb; ##-- pass-down to subprofile() methods |
243
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
##-- sanity check(s) |
246
|
0
|
|
0
|
|
|
|
my $slice2keys = $opts{slice2keys} || $opts{extend}; |
247
|
0
|
0
|
|
|
|
|
if (!$slice2keys) { |
|
|
0
|
|
|
|
|
|
248
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="extend(): no 'slice2keys' or 'extend' parameter specified!"); |
249
|
0
|
|
|
|
|
|
return undef; |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
elsif (!UNIVERSAL::isa($slice2keys,'HASH')) { |
252
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="extend(): failed to parse 'slice2keys' or 'extend' parameter"); |
253
|
0
|
|
|
|
|
|
return undef; |
254
|
|
|
|
|
|
|
} |
255
|
0
|
|
|
|
|
|
delete $opts{slice2keys}; |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
##-- get packed group-keys (avoid temporary dummy-profiles: they can't handle unknown group-components) |
258
|
0
|
|
|
|
|
|
my $groupby = $opts{groupby} = $coldb->groupby($opts{groupby}); |
259
|
0
|
|
|
|
|
|
my $s2gx = $groupby->{s2gx}; |
260
|
0
|
|
|
|
|
|
my ($xslice,$xkeys, $xgkeys,$xkey,$xg, %extend); |
261
|
0
|
|
|
|
|
|
while (($xslice,$xkeys) = each %$slice2keys) { |
262
|
0
|
|
|
|
|
|
$xgkeys = $extend{$xslice} = {}; |
263
|
0
|
0
|
|
|
|
|
foreach $xkey (UNIVERSAL::isa($xkeys,'HASH') ? keys(%$xkeys) : @$xkeys) { |
264
|
0
|
0
|
|
|
|
|
next if (!defined($xg = $s2gx->($xkey))); |
265
|
0
|
|
|
|
|
|
$xgkeys->{$xg} = undef; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
##-- guts: dispatch to profile() |
270
|
0
|
|
|
|
|
|
my $mp = $reldb->profile($coldb, %opts, kbest=>0,kbesta=>0,cutoff=>undef,global=>0,fill=>1, extend=>\%extend); |
271
|
|
|
|
|
|
|
|
272
|
0
|
|
|
|
|
|
return $mp; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
276
|
|
|
|
|
|
|
## Relation API: comparison (diff) |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
## $mpdiff = $rel->compare($coldb, %opts) |
279
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
280
|
|
|
|
|
|
|
## + %opts: |
281
|
|
|
|
|
|
|
## ( |
282
|
|
|
|
|
|
|
## ##-- selection parameters |
283
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
284
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
285
|
|
|
|
|
|
|
## ## |
286
|
|
|
|
|
|
|
## ##-- aggregation parameters |
287
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
288
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
289
|
|
|
|
|
|
|
## ## |
290
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
291
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
292
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
293
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
294
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
295
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
296
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (adiff|diff|sum|min|max|avg|havg); default='adiff' |
297
|
|
|
|
|
|
|
## ## |
298
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
299
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
300
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (override stringification; default=don't) |
301
|
|
|
|
|
|
|
## ## |
302
|
|
|
|
|
|
|
## ##-- sublcass abstraction parameters |
303
|
|
|
|
|
|
|
## _gbparse => $bool, ##-- if true (default), 'groupby' clause will be parsed only once, using $coldb->groupby() method |
304
|
|
|
|
|
|
|
## _abkeys => \@abkeys, ##-- additional key-suffixes KEY s.t. (KEY=>VAL) gets passed to profile() calls if e.g. (aKEY=>VAL) is in %opts |
305
|
|
|
|
|
|
|
## ) |
306
|
|
|
|
|
|
|
## + default implementation wraps profile() method |
307
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::compare() |
308
|
|
|
|
|
|
|
sub compare { |
309
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
##-- common variables |
312
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
313
|
0
|
|
0
|
|
|
|
my $groupby = $opts{groupby} || [@{$coldb->attrs}]; |
314
|
0
|
0
|
0
|
|
|
|
$groupby = $coldb->groupby($groupby) if ($opts{_gbparse}//1); |
315
|
0
|
0
|
0
|
|
|
|
my %aopts = map {exists($opts{"a$_"}) ? ($_=>$opts{"a$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
316
|
0
|
0
|
0
|
|
|
|
my %bopts = map {exists($opts{"b$_"}) ? ($_=>$opts{"b$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
317
|
0
|
|
|
|
|
|
my %popts = (kbest=>-1,cutoff=>'',global=>0,strings=>0,packed=>1,fill=>1, groupby=>$groupby); |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
##-- get profiles to compare |
320
|
0
|
0
|
|
|
|
|
my $mpa = $reldb->profile($coldb,%opts, %aopts,%popts) or return undef; |
321
|
0
|
0
|
|
|
|
|
my $mpb = $reldb->profile($coldb,%opts, %bopts,%popts) or return undef; |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
##-- alignment and trimming |
324
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "compare(): align and trim (".($opts{global} ? 'global' : 'local').")"); |
325
|
0
|
|
|
|
|
|
my $ppairs = DiaColloDB::Profile::MultiDiff->align($mpa,$mpb); |
326
|
0
|
|
|
|
|
|
DiaColloDB::Profile::MultiDiff->trimPairs($ppairs, %opts); |
327
|
0
|
|
|
|
|
|
my $diff = DiaColloDB::Profile::MultiDiff->new($mpa,$mpb, titles=>$mpa->{titles}, diff=>$opts{diff}); |
328
|
0
|
0
|
|
|
|
|
$diff->trim( DiaColloDB::Profile::Diff->diffkbest($opts{diff})=>$opts{kbest} ) if (!$opts{global}); |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
##-- finalize: stringify |
331
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
332
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
333
|
|
|
|
|
|
|
$diff->stringify($groupby->{g2s}) if (ref($groupby) && $groupby->{g2s}) |
334
|
0
|
0
|
0
|
|
|
|
} else { |
335
|
0
|
0
|
0
|
|
|
|
$diff->stringify($groupby->{g2txt}) if (ref($groupby) && $groupby->{g2txt}); |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
0
|
|
|
|
|
|
return $diff; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
## $mpdiff = $rel->diff($coldb, %opts) |
343
|
|
|
|
|
|
|
## + alias for compare() |
344
|
|
|
|
|
|
|
sub diff { |
345
|
0
|
|
|
0
|
1
|
|
my $rel = shift; |
346
|
0
|
|
|
|
|
|
return $rel->compare(@_); |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
##============================================================================== |
351
|
|
|
|
|
|
|
## Relation API: default |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
354
|
|
|
|
|
|
|
## Relation API: default: sliceN |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
## $N = $rel->sliceN($sliceBy, $dateLo) |
357
|
|
|
|
|
|
|
## + get total slice-wise co-occurrence count for a slice of size $sliceBy starting at $dateLo |
358
|
|
|
|
|
|
|
## + not called by any default methods, but useful for sub-classes |
359
|
|
|
|
|
|
|
## + default implementation is really only appropriate for Cofreqs and Unigrams relations; |
360
|
|
|
|
|
|
|
## uses $rel properties 'N', 'sizeN', 'ymin', 'rN' |
361
|
|
|
|
|
|
|
sub sliceN { |
362
|
0
|
|
|
0
|
0
|
|
my ($rel,$slice,$dlo) = @_; |
363
|
0
|
0
|
0
|
|
|
|
return $rel->{N} if ($slice==0 || !UNIVERSAL::can($rel->{rN},'fetch')); |
364
|
0
|
|
0
|
|
|
|
my $ymin = ($rel->{ymin}//0); |
365
|
0
|
|
|
|
|
|
my $rN = $rel->{rN}; |
366
|
0
|
|
0
|
|
|
|
my $ihi = min2( $dlo-$ymin+$slice, $rel->{sizeN}//$rN->size ); |
367
|
0
|
|
|
|
|
|
my $ilo = max2( $dlo-$ymin, 0); |
368
|
0
|
|
|
|
|
|
my $N = 0; |
369
|
0
|
|
|
|
|
|
for (my $i=$ilo; $i < $ihi; ++$i) { |
370
|
0
|
|
|
|
|
|
$N += $rN->fetch($i); |
371
|
|
|
|
|
|
|
} |
372
|
0
|
|
|
|
|
|
return $N; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
376
|
|
|
|
|
|
|
## Relation API: default: subprofile1 |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile1(\@tids,\%opts) |
379
|
|
|
|
|
|
|
## + get slice-wise joint frequency profile(s) for \@tids (db must be opened) |
380
|
|
|
|
|
|
|
## + %opts: as for profile(), also: |
381
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
382
|
|
|
|
|
|
|
## a2data => \%a2data, ##-- maps indexed attributes to associated data structures |
383
|
|
|
|
|
|
|
## dreq => \%dreq, ##-- parsed date request |
384
|
|
|
|
|
|
|
sub subprofile1 { |
385
|
0
|
|
|
0
|
1
|
|
my ($rel,$tids,$opts) = @_; |
386
|
0
|
|
|
|
|
|
$rel->logconfess($opts->{coldb}{error}="subprofile(): abstract method called"); |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
390
|
|
|
|
|
|
|
## Relation API: default: subprofile2 |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf,\%opts) |
393
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
394
|
|
|
|
|
|
|
## + %opts: as for subprofile1() |
395
|
|
|
|
|
|
|
## + default implementation just returns \%slice2prf |
396
|
|
|
|
|
|
|
sub subprofile2 { |
397
|
|
|
|
|
|
|
#my ($rel,$slice2prf,$opts) = @_; |
398
|
0
|
|
|
0
|
1
|
|
return $_[1]; |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
402
|
|
|
|
|
|
|
## Relation API: default: qinfo |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
405
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
406
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
407
|
|
|
|
|
|
|
## ( |
408
|
|
|
|
|
|
|
## qreqs => \@areqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
409
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
410
|
|
|
|
|
|
|
## ) |
411
|
|
|
|
|
|
|
## + returned hash \%qinfo should have keys: |
412
|
|
|
|
|
|
|
## ( |
413
|
|
|
|
|
|
|
## fcoef => $fcoef, ##-- frequency coefficient (2*$coldb->{dmax} for CoFreqs) |
414
|
|
|
|
|
|
|
## qtemplate => $qtemplate, ##-- query template with __W1.I1__ rsp __W2.I2__ replacing groupby fields |
415
|
|
|
|
|
|
|
## qcanon => $qcanon, ##-- canonical query string (after parsing) |
416
|
|
|
|
|
|
|
## ) |
417
|
|
|
|
|
|
|
sub qinfo { |
418
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
419
|
0
|
|
|
|
|
|
$rel->logconfess("qinfo(): abstract method called"); |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
## (\@q1strs,\@q2strs,\@qxstrs,\@fstrs) = $rel->qinfoData($coldb,%opts) |
423
|
|
|
|
|
|
|
## + parses @opts{qw(qreqs gbreq)} into conditions on w1, w2 and metadata filters (for ddc linkup) |
424
|
|
|
|
|
|
|
## + call this from subclass qinfo() methods |
425
|
|
|
|
|
|
|
sub qinfoData { |
426
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
427
|
0
|
|
|
|
|
|
my (@q1strs,@q2strs,@qxstrs,@fstrs,$q,$q2); |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
##-- query clause |
430
|
0
|
|
|
|
|
|
foreach (@{$opts{qreqs}}) { |
|
0
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
$q = $coldb->attrQuery(@$_); |
432
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($q,'DDC::Any::CQFilter')) { |
|
|
0
|
|
|
|
|
|
433
|
0
|
|
|
|
|
|
push(@fstrs, $q->toString); |
434
|
|
|
|
|
|
|
} |
435
|
|
|
|
|
|
|
elsif (defined($q) && !UNIVERSAL::isa($q,'DDC::Any::CQTokAny')) { |
436
|
0
|
|
|
|
|
|
push(@q1strs, $q->toString); |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
##-- groupby clause |
441
|
0
|
|
|
|
|
|
my $xi=1; |
442
|
0
|
|
|
|
|
|
foreach (@{$opts{gbreq}{areqs}}) { |
|
0
|
|
|
|
|
|
|
443
|
0
|
0
|
|
|
|
|
if ($_->[0] =~ /^doc\.(.*)/) { |
444
|
0
|
|
|
|
|
|
push(@fstrs, DDC::Any::CQFHasField->new("$1","__W2.${xi}__")->toString); |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
else { |
447
|
0
|
|
|
|
|
|
push(@q2strs, DDC::Any::CQTokExact->new($_->[0],"__W2.${xi}__")->toString); |
448
|
|
|
|
|
|
|
} |
449
|
0
|
|
|
|
|
|
++$xi; |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
##-- common restrictions (trunk/2015-10-28: these are too expensive for large corpora (timeouts): ignore 'em |
453
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$p=/$coldb->{pgood}/)) if ($coldb->{pgood}); |
454
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$=!/$coldb->{pbad}/)) if ($coldb->{pbad}); |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
##-- utf8 |
457
|
0
|
|
|
|
|
|
foreach (@q1strs,@q2strs,@qxstrs,@fstrs) { |
458
|
0
|
0
|
|
|
|
|
utf8::decode($_) if (!utf8::is_utf8($_)); |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
0
|
|
|
|
|
|
return (\@q1strs,\@q2strs,\@qxstrs,\@fstrs); |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
## $qstr_or_undef = $rel->qcanon($coldb,%opts) |
465
|
|
|
|
|
|
|
## + returns "canonical" query string for %opts |
466
|
|
|
|
|
|
|
## + default implementation uses: |
467
|
|
|
|
|
|
|
## - $opts{qcanon} if defined |
468
|
|
|
|
|
|
|
## - $opts{qobj}->toStringFull() if available |
469
|
|
|
|
|
|
|
## - undef |
470
|
|
|
|
|
|
|
sub qcanon { |
471
|
0
|
|
|
0
|
0
|
|
my ($rel,$coldb,%opts) = @_; |
472
|
0
|
|
0
|
|
|
|
my $q = $opts{qcanon} // $opts{qobj}; |
473
|
0
|
0
|
0
|
|
|
|
$q = $q->toStringFull if (ref($q) && UNIVERSAL::can($q,'toStringFull')); |
474
|
0
|
0
|
|
|
|
|
utf8::decode($q) if (!utf8::is_utf8($q)); |
475
|
0
|
|
|
|
|
|
return $q; |
476
|
|
|
|
|
|
|
} |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
##============================================================================== |
480
|
|
|
|
|
|
|
## Footer |
481
|
|
|
|
|
|
|
1; |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
__END__ |