| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Relation.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, relation API (abstract & utilities) |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Relation; |
|
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Persistent; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
32
|
|
|
8
|
1
|
|
|
1
|
|
545
|
use DiaColloDB::Profile; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
9
|
1
|
|
|
1
|
|
487
|
use DiaColloDB::Profile::Multi; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
43
|
|
|
10
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Utils qw(:si :pack :math); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
35
|
|
|
11
|
1
|
|
|
1
|
|
897
|
use Algorithm::BinarySearch::Vec qw(:api); |
|
|
1
|
|
|
|
|
5755
|
|
|
|
1
|
|
|
|
|
55
|
|
|
12
|
1
|
|
|
1
|
|
288
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
2746
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
##============================================================================== |
|
15
|
|
|
|
|
|
|
## Globals & Constants |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Persistent); |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
##============================================================================== |
|
20
|
|
|
|
|
|
|
## Constructors etc. |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## $rel = CLASS_OR_OBJECT->new(%args) |
|
23
|
|
|
|
|
|
|
## + %args, object structure: see subclases |
|
24
|
|
|
|
|
|
|
sub new { |
|
25
|
0
|
|
|
0
|
1
|
|
my ($that,%args) = @_; |
|
26
|
0
|
|
0
|
|
|
|
return bless({ %args }, ref($that)||$that); |
|
27
|
|
|
|
|
|
|
} |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
##============================================================================== |
|
30
|
|
|
|
|
|
|
## Relation API: create |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->create($coldb, $tokdat_file, %opts) |
|
33
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
|
34
|
|
|
|
|
|
|
## a tt-style text file containing with lines of the form: |
|
35
|
|
|
|
|
|
|
## TID DATE ##-- single token |
|
36
|
|
|
|
|
|
|
## "\n" ##-- blank line --> EOS |
|
37
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
|
38
|
|
|
|
|
|
|
sub create { |
|
39
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,$datfile,%opts) = @_; |
|
40
|
0
|
|
|
|
|
|
$rel->logconfess($coldb->{error}="create(): abstract method called"); |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
##============================================================================== |
|
44
|
|
|
|
|
|
|
## Relation API: union |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
|
47
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
|
48
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$argrel,\@ti2u],...) |
|
49
|
|
|
|
|
|
|
## of relation-objects $argrel and tuple-id maps \@ti2u for $rel |
|
50
|
|
|
|
|
|
|
## + %opts: clobber %$rel |
|
51
|
|
|
|
|
|
|
## + implicitly flushes the new index |
|
52
|
|
|
|
|
|
|
sub union { |
|
53
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb, $pairs,%opts) = @_; |
|
54
|
0
|
|
|
|
|
|
$rel->logconfess($coldb->{error}="union(): abstract method called"); |
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
##============================================================================== |
|
58
|
|
|
|
|
|
|
## Relation API: info |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
|
61
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
|
62
|
|
|
|
|
|
|
sub dbinfo { |
|
63
|
0
|
|
|
0
|
0
|
|
my $rel = shift; |
|
64
|
0
|
|
|
|
|
|
my $info = { class=>ref($rel) }; |
|
65
|
0
|
0
|
|
|
|
|
if ($rel->can('du')) { |
|
66
|
0
|
|
|
|
|
|
$info->{du_b} = $rel->du(); |
|
67
|
0
|
|
|
|
|
|
$info->{du_h} = si_str($info->{du_b}); |
|
68
|
|
|
|
|
|
|
} |
|
69
|
0
|
|
|
|
|
|
return $info; |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
##============================================================================== |
|
74
|
|
|
|
|
|
|
## Relation API: profiling & comparison: top-level |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
77
|
|
|
|
|
|
|
## Relation API: profile |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
## $mprf = $rel->profile($coldb, %opts) |
|
80
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
|
81
|
|
|
|
|
|
|
## + %opts: |
|
82
|
|
|
|
|
|
|
## ( |
|
83
|
|
|
|
|
|
|
## ##-- selection parameters |
|
84
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
|
85
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
|
86
|
|
|
|
|
|
|
## ## |
|
87
|
|
|
|
|
|
|
## ##-- aggregation parameters |
|
88
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
|
89
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
|
90
|
|
|
|
|
|
|
## ## |
|
91
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
|
92
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
|
93
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
|
94
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
|
95
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
|
96
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
|
97
|
|
|
|
|
|
|
## extend => \%label2gkeys, ##-- maps slice-labels to selected (packed) group-keys, for extend() method |
|
98
|
|
|
|
|
|
|
## ## |
|
99
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
|
100
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
|
101
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (default=don't) |
|
102
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
|
103
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs subclass only, >= v0.09.001) |
|
104
|
|
|
|
|
|
|
## ) |
|
105
|
|
|
|
|
|
|
## + default implementation |
|
106
|
|
|
|
|
|
|
## - parses request and extracts target tuple-ids |
|
107
|
|
|
|
|
|
|
## - calls $rel->subprofile1() to compute slice-wise joint frequency profiles (f12) |
|
108
|
|
|
|
|
|
|
## - calls $rel->subprofile2() to compute independent collocate frequencies (f2), and finally |
|
109
|
|
|
|
|
|
|
## - collects the result in a DiaColloDB::Profile::Multi object |
|
110
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::profile() |
|
111
|
|
|
|
|
|
|
sub profile { |
|
112
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
##-- common variables |
|
115
|
0
|
|
|
|
|
|
$opts{coldb} = $coldb; ##-- pass-down to subprofile() methods |
|
116
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
##-- variables: by attribute |
|
119
|
0
|
|
|
|
|
|
my $groupby= $opts{groupby} = $coldb->groupby($opts{groupby}); |
|
120
|
0
|
|
|
|
|
|
my $attrs = $coldb->attrs(); |
|
121
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData($attrs); |
|
122
|
0
|
|
|
|
|
|
my $a2data = $opts{a2data} = {map {($_->{a}=>$_)} @$adata}; |
|
|
0
|
|
|
|
|
|
|
|
123
|
0
|
|
|
|
|
|
my $areqs = $coldb->parseRequest($opts{query}, logas=>'query', default=>$attrs->[0], qref=>\$opts{qobj}); |
|
124
|
0
|
|
|
|
|
|
foreach (@$areqs) { |
|
125
|
0
|
|
|
|
|
|
$a2data->{$_->[0]}{req} = $_->[1]; |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
##-- sanity check(s) |
|
129
|
0
|
0
|
|
|
|
|
if (!@$areqs) { |
|
130
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no target attributes specified (supported attributes: ".join(' ',@{$coldb->attrs}).")"); |
|
|
0
|
|
|
|
|
|
|
|
131
|
0
|
|
|
|
|
|
return undef; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
0
|
0
|
|
|
|
|
if (!@{$groupby->{attrs}}) { |
|
|
0
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): cannot profile with empty groupby clause"); |
|
135
|
0
|
|
|
|
|
|
return undef; |
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
##-- prepare: get target IDs (by attribute) |
|
139
|
0
|
|
|
|
|
|
my ($ac); |
|
140
|
0
|
|
0
|
|
|
|
foreach $ac (grep {($_->{req}//'') ne ''} @$adata) { |
|
|
0
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
$ac->{reqids} = $coldb->enumIds($ac->{enum},$ac->{req},logLevel=>$logProfile,logPrefix=>"profile(): get target $ac->{a}-values"); |
|
142
|
0
|
0
|
|
|
|
|
if (!@{$ac->{reqids}}) { |
|
|
0
|
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="profile(): no $ac->{a}-attribute values match user query '$ac->{req}'"); |
|
144
|
0
|
|
|
|
|
|
return undef; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
##-- prepare: get tuple-ids (by attribute) |
|
149
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get target tuple IDs"); |
|
150
|
0
|
|
|
|
|
|
my $tivec = undef; |
|
151
|
0
|
|
|
|
|
|
my $nbits = undef; |
|
152
|
0
|
|
|
|
|
|
my $pack_tv = undef; |
|
153
|
0
|
|
|
|
|
|
my $test_tv = undef; ##-- test value via vec() |
|
154
|
0
|
|
|
|
|
|
foreach $ac (grep {$_->{reqids}} @$adata) { |
|
|
0
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
##-- sanity checks |
|
156
|
0
|
|
0
|
|
|
|
$nbits //= $ac->{a2t}{len_i}*8; |
|
157
|
0
|
|
0
|
|
|
|
$pack_tv //= "$ac->{a2t}{pack_i}*"; |
|
158
|
0
|
0
|
|
|
|
|
vec($test_tv='',0,$nbits) = 0x12345678 if (!defined($test_tv)); |
|
159
|
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): multimap pack-size mismatch: nbits($ac->{a2t}{base}.*) != $nbits") |
|
160
|
0
|
0
|
|
|
|
|
if ($ac->{a2t}{len_i} != $nbits/8); |
|
161
|
|
|
|
|
|
|
$reldb->logconfess($coldb->{error}="profile(): multimap pack-template '$ac->{a2t}{pack_i}' for $ac->{a2t}{base}.* is not big-endian") |
|
162
|
0
|
0
|
|
|
|
|
if (pack($ac->{a2t}{pack_i},0x12345678) ne $test_tv); |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
##-- target set construction |
|
165
|
0
|
|
|
|
|
|
my $atiset = ''; |
|
166
|
0
|
|
|
|
|
|
$atiset = vunion($atiset, $ac->{a2t}->fetchraw($_), $nbits) foreach (@{$ac->{reqids}}); |
|
|
0
|
|
|
|
|
|
|
|
167
|
0
|
0
|
|
|
|
|
$tivec = defined($tivec) ? vintersect($tivec, $atiset, $nbits) : $atiset; |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
##-- check maxExpand |
|
171
|
0
|
|
0
|
|
|
|
$nbits //= packsize($coldb->{pack_id}); |
|
172
|
0
|
0
|
|
|
|
|
my $ntis = $tivec ? length($tivec)/($nbits/8) : 0; |
|
173
|
0
|
0
|
0
|
|
|
|
if ($coldb->{maxExpand}>0 && $ntis > $coldb->{maxExpand}) { |
|
174
|
0
|
|
|
|
|
|
$reldb->logwarn("profile(): Warning: target set exceeds max expansion size ($ntis > $coldb->{maxExpand}): truncating"); |
|
175
|
0
|
|
|
|
|
|
substr($tivec, -($ntis - $coldb->{maxExpand})*($nbits/8)) = ''; |
|
176
|
|
|
|
|
|
|
} |
|
177
|
0
|
0
|
|
|
|
|
my $tis = [$tivec ? unpack($pack_tv, $tivec) : qw()]; |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
##-- parse date request (no filtering here) |
|
180
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): parse date request (date=$opts{date}, slice=$opts{slice}, fill=$opts{fill})"); |
|
181
|
0
|
|
|
|
|
|
my $dreq = $opts{dreq} = $coldb->parseDateRequest(@opts{qw(date slice fill)}); |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
##-- profile: get relation profiles (by date-slice, pass 1: f12) |
|
184
|
0
|
|
0
|
|
|
|
my $onepass = $opts{onepass} || ($reldb->can('subprofile2') eq \&subprofile2); |
|
185
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): ".($onepass ? 'single-pass' : 'pass-1')); |
|
186
|
0
|
|
|
|
|
|
my $s2prf = $reldb->subprofile1($tis, \%opts); |
|
187
|
0
|
|
|
|
|
|
foreach (keys %$s2prf) { |
|
188
|
0
|
|
|
|
|
|
@{$s2prf->{$_}}{qw(label titles)} = ($_,$groupby->{titles}); |
|
|
0
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
##-- profile/extend: insert extension keys |
|
192
|
0
|
|
|
|
|
|
my $extend = $opts{extend}; |
|
193
|
0
|
0
|
|
|
|
|
if ($extend) { |
|
194
|
0
|
|
|
|
|
|
my ($slice,$prf,$sxkeys); |
|
195
|
0
|
|
|
|
|
|
while (($slice,$prf) = each %$s2prf) { |
|
196
|
0
|
|
0
|
|
|
|
$sxkeys = $extend->{$slice}//{}; |
|
197
|
0
|
|
0
|
|
|
|
$prf->{f12}{$_} //= 0 foreach (keys %$sxkeys); |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
} |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
##-- profile: complete slice-wise profiles (pass 2: f2) |
|
202
|
0
|
0
|
0
|
|
|
|
if (!$onepass || !$opts{onepass}) { |
|
203
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): get frequency profile(s): pass-2"); |
|
204
|
0
|
|
|
|
|
|
$reldb->subprofile2($s2prf, \%opts); |
|
205
|
|
|
|
|
|
|
} |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
##-- compile & collect: multi-profile |
|
208
|
0
|
|
|
|
|
|
foreach (values %$s2prf) { |
|
209
|
0
|
|
|
|
|
|
$_->compile($opts{score}, eps=>$opts{eps}); |
|
210
|
|
|
|
|
|
|
} |
|
211
|
0
|
|
|
|
|
|
my $mp = DiaColloDB::Profile::Multi->new(profiles=>[@$s2prf{sort {$a<=>$b} keys %$s2prf}], |
|
212
|
|
|
|
|
|
|
titles=>$groupby->{titles}, |
|
213
|
0
|
|
|
|
|
|
qinfo =>$reldb->qinfo($coldb, %opts, qreqs=>$areqs, gbreq=>$groupby), |
|
214
|
|
|
|
|
|
|
); |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
##-- trim and stringify |
|
217
|
0
|
|
|
|
|
|
$reldb->vlog($logProfile, "profile(): trim and stringify"); |
|
218
|
0
|
|
|
|
|
|
$mp->trim(%opts, empty=>!$opts{fill}); |
|
219
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
|
220
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
|
221
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2s}); |
|
222
|
|
|
|
|
|
|
} else { |
|
223
|
0
|
|
|
|
|
|
$mp->stringify($groupby->{g2txt}); |
|
224
|
|
|
|
|
|
|
} |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
##-- return |
|
228
|
0
|
|
|
|
|
|
return $mp; |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
232
|
|
|
|
|
|
|
## Relation API: extend (pass-2 for multi-clients) |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
## $mprf = $rel->extend($coldb, %opts) |
|
235
|
|
|
|
|
|
|
## + extend f12 and f2 frequencies for \%slice2keys = $opts{slice2keys} |
|
236
|
|
|
|
|
|
|
## + calls $rel->profile($coldb, %opts,extend=>\%slice2keys_packed) |
|
237
|
|
|
|
|
|
|
## + returns a DiaColloDB::Profile::Multi containing the appropriate f12 and f2 entries |
|
238
|
|
|
|
|
|
|
sub extend { |
|
239
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
##-- common variables |
|
242
|
0
|
|
|
|
|
|
$opts{coldb} = $coldb; ##-- pass-down to subprofile() methods |
|
243
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
##-- sanity check(s) |
|
246
|
0
|
|
0
|
|
|
|
my $slice2keys = $opts{slice2keys} || $opts{extend}; |
|
247
|
0
|
0
|
|
|
|
|
if (!$slice2keys) { |
|
|
|
0
|
|
|
|
|
|
|
248
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="extend(): no 'slice2keys' or 'extend' parameter specified!"); |
|
249
|
0
|
|
|
|
|
|
return undef; |
|
250
|
|
|
|
|
|
|
} |
|
251
|
|
|
|
|
|
|
elsif (!UNIVERSAL::isa($slice2keys,'HASH')) { |
|
252
|
0
|
|
|
|
|
|
$reldb->logwarn($coldb->{error}="extend(): failed to parse 'slice2keys' or 'extend' parameter"); |
|
253
|
0
|
|
|
|
|
|
return undef; |
|
254
|
|
|
|
|
|
|
} |
|
255
|
0
|
|
|
|
|
|
delete $opts{slice2keys}; |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
##-- get packed group-keys (avoid temporary dummy-profiles: they can't handle unknown group-components) |
|
258
|
0
|
|
|
|
|
|
my $groupby = $opts{groupby} = $coldb->groupby($opts{groupby}); |
|
259
|
0
|
|
|
|
|
|
my $s2gx = $groupby->{s2gx}; |
|
260
|
0
|
|
|
|
|
|
my ($xslice,$xkeys, $xgkeys,$xkey,$xg, %extend); |
|
261
|
0
|
|
|
|
|
|
while (($xslice,$xkeys) = each %$slice2keys) { |
|
262
|
0
|
|
|
|
|
|
$xgkeys = $extend{$xslice} = {}; |
|
263
|
0
|
0
|
|
|
|
|
foreach $xkey (UNIVERSAL::isa($xkeys,'HASH') ? keys(%$xkeys) : @$xkeys) { |
|
264
|
0
|
0
|
|
|
|
|
next if (!defined($xg = $s2gx->($xkey))); |
|
265
|
0
|
|
|
|
|
|
$xgkeys->{$xg} = undef; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
##-- guts: dispatch to profile() |
|
270
|
0
|
|
|
|
|
|
my $mp = $reldb->profile($coldb, %opts, kbest=>0,kbesta=>0,cutoff=>undef,global=>0,fill=>1, extend=>\%extend); |
|
271
|
|
|
|
|
|
|
|
|
272
|
0
|
|
|
|
|
|
return $mp; |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
276
|
|
|
|
|
|
|
## Relation API: comparison (diff) |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
## $mpdiff = $rel->compare($coldb, %opts) |
|
279
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
|
280
|
|
|
|
|
|
|
## + %opts: |
|
281
|
|
|
|
|
|
|
## ( |
|
282
|
|
|
|
|
|
|
## ##-- selection parameters |
|
283
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
|
284
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
|
285
|
|
|
|
|
|
|
## ## |
|
286
|
|
|
|
|
|
|
## ##-- aggregation parameters |
|
287
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
|
288
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
|
289
|
|
|
|
|
|
|
## ## |
|
290
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
|
291
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
|
292
|
|
|
|
|
|
|
## score => $func, ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" |
|
293
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
|
294
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
|
295
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
|
296
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (adiff|diff|sum|min|max|avg|havg); default='adiff' |
|
297
|
|
|
|
|
|
|
## ## |
|
298
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
|
299
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify item keys (default=do) |
|
300
|
|
|
|
|
|
|
## packed => $bool, ##-- leave item keys packed (override stringification; default=don't) |
|
301
|
|
|
|
|
|
|
## ## |
|
302
|
|
|
|
|
|
|
## ##-- sublcass abstraction parameters |
|
303
|
|
|
|
|
|
|
## _gbparse => $bool, ##-- if true (default), 'groupby' clause will be parsed only once, using $coldb->groupby() method |
|
304
|
|
|
|
|
|
|
## _abkeys => \@abkeys, ##-- additional key-suffixes KEY s.t. (KEY=>VAL) gets passed to profile() calls if e.g. (aKEY=>VAL) is in %opts |
|
305
|
|
|
|
|
|
|
## ) |
|
306
|
|
|
|
|
|
|
## + default implementation wraps profile() method |
|
307
|
|
|
|
|
|
|
## + default values for %opts should be set by higher-level call, e.g. DiaColloDB::compare() |
|
308
|
|
|
|
|
|
|
sub compare { |
|
309
|
0
|
|
|
0
|
1
|
|
my ($reldb,$coldb,%opts) = @_; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
##-- common variables |
|
312
|
0
|
|
|
|
|
|
my $logProfile = $coldb->{logProfile}; |
|
313
|
0
|
|
0
|
|
|
|
my $groupby = $opts{groupby} || [@{$coldb->attrs}]; |
|
314
|
0
|
0
|
0
|
|
|
|
$groupby = $coldb->groupby($groupby) if ($opts{_gbparse}//1); |
|
315
|
0
|
0
|
0
|
|
|
|
my %aopts = map {exists($opts{"a$_"}) ? ($_=>$opts{"a$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
316
|
0
|
0
|
0
|
|
|
|
my %bopts = map {exists($opts{"b$_"}) ? ($_=>$opts{"b$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
317
|
0
|
|
|
|
|
|
my %popts = (kbest=>-1,cutoff=>'',global=>0,strings=>0,packed=>1,fill=>1, groupby=>$groupby); |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
##-- get profiles to compare |
|
320
|
0
|
0
|
|
|
|
|
my $mpa = $reldb->profile($coldb,%opts, %aopts,%popts) or return undef; |
|
321
|
0
|
0
|
|
|
|
|
my $mpb = $reldb->profile($coldb,%opts, %bopts,%popts) or return undef; |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
##-- alignment and trimming |
|
324
|
0
|
0
|
|
|
|
|
$reldb->vlog($logProfile, "compare(): align and trim (".($opts{global} ? 'global' : 'local').")"); |
|
325
|
0
|
|
|
|
|
|
my $ppairs = DiaColloDB::Profile::MultiDiff->align($mpa,$mpb); |
|
326
|
0
|
|
|
|
|
|
DiaColloDB::Profile::MultiDiff->trimPairs($ppairs, %opts); |
|
327
|
0
|
|
|
|
|
|
my $diff = DiaColloDB::Profile::MultiDiff->new($mpa,$mpb, titles=>$mpa->{titles}, diff=>$opts{diff}); |
|
328
|
0
|
0
|
|
|
|
|
$diff->trim( DiaColloDB::Profile::Diff->diffkbest($opts{diff})=>$opts{kbest} ) if (!$opts{global}); |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
##-- finalize: stringify |
|
331
|
0
|
0
|
|
|
|
|
if (!$opts{packed}) { |
|
332
|
0
|
0
|
0
|
|
|
|
if ($opts{strings}//1) { |
|
333
|
|
|
|
|
|
|
$diff->stringify($groupby->{g2s}) if (ref($groupby) && $groupby->{g2s}) |
|
334
|
0
|
0
|
0
|
|
|
|
} else { |
|
335
|
0
|
0
|
0
|
|
|
|
$diff->stringify($groupby->{g2txt}) if (ref($groupby) && $groupby->{g2txt}); |
|
336
|
|
|
|
|
|
|
} |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
|
|
339
|
0
|
|
|
|
|
|
return $diff; |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
## $mpdiff = $rel->diff($coldb, %opts) |
|
343
|
|
|
|
|
|
|
## + alias for compare() |
|
344
|
|
|
|
|
|
|
sub diff { |
|
345
|
0
|
|
|
0
|
1
|
|
my $rel = shift; |
|
346
|
0
|
|
|
|
|
|
return $rel->compare(@_); |
|
347
|
|
|
|
|
|
|
} |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
##============================================================================== |
|
351
|
|
|
|
|
|
|
## Relation API: default |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
354
|
|
|
|
|
|
|
## Relation API: default: sliceN |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
## $N = $rel->sliceN($sliceBy, $dateLo) |
|
357
|
|
|
|
|
|
|
## + get total slice-wise co-occurrence count for a slice of size $sliceBy starting at $dateLo |
|
358
|
|
|
|
|
|
|
## + not called by any default methods, but useful for sub-classes |
|
359
|
|
|
|
|
|
|
## + default implementation is really only appropriate for Cofreqs and Unigrams relations; |
|
360
|
|
|
|
|
|
|
## uses $rel properties 'N', 'sizeN', 'ymin', 'rN' |
|
361
|
|
|
|
|
|
|
sub sliceN { |
|
362
|
0
|
|
|
0
|
0
|
|
my ($rel,$slice,$dlo) = @_; |
|
363
|
0
|
0
|
0
|
|
|
|
return $rel->{N} if ($slice==0 || !UNIVERSAL::can($rel->{rN},'fetch')); |
|
364
|
0
|
|
0
|
|
|
|
my $ymin = ($rel->{ymin}//0); |
|
365
|
0
|
|
|
|
|
|
my $rN = $rel->{rN}; |
|
366
|
0
|
|
0
|
|
|
|
my $ihi = min2( $dlo-$ymin+$slice, $rel->{sizeN}//$rN->size ); |
|
367
|
0
|
|
|
|
|
|
my $ilo = max2( $dlo-$ymin, 0); |
|
368
|
0
|
|
|
|
|
|
my $N = 0; |
|
369
|
0
|
|
|
|
|
|
for (my $i=$ilo; $i < $ihi; ++$i) { |
|
370
|
0
|
|
|
|
|
|
$N += $rN->fetch($i); |
|
371
|
|
|
|
|
|
|
} |
|
372
|
0
|
|
|
|
|
|
return $N; |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
376
|
|
|
|
|
|
|
## Relation API: default: subprofile1 |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile1(\@tids,\%opts) |
|
379
|
|
|
|
|
|
|
## + get slice-wise joint frequency profile(s) for \@tids (db must be opened) |
|
380
|
|
|
|
|
|
|
## + %opts: as for profile(), also: |
|
381
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
|
382
|
|
|
|
|
|
|
## a2data => \%a2data, ##-- maps indexed attributes to associated data structures |
|
383
|
|
|
|
|
|
|
## dreq => \%dreq, ##-- parsed date request |
|
384
|
|
|
|
|
|
|
sub subprofile1 { |
|
385
|
0
|
|
|
0
|
1
|
|
my ($rel,$tids,$opts) = @_; |
|
386
|
0
|
|
|
|
|
|
$rel->logconfess($opts->{coldb}{error}="subprofile(): abstract method called"); |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
390
|
|
|
|
|
|
|
## Relation API: default: subprofile2 |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf,\%opts) |
|
393
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
|
394
|
|
|
|
|
|
|
## + %opts: as for subprofile1() |
|
395
|
|
|
|
|
|
|
## + default implementation just returns \%slice2prf |
|
396
|
|
|
|
|
|
|
sub subprofile2 { |
|
397
|
|
|
|
|
|
|
#my ($rel,$slice2prf,$opts) = @_; |
|
398
|
0
|
|
|
0
|
1
|
|
return $_[1]; |
|
399
|
|
|
|
|
|
|
} |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
402
|
|
|
|
|
|
|
## Relation API: default: qinfo |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
|
405
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
|
406
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
|
407
|
|
|
|
|
|
|
## ( |
|
408
|
|
|
|
|
|
|
## qreqs => \@areqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
|
409
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
|
410
|
|
|
|
|
|
|
## ) |
|
411
|
|
|
|
|
|
|
## + returned hash \%qinfo should have keys: |
|
412
|
|
|
|
|
|
|
## ( |
|
413
|
|
|
|
|
|
|
## fcoef => $fcoef, ##-- frequency coefficient (2*$coldb->{dmax} for CoFreqs) |
|
414
|
|
|
|
|
|
|
## qtemplate => $qtemplate, ##-- query template with __W1.I1__ rsp __W2.I2__ replacing groupby fields |
|
415
|
|
|
|
|
|
|
## qcanon => $qcanon, ##-- canonical query string (after parsing) |
|
416
|
|
|
|
|
|
|
## ) |
|
417
|
|
|
|
|
|
|
sub qinfo { |
|
418
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
|
419
|
0
|
|
|
|
|
|
$rel->logconfess("qinfo(): abstract method called"); |
|
420
|
|
|
|
|
|
|
} |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
## (\@q1strs,\@q2strs,\@qxstrs,\@fstrs) = $rel->qinfoData($coldb,%opts) |
|
423
|
|
|
|
|
|
|
## + parses @opts{qw(qreqs gbreq)} into conditions on w1, w2 and metadata filters (for ddc linkup) |
|
424
|
|
|
|
|
|
|
## + call this from subclass qinfo() methods |
|
425
|
|
|
|
|
|
|
sub qinfoData { |
|
426
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
|
427
|
0
|
|
|
|
|
|
my (@q1strs,@q2strs,@qxstrs,@fstrs,$q,$q2); |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
##-- query clause |
|
430
|
0
|
|
|
|
|
|
foreach (@{$opts{qreqs}}) { |
|
|
0
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
$q = $coldb->attrQuery(@$_); |
|
432
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($q,'DDC::Any::CQFilter')) { |
|
|
|
0
|
|
|
|
|
|
|
433
|
0
|
|
|
|
|
|
push(@fstrs, $q->toString); |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
elsif (defined($q) && !UNIVERSAL::isa($q,'DDC::Any::CQTokAny')) { |
|
436
|
0
|
|
|
|
|
|
push(@q1strs, $q->toString); |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
##-- groupby clause |
|
441
|
0
|
|
|
|
|
|
my $xi=1; |
|
442
|
0
|
|
|
|
|
|
foreach (@{$opts{gbreq}{areqs}}) { |
|
|
0
|
|
|
|
|
|
|
|
443
|
0
|
0
|
|
|
|
|
if ($_->[0] =~ /^doc\.(.*)/) { |
|
444
|
0
|
|
|
|
|
|
push(@fstrs, DDC::Any::CQFHasField->new("$1","__W2.${xi}__")->toString); |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
else { |
|
447
|
0
|
|
|
|
|
|
push(@q2strs, DDC::Any::CQTokExact->new($_->[0],"__W2.${xi}__")->toString); |
|
448
|
|
|
|
|
|
|
} |
|
449
|
0
|
|
|
|
|
|
++$xi; |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
##-- common restrictions (trunk/2015-10-28: these are too expensive for large corpora (timeouts): ignore 'em |
|
453
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$p=/$coldb->{pgood}/)) if ($coldb->{pgood}); |
|
454
|
|
|
|
|
|
|
#push(@qxstrs, qq(\$=!/$coldb->{pbad}/)) if ($coldb->{pbad}); |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
##-- utf8 |
|
457
|
0
|
|
|
|
|
|
foreach (@q1strs,@q2strs,@qxstrs,@fstrs) { |
|
458
|
0
|
0
|
|
|
|
|
utf8::decode($_) if (!utf8::is_utf8($_)); |
|
459
|
|
|
|
|
|
|
} |
|
460
|
|
|
|
|
|
|
|
|
461
|
0
|
|
|
|
|
|
return (\@q1strs,\@q2strs,\@qxstrs,\@fstrs); |
|
462
|
|
|
|
|
|
|
} |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
## $qstr_or_undef = $rel->qcanon($coldb,%opts) |
|
465
|
|
|
|
|
|
|
## + returns "canonical" query string for %opts |
|
466
|
|
|
|
|
|
|
## + default implementation uses: |
|
467
|
|
|
|
|
|
|
## - $opts{qcanon} if defined |
|
468
|
|
|
|
|
|
|
## - $opts{qobj}->toStringFull() if available |
|
469
|
|
|
|
|
|
|
## - undef |
|
470
|
|
|
|
|
|
|
sub qcanon { |
|
471
|
0
|
|
|
0
|
0
|
|
my ($rel,$coldb,%opts) = @_; |
|
472
|
0
|
|
0
|
|
|
|
my $q = $opts{qcanon} // $opts{qobj}; |
|
473
|
0
|
0
|
0
|
|
|
|
$q = $q->toStringFull if (ref($q) && UNIVERSAL::can($q,'toStringFull')); |
|
474
|
0
|
0
|
|
|
|
|
utf8::decode($q) if (!utf8::is_utf8($q)); |
|
475
|
0
|
|
|
|
|
|
return $q; |
|
476
|
|
|
|
|
|
|
} |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
##============================================================================== |
|
480
|
|
|
|
|
|
|
## Footer |
|
481
|
|
|
|
|
|
|
1; |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
__END__ |