line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## |
3
|
|
|
|
|
|
|
## File: Compat::v0_09::DiaColloDB.pm |
4
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
5
|
|
|
|
|
|
|
## Description: collocation db, top-level: v0.09.x compatibility hack |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::DiaColloDB; |
8
|
1
|
|
|
1
|
|
8
|
use DiaColloDB; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
9
|
1
|
|
|
1
|
|
5
|
use DiaColloDB::Compat::v0_09::Relation; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
19
|
|
10
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Compat::v0_09::Relation::Unigrams; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
18
|
|
11
|
1
|
|
|
1
|
|
4
|
use DiaColloDB::Compat::v0_09::Relation::Cofreqs; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
37
|
|
12
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Utils qw(:math :fcntl :json :sort :pack :regex :file :si :run :env :temp); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
13
|
1
|
|
|
1
|
|
541
|
use DDC::Any; ##-- for query parsing |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
9
|
|
14
|
1
|
|
|
1
|
|
63
|
use File::Path qw(make_path remove_tree); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
15
|
1
|
|
|
1
|
|
131
|
use Fcntl; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
78
|
|
16
|
1
|
|
|
1
|
|
247
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
1553
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##============================================================================== |
19
|
|
|
|
|
|
|
## Globals & Constants |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
##-- inheritance: current top-level DiaColloDB API first, then the generic compatibility base
our @ISA = ('DiaColloDB', 'DiaColloDB::Compat');

## $ECLASS  : enum class             (non-mmap alternative: 'DiaColloDB::EnumFile')
## $XECLASS : fixed-length enum class (non-mmap alternative: 'DiaColloDB::EnumFile::FixedLen')
## $MMCLASS : multimap class          (non-mmap alternative: 'DiaColloDB::MultiMapFile')
##  + class names are used as invocants for ->new() when a database is opened
our ($ECLASS, $XECLASS, $MMCLASS) = (
  'DiaColloDB::EnumFile::MMap',
  'DiaColloDB::EnumFile::FixedLen::MMap',
  'DiaColloDB::MultiMapFile::MMap',
);
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
##============================================================================== |
39
|
|
|
|
|
|
|
## Constructors etc. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
## $coldb = CLASS_OR_OBJECT->new(%args) |
42
|
|
|
|
|
|
|
## + %args, object structure: |
43
|
|
|
|
|
|
|
## ( |
44
|
|
|
|
|
|
|
## ##-- options |
45
|
|
|
|
|
|
|
## dbdir => $dbdir, ##-- database directory; REQUIRED |
46
|
|
|
|
|
|
|
## flags => $fcflags, ##-- fcntl flags or open()-style mode string; default='r' |
47
|
|
|
|
|
|
|
## attrs => \@attrs, ##-- index attributes (input as space-separated or array; compiled to array); default=undef (==>['l']) |
48
|
|
|
|
|
|
|
## ## + each attribute can be token-attribute qw(w p l) or a document metadata attribute "doc.ATTR" |
49
|
|
|
|
|
|
|
## ## + document "date" attribute is always indexed |
50
|
|
|
|
|
|
|
## info => \%info, ##-- additional data to return in info() method (e.g. collection, maintainer) |
51
|
|
|
|
|
|
|
## #bos => $bos, ##-- special string to use for BOS, undef or empty for none (default=undef) DISABLED |
52
|
|
|
|
|
|
|
## #eos => $eos, ##-- special string to use for EOS, undef or empty for none (default=undef) DISABLED |
53
|
|
|
|
|
|
|
## pack_id => $fmt, ##-- pack-format for IDs (default='N') |
54
|
|
|
|
|
|
|
## pack_f => $fmt, ##-- pack-format for frequencies (default='N') |
55
|
|
|
|
|
|
|
## pack_date => $fmt, ##-- pack-format for dates (default='n') |
56
|
|
|
|
|
|
|
## pack_off => $fmt, ##-- pack-format for file offsets (default='N') |
57
|
|
|
|
|
|
|
## pack_len => $fmt, ##-- pack-format for string lengths (default='n') |
58
|
|
|
|
|
|
|
## dmax => $dmax, ##-- maximum distance for collocation-frequencies and implicit ddc near() queries (default=5) |
59
|
|
|
|
|
|
|
## cfmin => $cfmin, ##-- minimum co-occurrence frequency for Cofreqs and ddc queries (default=2) |
60
|
|
|
|
|
|
|
## tfmin => $tfmin, ##-- minimum global term-frequency WITHOUT date component (default=2) |
61
|
|
|
|
|
|
|
## fmin_${a} => $fmin, ##-- minimum independent frequency for value of attribute ${a} (default=undef:from $tfmin) |
62
|
|
|
|
|
|
|
## keeptmp => $bool, ##-- keep temporary files? (default=0) |
63
|
|
|
|
|
|
|
## index_tdf => $bool, ##-- tdf: create/use (term x document) frequency matrix index? (default=undef: if available) |
64
|
|
|
|
|
|
|
## index_cof => $bool, ##-- cof: create/use co-frequency index (default=1) |
65
|
|
|
|
|
|
|
## dbreak => $dbreak, ##-- tdf: use break-type $dbreak for tdf index (default=undef: files) |
66
|
|
|
|
|
|
|
## tdfopts=>\%tdfopts, ##-- tdf: options for DiaColloDB::Relation::TDF->new(); default=undef (all inherited from %TDF_OPTS) |
67
|
|
|
|
|
|
|
## ## |
68
|
|
|
|
|
|
|
## ##-- runtime ddc relation options |
69
|
|
|
|
|
|
|
## ddcServer => "$host:$port", ##-- server for ddc relation |
70
|
|
|
|
|
|
|
## ddcTimeout => $seconds, ##-- timeout for ddc relation |
71
|
|
|
|
|
|
|
## ## |
72
|
|
|
|
|
|
|
## ##-- source filtering (for create()) |
73
|
|
|
|
|
|
|
## pgood => $regex, ##-- positive filter regex for part-of-speech tags |
74
|
|
|
|
|
|
|
## pbad => $regex, ##-- negative filter regex for part-of-speech tags |
75
|
|
|
|
|
|
|
## wgood => $regex, ##-- positive filter regex for word text |
76
|
|
|
|
|
|
|
## wbad => $regex, ##-- negative filter regex for word text |
77
|
|
|
|
|
|
|
## lgood => $regex, ##-- positive filter regex for lemma text |
78
|
|
|
|
|
|
|
## lbad => $regex, ##-- negative filter regex for lemma text |
79
|
|
|
|
|
|
|
## ## |
80
|
|
|
|
|
|
|
## ##-- logging |
81
|
|
|
|
|
|
|
## logOpen => $level, ##-- log-level for open/close (default='info') |
82
|
|
|
|
|
|
|
## logCreate => $level, ##-- log-level for create messages (default='info') |
83
|
|
|
|
|
|
|
## logCorpusFile => $level, ##-- log-level for corpus file-parsing (default='info') |
84
|
|
|
|
|
|
|
## logCorpusFileN => $N, ##-- log corpus file-parsing only for every N files (0 for none; default:undef ~ $corpus->size()/100) |
85
|
|
|
|
|
|
|
## logExport => $level, ##-- log-level for export messages (default='info') |
86
|
|
|
|
|
|
|
## logProfile => $level, ##-- log-level for verbose profiling messages (default='trace') |
87
|
|
|
|
|
|
|
## logRequest => $level, ##-- log-level for request-level profiling messages (default='debug') |
88
|
|
|
|
|
|
|
## ## |
89
|
|
|
|
|
|
|
## ##-- runtime limits |
90
|
|
|
|
|
|
|
## maxExpand => $size, ##-- maximum number of elements in query expansions (default=65535) |
91
|
|
|
|
|
|
|
## ## |
92
|
|
|
|
|
|
|
## ##-- administrivia |
93
|
|
|
|
|
|
|
## version=>$version, ##-- DiaColloDB version of stored db (==$DiaColloDB::VERSION) |
94
|
|
|
|
|
|
|
## upgraded=>\@upgraded, ##-- optional administrative information about auto-magic upgrades |
95
|
|
|
|
|
|
|
## ## |
96
|
|
|
|
|
|
|
## ##-- attribute data |
97
|
|
|
|
|
|
|
## ${a}enum => $aenum, ##-- attribute enum: $aenum : ($dbdir/${a}_enum.*) : $astr<=>$ai : A*<=>N |
98
|
|
|
|
|
|
|
## ## e.g. lemmata: $lenum : ($dbdir/l_enum.* ) : $lstr<=>$li : A*<=>N |
99
|
|
|
|
|
|
|
## ${a}2x => $a2x, ##-- attribute multimap: $a2x : ($dbdir/${a}_2x.*) : $ai=>@xis : N=>N* |
100
|
|
|
|
|
|
|
## pack_x$a => $fmt ##-- pack format: extract attribute-id $ai from a packed tuple-string $xs ; $ai=unpack($coldb->{"pack_x$a"},$xs) |
101
|
|
|
|
|
|
|
## ## |
102
|
|
|
|
|
|
|
## ##-- tuple data (+dates) |
103
|
|
|
|
|
|
|
## xenum => $xenum, ##-- enum: tuples ($dbdir/xenum.*) : [@ais,$di]<=>$xi : N*n<=>N |
104
|
|
|
|
|
|
|
## pack_x => $fmt, ##-- symbol pack-format for $xenum : "${pack_id}[Nattrs]${pack_date}" |
105
|
|
|
|
|
|
|
## xdmin => $xdmin, ##-- minimum date |
106
|
|
|
|
|
|
|
## xdmax => $xdmax, ##-- maximum date |
107
|
|
|
|
|
|
|
## ## |
108
|
|
|
|
|
|
|
## ##-- relation data |
109
|
|
|
|
|
|
|
## xf => $xf, ##-- ug: $xi => $f($xi) : N=>N |
110
|
|
|
|
|
|
|
## cof => $cof, ##-- cf: [$xi1,$xi2] => $f12 |
111
|
|
|
|
|
|
|
## ddc => $ddc, ##-- ddc: ddc client relation |
112
|
|
|
|
|
|
|
## tdf => $tdf, ##-- tdf: (term x document) frequency matrix relation |
113
|
|
|
|
|
|
|
## ) |
114
|
|
|
|
|
|
|
## $coldb = CLASS_OR_OBJECT->new(%args)
##  + builds a blessed hash from the built-in defaults below, overridden by %args
##  + {class}, {pack_w} and {pack_x} are always (re-)derived after merging,
##    so user-supplied values for these keys are clobbered
##  + if a defined 'dbdir' was passed, it is removed from the object and
##    the result of $coldb->open($dbdir) is returned instead
sub new {
  my ($that, @user) = @_;

  my %defaults = (
    ##-- options
    dbdir     => undef,
    flags     => 'r',
    attrs     => undef,
    pack_id   => 'N',
    pack_f    => 'N',
    pack_date => 'n',
    pack_off  => 'N',
    pack_len  => 'n',
    dmax      => 5,
    cfmin     => 2,
    tfmin     => 2,
    index_tdf => undef,
    index_cof => 1,
    dbreak    => undef,
    tdfopts   => {},

    ##-- filters
    pgood => $DiaColloDB::PGOOD_DEFAULT,
    pbad  => $DiaColloDB::PBAD_DEFAULT,
    wgood => $DiaColloDB::WGOOD_DEFAULT,
    wbad  => $DiaColloDB::WBAD_DEFAULT,
    lgood => $DiaColloDB::LGOOD_DEFAULT,
    lbad  => $DiaColloDB::LBAD_DEFAULT,

    ##-- logging
    logOpen        => 'info',
    logCreate      => 'info',
    logCorpusFile  => 'info',
    logCorpusFileN => undef,
    logExport      => 'info',
    logProfile     => 'trace',
    logRequest     => 'debug',

    ##-- limits
    maxExpand => 65535,

    ##-- administrivia
    version => "v0.09.000",
  );

  ##-- user arguments come last so that they win over the defaults
  my $coldb = bless({ %defaults, @user }, ref($that) || $that);

  ##-- derived fields
  $coldb->{class}  = ref($coldb);
  $coldb->{pack_w} = $coldb->{pack_id};
  $coldb->{pack_x} = $coldb->{pack_w} . $coldb->{pack_date};

  ##-- no dbdir given: hand back the un-opened object
  return $coldb if (!defined($coldb->{dbdir}));

  ##-- dbdir given: drop it from the object before open() to avoid an initial close()
  my $dbdir = delete $coldb->{dbdir};
  return $coldb->open($dbdir);
}
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
## undef = $obj->DESTROY |
194
|
|
|
|
|
|
|
## + destructor calls close() if necessary |
195
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
## $cli_or_undef = $cli->promote($class,%opts) |
198
|
|
|
|
|
|
|
## + DiaColloDB::Client method override |
199
|
|
|
|
|
|
|
## $cli_or_undef = $cli->promote($class,%opts)
##  + not available for v0.09.x compatibility objects:
##    unconditionally reports the error via the invocant's logconfess()
sub promote {
  my $cli = $_[0];
  return $cli->logconfess("promote(): not supported");
}
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
##============================================================================== |
204
|
|
|
|
|
|
|
## I/O: open/close |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open($dbdir,%opts) |
207
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open() |
208
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open($dbdir,%opts)
## $coldb_or_undef = $coldb->open()
##  + opens the v0.09.x database directory $dbdir (default: $coldb->{dbdir})
##  + may be called as a class method, in which case a new object is created first
##  + an already-opened object is close()d before re-opening
##  + %opts are copied into $coldb AFTER the header has been loaded, so they
##    override values stored in the header; $opts{flags} (fcntl flags or mode string)
##    is additionally honoured before the header is read
##  + sub-objects populated here: "${attr}enum", "${attr}2x", "pack_x${attr}" for each
##    attribute, plus xenum, xf, cof (unless index_cof is false), ddc, and tdf (if enabled)
##  + errors are reported via $coldb->logconfess() (presumably fatal; verify against the logger)
##  + returns $coldb on success
sub open {
  my ($coldb,$dbdir,%opts) = @_;
  DiaColloDB::Logger->ensureLog();
  $coldb = $coldb->new() if (!ref($coldb));
  #@$coldb{keys %opts} = values %opts; ##-- clobber options after loadHeader()
  $dbdir //= $coldb->{dbdir};
  $dbdir =~ s{/$}{};
  $coldb->close() if ($coldb->opened);
  $coldb->{dbdir} = $dbdir;
  my $flags = fcflags($opts{flags} // $coldb->{flags});
  $coldb->vlog($coldb->{logOpen}, "open($dbdir)");

  ##-- open: truncate: remove any existing directory and force creation
  if (fctrunc($flags)) {
    $flags |= O_CREAT;
    !-d $dbdir
      or remove_tree($dbdir)
      or $coldb->logconfess("open(): could not remove old $dbdir: $!");
  }

  ##-- open: create: a missing directory is only acceptable in create-mode
  if (!-d $dbdir) {
    $coldb->logconfess("open(): no such directory '$dbdir'") if (!fccreat($flags));
    make_path($dbdir)
      or $coldb->logconfess("open(): could not create DB directory '$dbdir': $!");
  }

  ##-- open: header
  ##  + message chunks are passed as a list: keep an explicit separator before the file name
  $coldb->loadHeader()
    or $coldb->logconfess("open(): failed to load header file ", $coldb->headerFile, ": $!");
  @$coldb{keys %opts} = values %opts; ##-- clobber header options with user-supplied values

  ##-- open: tdf: require
  ##  + tdf is disabled outright if no readable tdf.hdr exists
  ##  + require() throws on failure rather than returning false, so the exception
  ##    must be trapped with eval BLOCK for the graceful fallback to be reachable
  $coldb->{index_tdf} = 0 if (!-r "$dbdir/tdf.hdr");
  if ($coldb->{index_tdf}) {
    if (!eval { require "DiaColloDB/Relation/TDF.pm"; 1 }) {
      $coldb->logwarn("open(): require failed for DiaColloDB/Relation/TDF.pm ; (term x document) matrix modelling disabled", ($@ ? "\n: $@" : ''));
      $coldb->{index_tdf} = 0;
    }
  }

  ##-- open: common options
  my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len});
  my %mmopts = (flags=>$flags, pack_i=>$coldb->{pack_id});

  ##-- open: attributes (default: lemma only)
  my $attrs = $coldb->{attrs} = $coldb->attrs(undef,['l']);

  ##-- open: by attribute
  ##  + $axat tracks the byte offset of the current attribute-id within a packed tuple
  my $axat = 0;
  foreach my $attr (@$attrs) {
    ##-- open: ${attr}*
    my $abase = (-r "$dbdir/${attr}_enum.hdr" ? "$dbdir/${attr}_" : "$dbdir/${attr}"); ##-- v0.03-compatibility hack
    $coldb->{"${attr}enum"} = $ECLASS->new(base=>"${abase}enum", %efopts)
      or $coldb->logconfess("open(): failed to open enum ${abase}enum.*: $!");
    $coldb->{"${attr}2x"} = $MMCLASS->new(base=>"${abase}2x", %mmopts)
      or $coldb->logconfess("open(): failed to open expansion multimap ${abase}2x.*: $!");
    $coldb->{"pack_x$attr"} //= "\@${axat}$coldb->{pack_id}";
    $axat += packsize($coldb->{pack_id});
  }

  ##-- open: xenum
  $coldb->{xenum} = $XECLASS->new(base=>"$dbdir/xenum", %efopts, pack_s=>$coldb->{pack_x})
      or $coldb->logconfess("open(): failed to open tuple-enum $dbdir/xenum.*: $!");
  if (!defined($coldb->{xdmin}) || !defined($coldb->{xdmax})) {
    ##-- hack: guess date-range if not specified
    ##  + the date is unpacked from the offset following all attribute-ids in each tuple
    $coldb->vlog('warn', "Warning: extracting date-range from xenum: you should update $coldb->{dbdir}/header.json");
    my $pack_xdate = '@'.(packsize($coldb->{pack_id}) * scalar(@{$coldb->attrs})).$coldb->{pack_date};
    my ($dmin,$dmax,$d) = ('inf','-inf');
    foreach (@{$coldb->{xenum}->toArray}) {
      next if (!$_);
      next if (!defined($d = unpack($pack_xdate,$_))); ##-- strangeness: getting only 9-bytes in $_ for 10-byte values in file and toArray(): why?!
      $dmin = $d if ($d < $dmin);
      $dmax = $d if ($d > $dmax);
    }
    $coldb->vlog('warn', "extracted date-range \"xdmin\":$dmin, \"xdmax\":$dmax");
    @$coldb{qw(xdmin xdmax)} = ($dmin,$dmax);
  }

  ##-- open: xf
  $coldb->{xf} = DiaColloDB::Compat::v0_09::Relation::Unigrams->new(file=>"$dbdir/xf.dba", flags=>$flags, packas=>$coldb->{pack_f}, logCompat=>'off')
    or $coldb->logconfess("open(): failed to open tuple-unigrams $dbdir/xf.dba: $!");
  $coldb->{xf}{N} = $coldb->{xN} if ($coldb->{xN} && !$coldb->{xf}{N}); ##-- compat

  ##-- open: cof
  if ($coldb->{index_cof}//1) {
    $coldb->{cof} = DiaColloDB::Compat::v0_09::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags,
								      pack_i=>$coldb->{pack_id}, pack_f=>$coldb->{pack_f},
								      dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin},
								      logCompat=>'off',
								     )
      or $coldb->logconfess("open(): failed to open co-frequency file $dbdir/cof.*: $!");
  }

  ##-- open: ddc (undef if ddcServer isn't set in ddc.hdr or $coldb)
  $coldb->{ddc} = DiaColloDB::Relation::DDC->new(-r "$dbdir/ddc.hdr" ? (base=>"$dbdir/ddc") : qw())->fromDB($coldb)
    // 'DiaColloDB::Relation::DDC';

  ##-- open: tdf (if available)
  if ($coldb->{index_tdf}) {
    $coldb->{tdfopts}     //= {};
    $coldb->{tdfopts}{$_} //= $DiaColloDB::TDF_OPTS{$_} foreach (keys %DiaColloDB::TDF_OPTS); ##-- tdf: default options
    $coldb->{tdf} = DiaColloDB::Relation::TDF->new((-r "$dbdir/tdf.hdr" ? (base=>"$dbdir/tdf") : qw()),
						   dbreak => $coldb->{dbreak},
						   %{$coldb->{tdfopts}},
						  );
  }

  ##-- all done
  return $coldb;
}
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
## @dbkeys = $coldb->dbkeys() |
322
|
|
|
|
|
|
|
## @dbkeys = $coldb->dbkeys()
##  + returns the object keys under which sub-databases are stored:
##    "${attr}enum" and "${attr}2x" for each attribute of an object invocant,
##    followed by the fixed keys qw(xenum xf cof tdf)
##  + for a class-name (non-reference) invocant only the fixed keys are returned
sub dbkeys {
  my $that = $_[0];
  my @akeys;
  if (ref($that)) {
    @akeys = map { ("${_}enum", "${_}2x") } @{$that->attrs};
  }
  return (@akeys, qw(xenum xf cof tdf));
}
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->close() |
330
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
## $bool = $coldb->opened() |
333
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
## @files = $obj->diskFiles() |
336
|
|
|
|
|
|
|
## + returns list of disk files for this db |
337
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
##============================================================================== |
340
|
|
|
|
|
|
|
## Create/compile |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
343
|
|
|
|
|
|
|
## create: utils |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
## $multimap = $coldb->create_xmap($base, \%xs2i, $packfmt, $label="multimap") |
346
|
1
|
|
|
1
|
|
11
|
BEGIN {
  ##-- create_xmap() is disabled for v0.09.x compatibility objects:
  ##   install whatever DiaColloDB::Compat->nocompat() returns for it
  ##   (presumably a code-ref stub reporting the method as unsupported; verify against DiaColloDB::Compat)
  my $stub = DiaColloDB::Compat->nocompat('create_xmap');
  *create_xmap = $stub;
}
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs() |
349
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs($attrs=$coldb->{attrs}, $default=[]) |
350
|
|
|
|
|
|
|
## + parse attributes in $attrs as array |
351
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
## $aname = $CLASS_OR_OBJECT->attrName($attr) |
354
|
|
|
|
|
|
|
## + returns canonical (short) attribute name for $attr |
355
|
|
|
|
|
|
|
## + supports aliases in %ATTR_ALIAS = ($alias=>$name, ...) |
356
|
|
|
|
|
|
|
## + see also: |
357
|
|
|
|
|
|
|
## %ATTR_RALIAS = ($name=>\@aliases, ...) |
358
|
|
|
|
|
|
|
## %ATTR_CBEXPR = ($name=>$ddcCountByExpr, ...) |
359
|
|
|
|
|
|
|
## %ATTR_TITLE = ($name_or_alias=>$title, ...) |
360
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
361
|
|
|
|
|
|
|
##-- attribute tables: shallow copies of the DiaColloDB package tables, taken when this file is loaded
our (%ATTR_ALIAS, %ATTR_RALIAS, %ATTR_TITLE, %ATTR_CBEXPR);
%ATTR_ALIAS  = %DiaColloDB::ATTR_ALIAS;
%ATTR_RALIAS = %DiaColloDB::ATTR_RALIAS;
%ATTR_TITLE  = %DiaColloDB::ATTR_TITLE;
%ATTR_CBEXPR = %DiaColloDB::ATTR_CBEXPR;
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
## $atitle = $CLASS_OR_OBJECT->attrTitle($attr_or_alias) |
367
|
|
|
|
|
|
|
## + returns an attribute title for $attr_or_alias |
368
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
## $acbexpr = $CLASS_OR_OBJECT->attrCountBy($attr_or_alias,$matchid=0) |
371
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
## $aquery_or_filter_or_undef = $CLASS_OR_OBJECT->attrQuery($attr_or_alias,$cquery) |
374
|
|
|
|
|
|
|
## + returns a CQuery or CQFilter object for condition $cquery on $attr_or_alias |
375
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData() |
378
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData(\@attrs=$coldb->attrs) |
379
|
|
|
|
|
|
|
## + get attribute data for \@attrs |
380
|
|
|
|
|
|
|
## + return @attrdata = ({a=>$attr, i=>$i, enum=>$aenum, pack_x=>$pack_xa, a2x=>$a2x, ...}) |
381
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData()
## \@attrdata = $coldb->attrData(\@attrs=$coldb->attrs)
##  + collects per-attribute data for \@attrs, in the order given
##  + each element is a hash-ref:
##      i      => position of the attribute in \@attrs
##      a      => canonical name as returned by $coldb->attrName()
##      enum   => $coldb->{"${a}enum"}
##      pack_x => $coldb->{"pack_x${a}"}
##      a2x    => $coldb->{"${a}2x"}
sub attrData {
  my ($coldb,$attrs) = @_;
  $attrs = $coldb->attrs if (!defined($attrs));

  my @adata;
  foreach my $i (0..$#$attrs) {
    my $aname = $coldb->attrName($attrs->[$i]);
    push(@adata, +{
		   i      => $i,
		   a      => $aname,
		   enum   => $coldb->{"${aname}enum"},
		   pack_x => $coldb->{"pack_x$aname"},
		   a2x    => $coldb->{"${aname}2x"},
		  });
  }
  return \@adata;
}
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
## $bool = $coldb->hasAttr($attr) |
392
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
396
|
|
|
|
|
|
|
## create: from corpus |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
## $bool = $coldb->create($corpus,%opts) |
399
|
|
|
|
|
|
|
## + %opts: |
400
|
|
|
|
|
|
|
## $key => $val, ##-- clobbers $coldb->{$key} |
401
|
|
|
|
|
|
|
## + DISABLED |
402
|
1
|
|
|
1
|
|
7
|
BEGIN {
  ##-- create() is disabled for v0.09.x compatibility objects:
  ##   install whatever DiaColloDB::Compat->nocompat() returns for it
  ##   (presumably a code-ref stub reporting the method as unsupported; verify against DiaColloDB::Compat)
  my $stub = DiaColloDB::Compat->nocompat('create');
  *create = $stub;
}
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
405
|
|
|
|
|
|
|
## create: union (aka merge) |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
## $coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts) |
408
|
|
|
|
|
|
|
## + populates $coldb as union over @coldbs_or_dbdirs |
409
|
|
|
|
|
|
|
## + clobbers argument dbs {_union_${a}i2u}, {_union_xi2u}, {_union_argi} |
410
|
|
|
|
|
|
|
## + DISABLED |
411
|
1
|
|
|
1
|
|
6
|
BEGIN {
  ##-- union() (alias: merge()) is disabled for v0.09.x compatibility objects:
  ##   install whatever DiaColloDB::Compat->nocompat() returns under the name 'union'
  ##   (presumably a code-ref stub reporting the method as unsupported; verify against DiaColloDB::Compat),
  ##   then alias the 'merge' glob to the 'union' glob
  my $stub = DiaColloDB::Compat->nocompat('union');
  *union = $stub;
  *merge = *union;
}
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
414
|
|
|
|
|
|
|
## I/O: header |
415
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
## @keys = $coldb->headerKeys() |
418
|
|
|
|
|
|
|
## + keys to save as header |
419
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
## $bool = $coldb->loadHeaderData() |
422
|
|
|
|
|
|
|
## $bool = $coldb->loadHeaderData($data) |
423
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
## $bool = $coldb->saveHeader() |
426
|
|
|
|
|
|
|
## $bool = $coldb->saveHeader($headerFile) |
427
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
##============================================================================== |
430
|
|
|
|
|
|
|
## Export/Import |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
## $bool = $coldb->dbexport() |
433
|
|
|
|
|
|
|
## $bool = $coldb->dbexport($outdir,%opts) |
434
|
|
|
|
|
|
|
## + $outdir defaults to "$coldb->{dbdir}/export" |
435
|
|
|
|
|
|
|
## + %opts: |
436
|
|
|
|
|
|
|
## export_sdat => $bool, ##-- whether to export *.sdat (stringified tuple files for debugging; default=0) |
437
|
|
|
|
|
|
|
## export_cof => $bool, ##-- do/don't export cof.* (default=do) |
438
|
|
|
|
|
|
|
## export_tdf => $bool, ##-- do/don't export tdf.* (default=do) |
439
|
|
|
|
|
|
|
sub dbexport { |
440
|
0
|
|
|
0
|
0
|
|
my ($coldb,$outdir,%opts) = @_; |
441
|
0
|
0
|
|
|
|
|
$coldb->logconfess("cannot dbexport() an un-opened DB") if (!$coldb->opened); |
442
|
0
|
|
0
|
|
|
|
$outdir //= "$coldb->{dbdir}/export"; |
443
|
0
|
|
|
|
|
|
$outdir =~ s{/$}{}; |
444
|
0
|
|
|
|
|
|
$coldb->vlog('info', "export($outdir/)"); |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
##-- options |
447
|
0
|
0
|
|
|
|
|
my $export_sdat = exists($opts{export_sdat}) ? $opts{export_sdat} : 0; |
448
|
0
|
0
|
|
|
|
|
my $export_cof = exists($opts{export_cof}) ? $opts{export_cof} : 1; |
449
|
0
|
0
|
|
|
|
|
my $export_tdf = exists($opts{export_tdf}) ? $opts{export_tdf} : 1; |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
##-- create export directory |
452
|
0
|
0
|
0
|
|
|
|
-d $outdir |
453
|
|
|
|
|
|
|
or make_path($outdir) |
454
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport(): could not create export directory $outdir: $!"); |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
##-- dump: header |
457
|
0
|
0
|
|
|
|
|
$coldb->saveHeader("$outdir/header.json") |
458
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport(): could not export header to $outdir/header.json: $!"); |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
##-- dump: load enums |
461
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData(); |
462
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): loading enums to memory"); |
463
|
0
|
0
|
0
|
|
|
|
$coldb->{xenum}->load() if ($coldb->{xenum} && !$coldb->{xenum}->loaded); |
464
|
0
|
|
|
|
|
|
foreach (@$adata) { |
465
|
0
|
0
|
0
|
|
|
|
$_->{enum}->load() if ($_->{enum} && !$_->{enum}->loaded); |
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
##-- dump: common: stringification |
469
|
0
|
|
|
|
|
|
my $pack_x = $coldb->{pack_x}; |
470
|
0
|
|
|
|
|
|
my ($xs2txt,$xi2txt); |
471
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
472
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): preparing tuple-stringification structures"); |
473
|
|
|
|
|
|
|
|
474
|
0
|
|
|
|
|
|
foreach (@$adata) { |
475
|
0
|
|
|
|
|
|
my $i2s = $_->{i2s} = $_->{enum}->toArray; |
476
|
0
|
|
0
|
0
|
|
|
$_->{i2txt} = sub { return $i2s->[$_[0]//0]//''; }; |
|
0
|
|
0
|
|
|
|
|
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
|
479
|
0
|
|
|
|
|
|
my $xi2s = $coldb->{xenum}->toArray; |
480
|
0
|
|
|
|
|
|
my @ai2s = map {$_->{i2s}} @$adata; |
|
0
|
|
|
|
|
|
|
481
|
0
|
|
|
|
|
|
my (@x); |
482
|
|
|
|
|
|
|
$xs2txt = sub { |
483
|
0
|
|
|
0
|
|
|
@x = unpack($pack_x,$_[0]); |
484
|
0
|
|
0
|
|
|
|
return join("\t", (map {$ai2s[$_][$x[$_]//0]//''} (0..$#ai2s)), $x[$#x]//0); |
|
0
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
485
|
0
|
|
|
|
|
|
}; |
486
|
|
|
|
|
|
|
$xi2txt = sub { |
487
|
0
|
|
0
|
0
|
|
|
@x = unpack($pack_x, $xi2s->[$_[0]//0]//''); |
|
|
|
0
|
|
|
|
|
488
|
0
|
|
0
|
|
|
|
return join("\t", (map {$ai2s[$_][$x[$_]//0]//''} (0..$#ai2s)), $x[$#x]//0); |
|
0
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
489
|
0
|
|
|
|
|
|
}; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
##-- dump: xenum: raw |
493
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting raw tuple-enum file $outdir/xenum.dat"); |
494
|
0
|
0
|
|
|
|
|
$coldb->{xenum}->saveTextFile("$outdir/xenum.dat", pack_s=>$pack_x) |
495
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/xenum.dat"); |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
##-- dump: xenum: stringified |
498
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
499
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting stringified tuple-enum file $outdir/xenum.sdat"); |
500
|
0
|
0
|
|
|
|
|
$coldb->{xenum}->saveTextFile("$outdir/xenum.sdat", pack_s=>$xs2txt) |
501
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/xenum.sdat"); |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
##-- dump: by attribute: enum |
505
|
0
|
|
|
|
|
|
foreach (@$adata) { |
506
|
|
|
|
|
|
|
##-- dump: by attribute: enum |
507
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting attribute enum file $outdir/$_->{a}_enum.dat"); |
508
|
0
|
0
|
|
|
|
|
$_->{enum}->saveTextFile("$outdir/$_->{a}_enum.dat") |
509
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/$_->{a}_enum.dat"); |
510
|
|
|
|
|
|
|
} |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
##-- dump: by attribute: a2x |
513
|
0
|
|
|
|
|
|
foreach (@$adata) { |
514
|
|
|
|
|
|
|
##-- dump: by attribute: a2x: raw |
515
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting attribute expansion multimap $outdir/$_->{a}_2x.dat (raw)"); |
516
|
0
|
0
|
|
|
|
|
$_->{a2x}->saveTextFile("$outdir/$_->{a}_2x.dat") |
517
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/$_->{a}_2x.dat"); |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
##-- dump: by attribute: a2x: stringified |
520
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
521
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting attribute expansion multimap $outdir/$_->{a}_2x.sdat (strings)"); |
522
|
0
|
0
|
|
|
|
|
$_->{a2x}->saveTextFile("$outdir/$_->{a}_2x.sdat", a2s=>$_->{i2txt}, b2s=>$xi2txt) |
523
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/$_->{a}_2x.sdat"); |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
##-- dump: xf |
528
|
0
|
0
|
|
|
|
|
if ($coldb->{xf}) { |
529
|
|
|
|
|
|
|
##-- dump: xf: raw |
530
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting tuple-frequency index $outdir/xf.dat"); |
531
|
0
|
|
|
|
|
|
$coldb->{xf}->setFilters($coldb->{pack_f}); |
532
|
0
|
0
|
|
|
|
|
$coldb->{xf}->saveTextFile("$outdir/xf.dat", keys=>1) |
533
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/xf.dat"); |
534
|
0
|
|
|
|
|
|
$coldb->{xf}->setFilters(); |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
##-- dump: xf: stringified |
537
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
538
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting stringified tuple-frequency index $outdir/xf.sdat"); |
539
|
0
|
0
|
|
|
|
|
$coldb->{xf}->saveTextFile("$outdir/xf.sdat", key2s=>$xi2txt) |
540
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/xf.sdat"); |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
} |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
##-- dump: cof |
545
|
0
|
0
|
0
|
|
|
|
if ($coldb->{cof} && $export_cof) { |
546
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting raw co-frequency index $outdir/cof.dat"); |
547
|
0
|
0
|
|
|
|
|
$coldb->{cof}->saveTextFile("$outdir/cof.dat") |
548
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/cof.dat"); |
549
|
|
|
|
|
|
|
|
550
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
551
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting stringified co-frequency index $outdir/cof.sdat"); |
552
|
0
|
0
|
|
|
|
|
$coldb->{cof}->saveTextFile("$outdir/cof.sdat", i2s=>$xi2txt) |
553
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/cof.sdat"); |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
} |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
##-- dump: tdf |
558
|
0
|
0
|
0
|
|
|
|
if ($coldb->{tdf} && $coldb->{index_tdf} && $export_tdf) { |
|
|
|
0
|
|
|
|
|
559
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting term-document index $outdir/tdf.*"); |
560
|
0
|
0
|
|
|
|
|
$coldb->{tdf}->export("$outdir/tdf", $coldb) |
561
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/tdf.*"); |
562
|
|
|
|
|
|
|
} |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
##-- all done |
565
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): export to $outdir complete."); |
566
|
0
|
|
|
|
|
|
return $coldb; |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
## $coldb = $coldb->dbimport() |
570
|
|
|
|
|
|
|
## $coldb = $coldb->dbimport($txtdir,%opts) |
571
|
|
|
|
|
|
|
## + import ColocDB data from $txtdir |
572
|
|
|
|
|
|
|
## + TODO |
573
|
|
|
|
|
|
|
sub dbimport {
  ##-- Placeholder for importing a text dump from $txtdir: not implemented in this class.
  ##   + may be called on a class name or on an object
  ##   + $txtdir and %opts are accepted for interface compatibility but are never consulted
  my ($that,$txtdir,%opts) = @_;

  ##-- class-method call: instantiate first so that there is an object to report through
  my $coldb = ref($that) ? $that : $that->new();

  ##-- unconditionally report the missing implementation
  $coldb->logconfess("dbimport(): not yet implemented");
}
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
##============================================================================== |
580
|
|
|
|
|
|
|
## Info |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
## \%info = $coldb->dbinfo() |
583
|
|
|
|
|
|
|
## + get db info |
584
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
##============================================================================== |
588
|
|
|
|
|
|
|
## Profiling |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
591
|
|
|
|
|
|
|
## Profiling: Wrappers |
592
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
## $mprf = $coldb->query($rel,%opts) |
595
|
|
|
|
|
|
|
## + get a generic DiaColloDB::Profile::Multi object for $rel |
596
|
|
|
|
|
|
|
## + calls $coldb->profile() or $coldb->compare() as appropriate |
597
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
## $mprf = $coldb->profile1(%opts) |
600
|
|
|
|
|
|
|
## + get unigram frequency profile for selected items as a DiaColloDB::Profile::Multi object |
601
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('xf', %opts) |
602
|
|
|
|
|
|
|
## + %opts: see profile() method |
603
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
## $mprf = $coldb->profile2(%opts) |
606
|
|
|
|
|
|
|
## + get co-frequency profile for selected items as a DiaColloDB::Profile::Multi object |
607
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('cof', %opts) |
608
|
|
|
|
|
|
|
## + %opts: see profile() method |
609
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
## $mprf = $coldb->compare1(%opts) |
612
|
|
|
|
|
|
|
## + get unigram comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
613
|
|
|
|
|
|
|
## + really just wraps $coldb->compare('xf', %opts) |
614
|
|
|
|
|
|
|
## + %opts: see compare() method |
615
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
## $mprf = $coldb->compare2(%opts) |
618
|
|
|
|
|
|
|
## + get co-frequency comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
619
|
|
|
|
|
|
|
## + really just wraps $coldb->compare('cof', %opts) |
620
|
|
|
|
|
|
|
## + %opts: see compare() method |
621
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
625
|
|
|
|
|
|
|
## Profiling: Utils |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
628
|
|
|
|
|
|
|
## $relname = $coldb->relname($rel) |
629
|
|
|
|
|
|
|
## + returns an appropriate relation name for profile() and friends |
630
|
|
|
|
|
|
|
## + returns $rel if $coldb->{$rel} supports a profile() method |
631
|
|
|
|
|
|
|
## + otherwise heuristically parses $relationName /xf|f?1|ug/ or /f1?2|c/ |
632
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
635
|
|
|
|
|
|
|
## $obj_or_undef = $coldb->relation($rel) |
636
|
|
|
|
|
|
|
## + returns an appropriate relation-like object for profile() and friends |
637
|
|
|
|
|
|
|
## + wraps $coldb->{$coldb->relname($rel)} |
638
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
641
|
|
|
|
|
|
|
## @relnames = $coldb->relations() |
642
|
|
|
|
|
|
|
## + gets list of defined relations |
643
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
646
|
|
|
|
|
|
|
## \@ids = $coldb->enumIds($enum,$req,%opts) |
647
|
|
|
|
|
|
|
## + parses enum IDs for $req, which is one of: |
648
|
|
|
|
|
|
|
## - a DDC::Any::CQTokExact, ::CQTokInfl, ::CQTokSet, ::CQTokSetInfl, or ::CQTokRegex : interpreted |
649
|
|
|
|
|
|
|
## - an ARRAY-ref : list of literal symbol-values |
650
|
|
|
|
|
|
|
## - a Regexp ref : regexp for target strings, passed to $enum->re2i() |
651
|
|
|
|
|
|
|
## - a string /REGEX/ : regexp for target strings, passed to $enum->re2i() |
652
|
|
|
|
|
|
|
## - another string : space-, comma-, or |-separated list of literal values |
653
|
|
|
|
|
|
|
## + %opts: |
654
|
|
|
|
|
|
|
## logLevel => $logLevel, ##-- logging level (default=undef) |
655
|
|
|
|
|
|
|
## logPrefix => $prefix, ##-- logging prefix (default="enumIds(): fetch ids") |
656
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
659
|
|
|
|
|
|
|
## ($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi) = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0) |
660
|
|
|
|
|
|
|
## + parses date request and returns limit and filter information as a list (list context) or HASH-ref (scalar context); |
661
|
|
|
|
|
|
|
## + %dateRequest = |
662
|
|
|
|
|
|
|
## ( |
663
|
|
|
|
|
|
|
## dfilter => $dfilter, ##-- filter-sub, called as: $wanted=$dfilter->($date); undef for none |
664
|
|
|
|
|
|
|
## slo => $sliceLo, ##-- minimum slice (inclusive) |
665
|
|
|
|
|
|
|
## shi => $sliceHi, ##-- maximum slice (inclusive) |
666
|
|
|
|
|
|
|
## dlo => $dateLo, ##-- minimum date (inclusive); undef for none, always defined if $fill is true |
667
|
|
|
|
|
|
|
## dhi => $dateHi, ##-- maximum date (inclusive); undef for none, always defined if $fill is true |
668
|
|
|
|
|
|
|
## ) |
669
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
672
|
|
|
|
|
|
|
## \%slice2xids = $coldb->xidsByDate(\@xids, $dateRequest, $sliceRequest, $fill) |
673
|
|
|
|
|
|
|
## + parse and filter \@xids by $dateRequest, $sliceRequest |
674
|
|
|
|
|
|
|
## + returns a HASH-ref from slice-ids to \@xids in that date-slice |
675
|
|
|
|
|
|
|
## + if $fill is true, returned HASH-ref has a key for each date-slice in range |
676
|
|
|
|
|
|
|
## + OBSOLETE in DiaColloDB |
677
|
|
|
|
|
|
|
sub xidsByDate {
  ##-- Partition the tuple-ids in \@xids by date-slice.
  ##   + arguments: ($coldb, \@xids, $dateRequest, $sliceRequest, $fill)
  ##   + returns  : HASH-ref ($sliceKey => \@xids_in_slice, ...)
  ##   + tuples rejected by the date filter or lying outside [$coldb->{xdmin},$coldb->{xdmax}] are dropped
  ##   + if $slice is false, all surviving ids are collected under the single key 0
  ##   + if both $fill and $slice are true, every slice key from $sliceLo to $sliceHi
  ##     (in steps of $slice) is guaranteed to exist, with an empty ARRAY-ref if necessary
  my ($coldb,$xids,$date,$slice,$fill) = @_;

  ##-- list-context call; the trailing ($dateLo,$dateHi) return values are not needed here
  my ($dfilter,$slo,$shi) = $coldb->parseDateRequest($date,$slice,$fill);

  my $xenum = $coldb->{xenum};
  my ($xdmin,$xdmax) = @$coldb{qw(xdmin xdmax)};

  ##-- date-only template: '@N' jumps to byte offset N = packsize(pack_id) * (number of attributes),
  ##   then a single date is unpacked (packsize() presumably gives the packed byte length -- confirm in DiaColloDB::Utils)
  my $pack_xd = '@'.(packsize($coldb->{pack_id}) * scalar(@{$coldb->{attrs}})).$coldb->{pack_date};

  ##-- filter xids: ($sliceKey => \@xis_in_slice, ...)
  my %d2xis;
  foreach my $xi (@$xids) {
    my $d = unpack($pack_xd, $xenum->i2s($xi));
    next if ($dfilter && !$dfilter->($d));
    next if ($d < $xdmin || $d > $xdmax);

    ##-- round the date down to its slice's lower bound
    push(@{$d2xis{$slice ? int($d/$slice)*$slice : 0}}, $xi);
  }

  ##-- force-fill?
  if ($fill && $slice) {
    for (my $s=$slo; $s <= $shi; $s += $slice) {
      $d2xis{$s} //= [];
    }
  }

  return \%d2xis;
}
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
707
|
|
|
|
|
|
|
## $compiler = $coldb->qcompiler(); |
708
|
|
|
|
|
|
|
## + get DDC::Any::CQueryCompiler for this object (cached in $coldb->{_qcompiler}) |
709
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
712
|
|
|
|
|
|
|
## $cquery_or_undef = $coldb->qparse($ddc_query_string) |
713
|
|
|
|
|
|
|
## + wraps parse in an eval {...} block and sets $coldb->{error} on failure |
714
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
718
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs |
719
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery(["$attr1:$val1",...], %opts) ##-- compat: ARRAY-of-requests |
720
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery({$attr1=>$val1, ...}, %opts) ##-- compat: HASH |
721
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery("$attr1=$val1, ...", %opts) ##-- compat: string |
722
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery($ddcQueryString, %opts) ##-- ddc string (with shorthand ","->WITH, "&&"->WITH) |
723
|
|
|
|
|
|
|
## + guts for parsing user target and groupby requests |
724
|
|
|
|
|
|
|
## + returns a DDC::Any::CQuery object representing the request |
725
|
|
|
|
|
|
|
## + index-only items "$l" are mapped to $l=@{} |
726
|
|
|
|
|
|
|
## + %opts: |
727
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
728
|
|
|
|
|
|
|
## logas => $reqtype, ##-- request type for warnings |
729
|
|
|
|
|
|
|
## default => $attr, ##-- default attribute (for query requests) |
730
|
|
|
|
|
|
|
## mapand => $bool, ##-- map CQAnd to CQWith? (default=true unless '&&' occurs in query string) |
731
|
|
|
|
|
|
|
## ddcmode => $bool, ##-- force ddc query mode? (default=false) |
732
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
735
|
|
|
|
|
|
|
## \@aqs = $coldb->queryAttributes($cquery,%opts) |
736
|
|
|
|
|
|
|
## + utility for decomposing DDC queries into attribute-wise requests |
737
|
|
|
|
|
|
|
## + returns an ARRAY-ref [[$attr1,$val1], ...] |
738
|
|
|
|
|
|
|
## + each value $vali is empty or undef (all values), a CQTokSet, a CQTokExact, CQTokRegex, or CQTokAny |
739
|
|
|
|
|
|
|
## + chokes on unsupported query types or filters |
740
|
|
|
|
|
|
|
## + %opts: |
741
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
742
|
|
|
|
|
|
|
## logas => $reqtype, ##-- request type for warnings |
743
|
|
|
|
|
|
|
## default => $attr, ##-- default attribute (for query requests) |
744
|
|
|
|
|
|
|
## allowExtra => \@attrs, ##-- allow extra attributes @attrs (may also be HASH-ref) |
745
|
|
|
|
|
|
|
## allowUnknown => $bool, ##-- allow unknown attributes? (default: 0) |
746
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
749
|
|
|
|
|
|
|
## \@aqs = $coldb->parseRequest($request, %opts) |
750
|
|
|
|
|
|
|
## + guts for parsing user target and groupby requests into attribute-wise ARRAY-ref [[$attr1,$val1], ...] |
751
|
|
|
|
|
|
|
## + see parseQuery() method for supported $request formats and %opts |
752
|
|
|
|
|
|
|
## + wraps $coldb->queryAttributes($coldb->parseQuery($request,%opts)) |
753
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
756
|
|
|
|
|
|
|
## \%groupby = $coldb->groupby($groupby_request, %opts) |
757
|
|
|
|
|
|
|
## \%groupby = $coldb->groupby(\%groupby, %opts) |
758
|
|
|
|
|
|
|
## + $groupby_request : see parseRequest() |
759
|
|
|
|
|
|
|
## + returns a HASH-ref: |
760
|
|
|
|
|
|
|
## ( |
761
|
|
|
|
|
|
|
## req => $request, ##-- save request |
762
|
|
|
|
|
|
|
## #x2g => \&x2g, ##-- group-tuple extraction code suitable for e.g. DiaColloDB::Relation::Cofreqs::profile(groupby=>\&x2g) ##--OLD |
763
|
|
|
|
|
|
|
## xi2g => \&xi2g, ##-- group-tuple extraction code ($xi => $gtuple) suitable for e.g. DiaColloDB::Relation::Cofreqs::profile(groupby=>\&x2g) ##--OLD |
764
|
|
|
|
|
|
|
## xs2g => \&xs2g, ##-- group-tuple extraction code ($xs => $gtuple) |
765
|
|
|
|
|
|
|
## g2s => \&g2s, ##-- stringification object suitable for DiaColloDB::Profile::stringify() [CODE,enum, or undef] |
766
|
|
|
|
|
|
|
## g2txt => \&g2txt, ##-- compatible join()-string stringification sub |
767
|
|
|
|
|
|
|
## xpack => \@xpack, ##-- group-attribute-wise pack-templates, given @xtuple |
768
|
|
|
|
|
|
|
## gpack => \@gpack, ##-- group-attribute-wise pack-templates, given @gtuple |
769
|
|
|
|
|
|
|
## areqs => \@areqs, ##-- parsed attribute requests ([$attr,$ahaving],...) |
770
|
|
|
|
|
|
|
## attrs => \@attrs, ##-- like $coldb->attrs($groupby_request), modulo "having" parts |
771
|
|
|
|
|
|
|
## titles => \@titles, ##-- like map {$coldb->attrTitle($_)} @attrs |
772
|
|
|
|
|
|
|
## ) |
773
|
|
|
|
|
|
|
## + %opts: |
774
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
775
|
|
|
|
|
|
|
## relax => $bool, ##-- allow unsupported attributes (default=0) |
776
|
|
|
|
|
|
|
## xenum => $xenum, ##-- enum to use for \&x2g and \&g2s (default: $coldb->{xenum}) |
777
|
|
|
|
|
|
|
## + OVERRIDES DiaColloDB |
778
|
|
|
|
|
|
|
sub groupby {
  ##-- Compile a group-by request into a HASH-ref of aggregation helpers.
  ##   + arguments: ($coldb, $groupby_request, %opts); a HASH-ref request is returned as-is
  ##   + %opts is forwarded to parseRequest(); $opts{xenum} overrides $coldb->{xenum} for the xi2g sub
  ##   + keys set on the returned HASH-ref:
  ##       req, areqs, attrs, titles, xpack, gpack, xs2g, xi2g, g2s, g2txt
  ##   + calls logconfess() (after setting $coldb->{error}) if any generated sub fails to compile
  ##   + NOTE: the four subs are built with string-eval and refer BY NAME to the lexicals
  ##     @gi, @gbids, $xenum, $genum and @genums declared below. Those names must not change,
  ##     and the eval calls must stay directly in this sub's scope: an eval nested inside a helper
  ##     closure would not see lexicals that the closure itself never mentions.
  my ($coldb,$gbreq,%opts) = @_;
  return $gbreq if (UNIVERSAL::isa($gbreq,'HASH'));

  ##-- get data
  my $gb = { req=>$gbreq };

  ##-- get attribute requests ([$attr,$having], ...)
  my $gbareqs = $gb->{areqs} = $coldb->parseRequest($gb->{req}, %opts,logas=>'groupby');

  ##-- get attribute names (compat) and titles
  my $gbattrs = $gb->{attrs} = [map {$_->[0]} @$gbareqs];
  $gb->{titles} = [map {$coldb->attrTitle($_)} @$gbattrs];

  ##-- pack templates
  my $xenum    = $opts{xenum} // $coldb->{xenum};  ##-- used only by the generated xi2g code
  my $pack_id  = $coldb->{pack_id};
  my $pack_ids = "($pack_id)*";
  my $len_id   = packsize($pack_id);
  $gb->{xpack} = [map {$coldb->{"pack_x$_"}} @$gbattrs];
  $gb->{gpack} = [map {'@'.($_*$len_id).$pack_id} (0..$#$gbattrs)];
  my $gbxpack  = join('', @{$gb->{xpack}});

  ##-- having-filters: one entry per attribute request: {$id=>undef, ...} if restricted, undef otherwise
  my @gbids;
  foreach my $areq (@$gbareqs) {
    if ($areq->[1] && !UNIVERSAL::isa($areq->[1],'DDC::Any::CQTokAny')) {
      my $wanted = $coldb->enumIds($coldb->{"$areq->[0]enum"}, $areq->[1], logLevel=>$coldb->{logProfile}, logPrefix=>"groupby(): fetch filter ids: $areq->[0]");
      push(@gbids, { map {($_=>undef)} @$wanted });
    }
    else {
      push(@gbids, undef);
    }
  }

  ##-- get groupby-sub: packed tuple string -> packed group key
  my @gi;  ##-- scratch id-tuple shared by the generated subs
  my $xs2g_code;
  if (grep {$_} @gbids) {
    ##-- group-by code: with having-filters: yields undef unless every restricted attribute id is whitelisted
    my @rejects = map {"!exists(\$gbids[$_]{\$gi[$_]})"} grep {defined($gbids[$_])} (0..$#gbids);
    $xs2g_code = (''
		  .qq{ \@gi=unpack('$gbxpack',\$_[0]);}
		  .qq{ return undef if (}.join(' || ', @rejects).qq{);}
		  .qq{ return pack('$pack_ids',\@gi); }
		 );
  }
  else {
    ##-- group-by code: no filters
    $xs2g_code = qq{ pack('$pack_ids', unpack('$gbxpack', \$_[0])) };
  }
  my $xs2g_sub = eval qq{sub {$xs2g_code}};
  $coldb->logconfess($coldb->{error}="groupby(): could not compile tuple-based aggregation code sub {$xs2g_code}: $@") if (!$xs2g_sub);
  $@='';
  $gb->{xs2g} = $xs2g_sub;

  ##-- id-based variant: same code, but the argument is first mapped through $xenum->i2s()
  (my $xi2g_code = $xs2g_code) =~ s{\$_\[0\]}{\$xenum->i2s(\$_[0])};
  my $xi2g_sub = eval qq{sub {$xi2g_code}};
  $coldb->logconfess($coldb->{error}="groupby(): could not compile id-base aggregation code sub {$xi2g_code}: $@") if (!$xi2g_sub);
  $@='';
  $gb->{xi2g} = $xi2g_sub;

  ##-- get stringification sub: packed group key -> attribute string(s), TAB-joined if more than one
  my $genum;
  my @genums;
  my $g2scode;
  if (@$gbattrs == 1) {
    ##-- stringify a single attribute
    $genum   = $coldb->{$gbattrs->[0]."enum"};
    $g2scode = qq{ \$genum->i2s(unpack('$pack_id',\$_[0])) };
  }
  else {
    ##-- stringify multiple attributes
    @genums  = map {$coldb->{$_."enum"}} @$gbattrs;
    $g2scode = (''
		.qq{ \@gi=unpack('$pack_ids', \$_[0]); }
		.q{ join("\t",}.join(', ', map {"\$genums[$_]->i2s(\$gi[$_])"} (0..$#genums)).q{)}
	       );
  }
  my $g2s = eval qq{sub {$g2scode}};
  $coldb->logconfess($coldb->{error}="groupby(): could not compile stringification code sub {$g2scode}: $@") if (!$g2s);
  $@='';
  $gb->{g2s} = $g2s;

  ##-- get pseudo-stringification sub ("\t"-joined decimal integer ids)
  my $g2txt_code = (@$gbattrs == 1
		    ? qq{ unpack('$pack_id',\$_[0]) }
		    : qq{ join("\t",unpack('$pack_ids', \$_[0])); });
  my $g2txt = eval qq{sub {$g2txt_code}};
  $coldb->logconfess($coldb->{error}="groupby(): could not compile pseudo-stringification code sub {$g2txt_code}: $@") if (!$g2txt);
  $@='';
  $gb->{g2txt} = $g2txt;

  return $gb;
}
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
875
|
|
|
|
|
|
|
## $cqfilter = $coldb->query2filter($attr,$cquery,%opts) |
876
|
|
|
|
|
|
|
## + converts a CQToken to a CQFilter, for ddc parsing |
877
|
|
|
|
|
|
|
## + %opts: |
878
|
|
|
|
|
|
|
## logas => $logas, ##-- log-prefix for warnings |
879
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
882
|
|
|
|
|
|
|
## ($CQCountKeyExprs,\$CQRestrict,\@CQFilters) = $coldb->parseGroupBy($groupby_string_or_request,%opts) |
883
|
|
|
|
|
|
|
## + for ddc-mode parsing |
884
|
|
|
|
|
|
|
## + %opts: |
885
|
|
|
|
|
|
|
## date => $date, |
886
|
|
|
|
|
|
|
## slice => $slice, |
887
|
|
|
|
|
|
|
## matchid => $matchid, ##-- default match-id |
888
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
891
|
|
|
|
|
|
|
## Profiling: Generic |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
## $mprf = $coldb->profile($relation, %opts) |
894
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
895
|
|
|
|
|
|
|
## + %opts: |
896
|
|
|
|
|
|
|
## ( |
897
|
|
|
|
|
|
|
## ##-- selection parameters |
898
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
899
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
900
|
|
|
|
|
|
|
## ## |
901
|
|
|
|
|
|
|
## ##-- aggregation parameters |
902
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
903
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
904
|
|
|
|
|
|
|
## ## |
905
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
906
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
907
|
|
|
|
|
|
|
## score => $func, ##-- scoring function (f|fm|lf|lfm|mi|ld) : default="f" |
908
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
909
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
910
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
911
|
|
|
|
|
|
|
## ## |
912
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
913
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify (default=do) |
914
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
915
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs profiling only) |
916
|
|
|
|
|
|
|
## ) |
917
|
|
|
|
|
|
|
## + sets default %opts and wraps $coldb->relation($rel)->profile($coldb, %opts) |
918
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
## \%opts = $CLASS_OR_OBJECT->profileOptions(\%opts) |
921
|
|
|
|
|
|
|
## + instantiates default options for profile() method |
922
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
925
|
|
|
|
|
|
|
## Profiling: Comparison (diff) |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
## $mprf = $coldb->compare($relation, %opts) |
928
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
929
|
|
|
|
|
|
|
## + %opts: |
930
|
|
|
|
|
|
|
## ( |
931
|
|
|
|
|
|
|
## ##-- selection parameters |
932
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
933
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
934
|
|
|
|
|
|
|
## ## |
935
|
|
|
|
|
|
|
## ##-- aggregation parameters |
936
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
937
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
938
|
|
|
|
|
|
|
## ## |
939
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
940
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
941
|
|
|
|
|
|
|
## score => $func, ##-- scoring function (f|fm|lf|lfm|mi|ld) : default="f" |
942
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
943
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score (UNUSED for comparison profiles) |
944
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
945
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (diff|adiff|sum|min|max|avg|havg); default='adiff' |
946
|
|
|
|
|
|
|
## ## |
947
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
948
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify (default=do) |
949
|
|
|
|
|
|
|
## ) |
950
|
|
|
|
|
|
|
## + sets default %opts and wraps $coldb->relation($rel)->compare($coldb, %opts) |
951
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
## \%opts = $CLASS_OR_OBJECT->compareOptions(\%opts) |
954
|
|
|
|
|
|
|
## + instantiates default options for compare() method |
955
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
##============================================================================== |
958
|
|
|
|
|
|
|
## Footer |
959
|
|
|
|
|
|
|
1; |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
__END__ |