| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## |
|
3
|
|
|
|
|
|
|
## File: Compat::v0_09::DiaColloDB.pm |
|
4
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
5
|
|
|
|
|
|
|
## Description: collocation db, top-level: v0.09.x compatibility hack |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::DiaColloDB; |
|
8
|
1
|
|
|
1
|
|
8
|
use DiaColloDB; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
9
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Compat::v0_09::Relation; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
23
|
|
|
10
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Compat::v0_09::Relation::Unigrams; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
20
|
|
|
11
|
1
|
|
|
1
|
|
5
|
use DiaColloDB::Compat::v0_09::Relation::Cofreqs; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
28
|
|
|
12
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Utils qw(:math :fcntl :json :sort :pack :regex :file :si :run :env :temp); |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
68
|
|
|
13
|
1
|
|
|
1
|
|
610
|
use DDC::Any; ##-- for query parsing |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
13
|
|
|
14
|
1
|
|
|
1
|
|
55
|
use File::Path qw(make_path remove_tree); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
26
|
|
|
15
|
1
|
|
|
1
|
|
143
|
use Fcntl; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
48
|
|
|
16
|
1
|
|
|
1
|
|
295
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
1676
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##============================================================================== |
|
19
|
|
|
|
|
|
|
## Globals & Constants |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB DiaColloDB::Compat); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
## $ECLASS |
|
24
|
|
|
|
|
|
|
## + enum class |
|
25
|
|
|
|
|
|
|
#our $ECLASS = 'DiaColloDB::EnumFile'; |
|
26
|
|
|
|
|
|
|
our $ECLASS = 'DiaColloDB::EnumFile::MMap'; |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
## $XECLASS |
|
29
|
|
|
|
|
|
|
## + fixed-length enum class |
|
30
|
|
|
|
|
|
|
#our $XECLASS = 'DiaColloDB::EnumFile::FixedLen'; |
|
31
|
|
|
|
|
|
|
our $XECLASS = 'DiaColloDB::EnumFile::FixedLen::MMap'; |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
## $MMCLASS |
|
34
|
|
|
|
|
|
|
## + multimap class |
|
35
|
|
|
|
|
|
|
#our $MMCLASS = 'DiaColloDB::MultiMapFile'; |
|
36
|
|
|
|
|
|
|
our $MMCLASS = 'DiaColloDB::MultiMapFile::MMap'; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
##============================================================================== |
|
39
|
|
|
|
|
|
|
## Constructors etc. |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
## $coldb = CLASS_OR_OBJECT->new(%args) |
|
42
|
|
|
|
|
|
|
## + %args, object structure: |
|
43
|
|
|
|
|
|
|
## ( |
|
44
|
|
|
|
|
|
|
## ##-- options |
|
45
|
|
|
|
|
|
|
## dbdir => $dbdir, ##-- database directory; REQUIRED |
|
46
|
|
|
|
|
|
|
## flags => $fcflags, ##-- fcntl flags or open()-style mode string; default='r' |
|
47
|
|
|
|
|
|
|
## attrs => \@attrs, ##-- index attributes (input as space-separated or array; compiled to array); default=undef (==>['l']) |
|
48
|
|
|
|
|
|
|
## ## + each attribute can be token-attribute qw(w p l) or a document metadata attribute "doc.ATTR" |
|
49
|
|
|
|
|
|
|
## ## + document "date" attribute is always indexed |
|
50
|
|
|
|
|
|
|
## info => \%info, ##-- additional data to return in info() method (e.g. collection, maintainer) |
|
51
|
|
|
|
|
|
|
## #bos => $bos, ##-- special string to use for BOS, undef or empty for none (default=undef) DISABLED |
|
52
|
|
|
|
|
|
|
## #eos => $eos, ##-- special string to use for EOS, undef or empty for none (default=undef) DISABLED |
|
53
|
|
|
|
|
|
|
## pack_id => $fmt, ##-- pack-format for IDs (default='N') |
|
54
|
|
|
|
|
|
|
## pack_f => $fmt, ##-- pack-format for frequencies (default='N') |
|
55
|
|
|
|
|
|
|
## pack_date => $fmt, ##-- pack-format for dates (default='n') |
|
56
|
|
|
|
|
|
|
## pack_off => $fmt, ##-- pack-format for file offsets (default='N') |
|
57
|
|
|
|
|
|
|
## pack_len => $len, ##-- pack-format for string lengths (default='n') |
|
58
|
|
|
|
|
|
|
## dmax => $dmax, ##-- maximum distance for collocation-frequencies and implicit ddc near() queries (default=5) |
|
59
|
|
|
|
|
|
|
## cfmin => $cfmin, ##-- minimum co-occurrence frequency for Cofreqs and ddc queries (default=2) |
|
60
|
|
|
|
|
|
|
## tfmin => $tfmin, ##-- minimum global term-frequency WITHOUT date component (default=2) |
|
61
|
|
|
|
|
|
|
## fmin_${a} => $fmin, ##-- minimum independent frequency for value of attribute ${a} (default=undef:from $tfmin) |
|
62
|
|
|
|
|
|
|
## keeptmp => $bool, ##-- keep temporary files? (default=0) |
|
63
|
|
|
|
|
|
|
## index_tdf => $bool, ##-- tdf: create/use (term x document) frequency matrix index? (default=undef: if available) |
|
64
|
|
|
|
|
|
|
## index_cof => $bool, ##-- cof: create/use co-frequency index (default=1) |
|
65
|
|
|
|
|
|
|
## dbreak => $dbreak, ##-- tdf: use break-type $dbreak for tdf index (default=undef: files) |
|
66
|
|
|
|
|
|
|
## tdfopts=>\%tdfopts, ##-- tdf: options for DiaColloDB::Relation::TDF->new(); default=undef (all inherited from %TDF_OPTS) |
|
67
|
|
|
|
|
|
|
## ## |
|
68
|
|
|
|
|
|
|
## ##-- runtime ddc relation options |
|
69
|
|
|
|
|
|
|
## ddcServer => "$host:$port", ##-- server for ddc relation |
|
70
|
|
|
|
|
|
|
## ddcTimeout => $seconds, ##-- timeout for ddc relation |
|
71
|
|
|
|
|
|
|
## ## |
|
72
|
|
|
|
|
|
|
## ##-- source filtering (for create()) |
|
73
|
|
|
|
|
|
|
## pgood => $regex, ##-- positive filter regex for part-of-speech tags |
|
74
|
|
|
|
|
|
|
## pbad => $regex, ##-- negative filter regex for part-of-speech tags |
|
75
|
|
|
|
|
|
|
## wgood => $regex, ##-- positive filter regex for word text |
|
76
|
|
|
|
|
|
|
## wbad => $regex, ##-- negative filter regex for word text |
|
77
|
|
|
|
|
|
|
## lgood => $regex, ##-- positive filter regex for lemma text |
|
78
|
|
|
|
|
|
|
## lbad => $regex, ##-- negative filter regex for lemma text |
|
79
|
|
|
|
|
|
|
## ## |
|
80
|
|
|
|
|
|
|
## ##-- logging |
|
81
|
|
|
|
|
|
|
## logOpen => $level, ##-- log-level for open/close (default='info') |
|
82
|
|
|
|
|
|
|
## logCreate => $level, ##-- log-level for create messages (default='info') |
|
83
|
|
|
|
|
|
|
## logCorpusFile => $level, ##-- log-level for corpus file-parsing (default='info') |
|
84
|
|
|
|
|
|
|
## logCorpusFileN => $N, ##-- log corpus file-parsing only for every N files (0 for none; default:undef ~ $corpus->size()/100) |
|
85
|
|
|
|
|
|
|
## logExport => $level, ##-- log-level for export messages (default='info') |
|
86
|
|
|
|
|
|
|
## logProfile => $level, ##-- log-level for verbose profiling messages (default='trace') |
|
87
|
|
|
|
|
|
|
## logRequest => $level, ##-- log-level for request-level profiling messages (default='debug') |
|
88
|
|
|
|
|
|
|
## ## |
|
89
|
|
|
|
|
|
|
## ##-- runtime limits |
|
90
|
|
|
|
|
|
|
## maxExpand => $size, ##-- maximum number of elements in query expansions (default=65535) |
|
91
|
|
|
|
|
|
|
## ## |
|
92
|
|
|
|
|
|
|
## ##-- administrivia |
|
93
|
|
|
|
|
|
|
## version=>$version, ##-- DiaColloDB version of stored db (==$DiaColloDB::VERSION) |
|
94
|
|
|
|
|
|
|
## upgraded=>\@upgraded, ##-- optional administrative information about auto-magic upgrades |
|
95
|
|
|
|
|
|
|
## ## |
|
96
|
|
|
|
|
|
|
## ##-- attribute data |
|
97
|
|
|
|
|
|
|
## ${a}enum => $aenum, ##-- attribute enum: $aenum : ($dbdir/${a}_enum.*) : $astr<=>$ai : A*<=>N |
|
98
|
|
|
|
|
|
|
## ## e.g. lemmata: $lenum : ($dbdir/l_enum.* ) : $lstr<=>$li : A*<=>N |
|
99
|
|
|
|
|
|
|
## ${a}2x => $a2x, ##-- attribute multimap: $a2x : ($dbdir/${a}_2x.*) : $ai=>@xis : N=>N* |
|
100
|
|
|
|
|
|
|
## pack_x$a => $fmt ##-- pack format: extract attribute-id $ai from a packed tuple-string $xs ; $ai=unpack($coldb->{"pack_x$a"},$xs) |
|
101
|
|
|
|
|
|
|
## ## |
|
102
|
|
|
|
|
|
|
## ##-- tuple data (+dates) |
|
103
|
|
|
|
|
|
|
## xenum => $xenum, ##-- enum: tuples ($dbdir/xenum.*) : [@ais,$di]<=>$xi : N*n<=>N |
|
104
|
|
|
|
|
|
|
## pack_x => $fmt, ##-- symbol pack-format for $xenum : "${pack_id}[Nattrs]${pack_date}" |
|
105
|
|
|
|
|
|
|
## xdmin => $xdmin, ##-- minimum date |
|
106
|
|
|
|
|
|
|
## xdmax => $xdmax, ##-- maximum date |
|
107
|
|
|
|
|
|
|
## ## |
|
108
|
|
|
|
|
|
|
## ##-- relation data |
|
109
|
|
|
|
|
|
|
## xf => $xf, ##-- ug: $xi => $f($xi) : N=>N |
|
110
|
|
|
|
|
|
|
## cof => $cof, ##-- cf: [$xi1,$xi2] => $f12 |
|
111
|
|
|
|
|
|
|
## ddc => $ddc, ##-- ddc: ddc client relation |
|
112
|
|
|
|
|
|
|
## tdf => $tdf, ##-- tdf: (term x document) frequency matrix relation |
|
113
|
|
|
|
|
|
|
## ) |
|
114
|
|
|
|
|
|
|
##  + constructor: overlays caller-supplied %args on the built-in defaults below and blesses the result
##  + stores the blessed class name in {class} and derives {pack_w} and {pack_x} from {pack_id} and {pack_date}
##  + if a defined 'dbdir' ends up in the object, it is removed again and the object is handed to
##    open($dbdir); in that case the return value is whatever open() returns
sub new {
  my ($that,@args) = @_;

  ##-- built-in defaults; every key may be overridden by @args
  my %defaults = (
    ##-- options
    dbdir     => undef,
    flags     => 'r',
    attrs     => undef,
    pack_id   => 'N',
    pack_f    => 'N',
    pack_date => 'n',
    pack_off  => 'N',
    pack_len  => 'n',
    dmax      => 5,
    cfmin     => 2,
    tfmin     => 2,
    index_tdf => undef,
    index_cof => 1,
    dbreak    => undef,
    tdfopts   => {},

    ##-- source filters
    pgood => $DiaColloDB::PGOOD_DEFAULT,
    pbad  => $DiaColloDB::PBAD_DEFAULT,
    wgood => $DiaColloDB::WGOOD_DEFAULT,
    wbad  => $DiaColloDB::WBAD_DEFAULT,
    lgood => $DiaColloDB::LGOOD_DEFAULT,
    lbad  => $DiaColloDB::LBAD_DEFAULT,

    ##-- log levels
    logOpen        => 'info',
    logCreate      => 'info',
    logCorpusFile  => 'info',
    logCorpusFileN => undef,
    logExport      => 'info',
    logProfile     => 'trace',
    logRequest     => 'debug',

    ##-- runtime limits
    maxExpand => 65535,

    ##-- administrivia
    version => "v0.09.000",

    ##-- not initialized here (no default entry): bos, eos, keeptmp, upgraded, ${a}enum, ${a}2x,
    ##   pack_x${a}, xenum, xf, cof, ddc, tdf
  );

  my $class = ref($that) || $that;
  my $coldb = bless({ %defaults, @args }, $class);

  ##-- derived fields
  $coldb->{class}  = ref($coldb);
  $coldb->{pack_w} = $coldb->{pack_id};
  $coldb->{pack_x} = $coldb->{pack_w} . $coldb->{pack_date};

  ##-- no directory requested: return the un-opened object
  return $coldb if (!defined($coldb->{dbdir}));

  ##-- avoid initial close() if called with dbdir=>$dbdir argument
  my $dbdir = delete $coldb->{dbdir};
  return $coldb->open($dbdir);
}
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
## undef = $obj->DESTROY |
|
194
|
|
|
|
|
|
|
## + destructor calls close() if necessary |
|
195
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
## $cli_or_undef = $cli->promote($class,%opts) |
|
198
|
|
|
|
|
|
|
## + DiaColloDB::Client method override |
|
199
|
|
|
|
|
|
|
##  + not supported by this compatibility class: always reports "promote(): not supported"
##    through the invocant's logconfess() method; all other arguments are ignored
sub promote {
  my ($cli) = @_;
  return $cli->logconfess("promote(): not supported");
}
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
##============================================================================== |
|
204
|
|
|
|
|
|
|
## I/O: open/close |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open($dbdir,%opts) |
|
207
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open() |
|
208
|
|
|
|
|
|
|
##  + opens the v0.09.x database directory $dbdir (default: $coldb->{dbdir}) and attaches its components
##  + may be called as a class method, in which case a new object is created first
##  + %opts are copied into the object AFTER the header has been loaded, so they override header values;
##    $opts{flags} (if defined) additionally overrides $coldb->{flags} for this call
##  + steps: close if already opened, optional truncate/create of $dbdir, header, per-attribute enums
##    and expansion multimaps, tuple enum (+ date-range guess), unigrams, optional cofreqs, ddc, optional tdf
##  + returns $coldb on success; failures are reported via $coldb->logconfess()
##  + a failure to load the optional TDF module is only warned about and disables {index_tdf}
sub open {
  my ($coldb,$dbdir,%opts) = @_;
  DiaColloDB::Logger->ensureLog();
  $coldb = $coldb->new() if (!ref($coldb));
  #@$coldb{keys %opts} = values %opts; ##-- clobber options after loadHeader()
  $dbdir //= $coldb->{dbdir};
  $dbdir =~ s{/$}{};                    ##-- strip a single trailing slash
  $coldb->close() if ($coldb->opened);
  $coldb->{dbdir} = $dbdir;
  my $flags = fcflags($opts{flags} // $coldb->{flags});
  $coldb->vlog($coldb->{logOpen}, "open($dbdir)");

  ##-- open: truncate: remove any existing directory and make sure it gets re-created below
  if (fctrunc($flags)) {
    $flags |= O_CREAT;
    !-d $dbdir
      or remove_tree($dbdir)
      or $coldb->logconfess("open(): could not remove old $dbdir: $!");
  }

  ##-- open: create
  if (!-d $dbdir) {
    $coldb->logconfess("open(): no such directory '$dbdir'") if (!fccreat($flags));
    make_path($dbdir)
      or $coldb->logconfess("open(): could not create DB directory '$dbdir': $!");
  }

  ##-- open: header
  $coldb->loadHeader()
    or $coldb->logconfess("open(): failed to load header file ", $coldb->headerFile, ": $!");
  @$coldb{keys %opts} = values %opts; ##-- clobber header options with user-supplied values

  ##-- open: tdf: require
  $coldb->{index_tdf} = 0 if (!-r "$dbdir/tdf.hdr");
  if ($coldb->{index_tdf}) {
    ##-- require() throws on failure instead of returning false, so it must be trapped with eval{}
    ##   for the warning below (which reports $@) to be reachable
    if (!eval { require "DiaColloDB/Relation/TDF.pm"; 1 }) {
      $coldb->logwarn("open(): require failed for DiaColloDB/Relation/TDF.pm ; (term x document) matrix modelling disabled", ($@ ? "\n: $@" : ''));
      $coldb->{index_tdf} = 0;
    }
  }

  ##-- open: common options
  my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len});
  my %mmopts = (flags=>$flags, pack_i=>$coldb->{pack_id});

  ##-- open: attributes
  my $attrs = $coldb->{attrs} = $coldb->attrs(undef,['l']);

  ##-- open: by attribute
  my $axat = 0; ##-- running byte offset used to build the default "pack_x$attr" templates
  foreach my $attr (@$attrs) {
    ##-- open: ${attr}*
    my $abase = (-r "$dbdir/${attr}_enum.hdr" ? "$dbdir/${attr}_" : "$dbdir/${attr}"); ##-- v0.03-compatibility hack
    $coldb->{"${attr}enum"} = $ECLASS->new(base=>"${abase}enum", %efopts)
      or $coldb->logconfess("open(): failed to open enum ${abase}enum.*: $!");
    $coldb->{"${attr}2x"} = $MMCLASS->new(base=>"${abase}2x", %mmopts)
      or $coldb->logconfess("open(): failed to open expansion multimap ${abase}2x.*: $!");
    $coldb->{"pack_x$attr"} //= "\@${axat}$coldb->{pack_id}";
    $axat += packsize($coldb->{pack_id});
  }

  ##-- open: xenum
  $coldb->{xenum} = $XECLASS->new(base=>"$dbdir/xenum", %efopts, pack_s=>$coldb->{pack_x})
    or $coldb->logconfess("open(): failed to open tuple-enum $dbdir/xenum.*: $!");
  if (!defined($coldb->{xdmin}) || !defined($coldb->{xdmax})) {
    ##-- hack: guess date-range if not specified
    $coldb->vlog('warn', "Warning: extracting date-range from xenum: you should update $coldb->{dbdir}/header.json");
    ##-- template which skips the packed attribute-ids and unpacks only the date component
    my $pack_xdate = '@'.(packsize($coldb->{pack_id}) * scalar(@{$coldb->attrs})).$coldb->{pack_date};
    my ($dmin,$dmax,$d) = ('inf','-inf');
    foreach (@{$coldb->{xenum}->toArray}) {
      next if (!$_);
      next if (!defined($d = unpack($pack_xdate,$_))); ##-- strangeness: getting only 9-bytes in $_ for 10-byte values in file and toArray(): why?!
      $dmin = $d if ($d < $dmin);
      $dmax = $d if ($d > $dmax);
    }
    $coldb->vlog('warn', "extracted date-range \"xdmin\":$dmin, \"xdmax\":$dmax");
    @$coldb{qw(xdmin xdmax)} = ($dmin,$dmax);
  }

  ##-- open: xf
  $coldb->{xf} = DiaColloDB::Compat::v0_09::Relation::Unigrams->new(file=>"$dbdir/xf.dba", flags=>$flags, packas=>$coldb->{pack_f}, logCompat=>'off')
    or $coldb->logconfess("open(): failed to open tuple-unigrams $dbdir/xf.dba: $!");
  $coldb->{xf}{N} = $coldb->{xN} if ($coldb->{xN} && !$coldb->{xf}{N}); ##-- compat

  ##-- open: cof
  if ($coldb->{index_cof}//1) {
    $coldb->{cof} = DiaColloDB::Compat::v0_09::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags,
                                                                      pack_i=>$coldb->{pack_id}, pack_f=>$coldb->{pack_f},
                                                                      dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin},
                                                                      logCompat=>'off',
                                                                     )
      or $coldb->logconfess("open(): failed to open co-frequency file $dbdir/cof.*: $!");
  }

  ##-- open: ddc (undef if ddcServer isn't set in ddc.hdr or $coldb)
  $coldb->{ddc} = DiaColloDB::Relation::DDC->new(-r "$dbdir/ddc.hdr" ? (base=>"$dbdir/ddc") : qw())->fromDB($coldb)
    // 'DiaColloDB::Relation::DDC';

  ##-- open: tdf (if available)
  if ($coldb->{index_tdf}) {
    $coldb->{tdfopts} //= {};
    $coldb->{tdfopts}{$_} //= $DiaColloDB::TDF_OPTS{$_} foreach (keys %DiaColloDB::TDF_OPTS); ##-- tdf: default options
    $coldb->{tdf} = DiaColloDB::Relation::TDF->new((-r "$dbdir/tdf.hdr" ? (base=>"$dbdir/tdf") : qw()),
                                                   dbreak => $coldb->{dbreak},
                                                   %{$coldb->{tdfopts}},
                                                  );
  }

  ##-- all done
  return $coldb;
}
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
## @dbkeys = $coldb->dbkeys() |
|
322
|
|
|
|
|
|
|
##  + returns the object keys under which DB components are stored
##  + for an object invocant: "${a}enum" and "${a}2x" for each attribute ${a} in $coldb->attrs(),
##    followed by the fixed keys (xenum xf cof tdf)
##  + for a class-name invocant: only the fixed keys
sub dbkeys {
  my ($that) = @_;
  my @akeys;
  if (ref($that)) {
    push(@akeys, "${_}enum", "${_}2x") foreach (@{$that->attrs});
  }
  return (@akeys, qw(xenum xf cof tdf));
}
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->close() |
|
330
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
## $bool = $coldb->opened() |
|
333
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
## @files = $obj->diskFiles() |
|
336
|
|
|
|
|
|
|
## + returns list of disk files for this db |
|
337
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
##============================================================================== |
|
340
|
|
|
|
|
|
|
## Create/compile |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
343
|
|
|
|
|
|
|
## create: utils |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
## $multimap = $coldb->create_xmap($base, \%xs2i, $packfmt, $label="multimap") |
|
346
|
1
|
|
|
1
|
|
14
|
##-- installs create_xmap() at compile time as whatever DiaColloDB::Compat->nocompat('create_xmap') returns
##   NOTE(review): presumably a stub reporting that the method is unavailable in compatibility mode -- confirm in DiaColloDB::Compat
BEGIN { *create_xmap = DiaColloDB::Compat->nocompat('create_xmap'); } |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs() |
|
349
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs($attrs=$coldb->{attrs}, $default=[]) |
|
350
|
|
|
|
|
|
|
## + parse attributes in $attrs as array |
|
351
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
## $aname = $CLASS_OR_OBJECT->attrName($attr) |
|
354
|
|
|
|
|
|
|
## + returns canonical (short) attribute name for $attr |
|
355
|
|
|
|
|
|
|
## + supports aliases in %ATTR_ALIAS = ($alias=>$name, ...) |
|
356
|
|
|
|
|
|
|
## + see also: |
|
357
|
|
|
|
|
|
|
## %ATTR_RALIAS = ($name=>\@aliases, ...) |
|
358
|
|
|
|
|
|
|
## %ATTR_CBEXPR = ($name=>$ddcCountByExpr, ...) |
|
359
|
|
|
|
|
|
|
## %ATTR_TITLE = ($name_or_alias=>$title, ...) |
|
360
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
361
|
|
|
|
|
|
|
our %ATTR_ALIAS = %DiaColloDB::ATTR_ALIAS; |
|
362
|
|
|
|
|
|
|
our %ATTR_RALIAS = %DiaColloDB::ATTR_RALIAS; |
|
363
|
|
|
|
|
|
|
our %ATTR_TITLE = %DiaColloDB::ATTR_TITLE; |
|
364
|
|
|
|
|
|
|
our %ATTR_CBEXPR = %DiaColloDB::ATTR_CBEXPR; |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
## $atitle = $CLASS_OR_OBJECT->attrTitle($attr_or_alias) |
|
367
|
|
|
|
|
|
|
## + returns an attribute title for $attr_or_alias |
|
368
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
## $acbexpr = $CLASS_OR_OBJECT->attrCountBy($attr_or_alias,$matchid=0) |
|
371
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
## $aquery_or_filter_or_undef = $CLASS_OR_OBJECT->attrQuery($attr_or_alias,$cquery) |
|
374
|
|
|
|
|
|
|
## + returns a CQuery or CQFilter object for condition $cquery on $attr_or_alias |
|
375
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData() |
|
378
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData(\@attrs=$coldb->attrs) |
|
379
|
|
|
|
|
|
|
## + get attribute data for \@attrs |
|
380
|
|
|
|
|
|
|
## + return @attrdata = ({a=>$attr, i=>$i, enum=>$aenum, pack_x=>$pack_xa, a2x=>$a2x, ...}) |
|
381
|
|
|
|
|
|
|
##  + collects per-attribute data for each entry of \@attrs (default: $coldb->attrs)
##  + each entry is first mapped to its canonical name via $coldb->attrName()
##  + returns an ARRAY-ref of HASH-refs in input order, each of the form
##    {i=>$position, a=>$name, enum=>$coldb->{"${name}enum"}, pack_x=>$coldb->{"pack_x$name"}, a2x=>$coldb->{"${name}2x"}}
sub attrData {
  my ($coldb,$attrs) = @_;
  $attrs = $coldb->attrs if (!defined($attrs));

  my @adata;
  foreach my $i (0..$#$attrs) {
    my $aname = $coldb->attrName($attrs->[$i]);
    push(@adata, {
                  i      => $i,
                  a      => $aname,
                  enum   => $coldb->{"${aname}enum"},
                  pack_x => $coldb->{"pack_x$aname"},
                  a2x    => $coldb->{"${aname}2x"},
                 });
  }
  return \@adata;
}
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
## $bool = $coldb->hasAttr($attr) |
|
392
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
396
|
|
|
|
|
|
|
## create: from corpus |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
## $bool = $coldb->create($corpus,%opts) |
|
399
|
|
|
|
|
|
|
## + %opts: |
|
400
|
|
|
|
|
|
|
## $key => $val, ##-- clobbers $coldb->{$key} |
|
401
|
|
|
|
|
|
|
## + DISABLED |
|
402
|
1
|
|
|
1
|
|
9
|
##-- installs create() at compile time as whatever DiaColloDB::Compat->nocompat('create') returns
##   NOTE(review): presumably a stub reporting that the method is unavailable in compatibility mode -- confirm in DiaColloDB::Compat
BEGIN { *create = DiaColloDB::Compat->nocompat('create'); } |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
405
|
|
|
|
|
|
|
## create: union (aka merge) |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
## $coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts) |
|
408
|
|
|
|
|
|
|
## + populates $coldb as union over @coldbs_or_dbdirs |
|
409
|
|
|
|
|
|
|
## + clobbers argument dbs {_union_${a}i2u}, {_union_xi2u}, {_union_argi} |
|
410
|
|
|
|
|
|
|
## + DISABLED |
|
411
|
1
|
|
|
1
|
|
8
|
##-- installs both merge() and union() at compile time as the single value returned by DiaColloDB::Compat->nocompat('union')
##   NOTE(review): presumably a stub reporting that the method is unavailable in compatibility mode -- confirm in DiaColloDB::Compat
BEGIN { *merge = *union = DiaColloDB::Compat->nocompat('union'); } |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
414
|
|
|
|
|
|
|
## I/O: header |
|
415
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
## @keys = $coldb->headerKeys() |
|
418
|
|
|
|
|
|
|
## + keys to save as header |
|
419
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
## $bool = $coldb->loadHeaderData() |
|
422
|
|
|
|
|
|
|
## $bool = $coldb->loadHeaderData($data) |
|
423
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
## $bool = $coldb->saveHeader() |
|
426
|
|
|
|
|
|
|
## $bool = $coldb->saveHeader($headerFile) |
|
427
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
##============================================================================== |
|
430
|
|
|
|
|
|
|
## Export/Import |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
## $bool = $coldb->dbexport() |
|
433
|
|
|
|
|
|
|
## $bool = $coldb->dbexport($outdir,%opts) |
|
434
|
|
|
|
|
|
|
## + $outdir defaults to "$coldb->{dbdir}/export" |
|
435
|
|
|
|
|
|
|
## + %opts: |
|
436
|
|
|
|
|
|
|
## export_sdat => $bool, ##-- whether to export *.sdat (stringified tuple files for debugging; default=0) |
|
437
|
|
|
|
|
|
|
## export_cof => $bool, ##-- do/don't export cof.* (default=do) |
|
438
|
|
|
|
|
|
|
## export_tdf => $bool, ##-- do/don't export tdf.* (default=do) |
|
439
|
|
|
|
|
|
|
sub dbexport { |
|
440
|
0
|
|
|
0
|
0
|
|
my ($coldb,$outdir,%opts) = @_; |
|
441
|
0
|
0
|
|
|
|
|
$coldb->logconfess("cannot dbexport() an un-opened DB") if (!$coldb->opened); |
|
442
|
0
|
|
0
|
|
|
|
$outdir //= "$coldb->{dbdir}/export"; |
|
443
|
0
|
|
|
|
|
|
$outdir =~ s{/$}{}; |
|
444
|
0
|
|
|
|
|
|
$coldb->vlog('info', "export($outdir/)"); |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
##-- options |
|
447
|
0
|
0
|
|
|
|
|
my $export_sdat = exists($opts{export_sdat}) ? $opts{export_sdat} : 0; |
|
448
|
0
|
0
|
|
|
|
|
my $export_cof = exists($opts{export_cof}) ? $opts{export_cof} : 1; |
|
449
|
0
|
0
|
|
|
|
|
my $export_tdf = exists($opts{export_tdf}) ? $opts{export_tdf} : 1; |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
##-- create export directory |
|
452
|
0
|
0
|
0
|
|
|
|
-d $outdir |
|
453
|
|
|
|
|
|
|
or make_path($outdir) |
|
454
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport(): could not create export directory $outdir: $!"); |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
##-- dump: header |
|
457
|
0
|
0
|
|
|
|
|
$coldb->saveHeader("$outdir/header.json") |
|
458
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport(): could not export header to $outdir/header.json: $!"); |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
##-- dump: load enums |
|
461
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData(); |
|
462
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): loading enums to memory"); |
|
463
|
0
|
0
|
0
|
|
|
|
$coldb->{xenum}->load() if ($coldb->{xenum} && !$coldb->{xenum}->loaded); |
|
464
|
0
|
|
|
|
|
|
foreach (@$adata) { |
|
465
|
0
|
0
|
0
|
|
|
|
$_->{enum}->load() if ($_->{enum} && !$_->{enum}->loaded); |
|
466
|
|
|
|
|
|
|
} |
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
##-- dump: common: stringification |
|
469
|
0
|
|
|
|
|
|
my $pack_x = $coldb->{pack_x}; |
|
470
|
0
|
|
|
|
|
|
my ($xs2txt,$xi2txt); |
|
471
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
|
472
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): preparing tuple-stringification structures"); |
|
473
|
|
|
|
|
|
|
|
|
474
|
0
|
|
|
|
|
|
foreach (@$adata) { |
|
475
|
0
|
|
|
|
|
|
my $i2s = $_->{i2s} = $_->{enum}->toArray; |
|
476
|
0
|
|
0
|
0
|
|
|
$_->{i2txt} = sub { return $i2s->[$_[0]//0]//''; }; |
|
|
0
|
|
0
|
|
|
|
|
|
477
|
|
|
|
|
|
|
} |
|
478
|
|
|
|
|
|
|
|
|
479
|
0
|
|
|
|
|
|
my $xi2s = $coldb->{xenum}->toArray; |
|
480
|
0
|
|
|
|
|
|
my @ai2s = map {$_->{i2s}} @$adata; |
|
|
0
|
|
|
|
|
|
|
|
481
|
0
|
|
|
|
|
|
my (@x); |
|
482
|
|
|
|
|
|
|
$xs2txt = sub { |
|
483
|
0
|
|
|
0
|
|
|
@x = unpack($pack_x,$_[0]); |
|
484
|
0
|
|
0
|
|
|
|
return join("\t", (map {$ai2s[$_][$x[$_]//0]//''} (0..$#ai2s)), $x[$#x]//0); |
|
|
0
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
485
|
0
|
|
|
|
|
|
}; |
|
486
|
|
|
|
|
|
|
$xi2txt = sub { |
|
487
|
0
|
|
0
|
0
|
|
|
@x = unpack($pack_x, $xi2s->[$_[0]//0]//''); |
|
|
|
|
0
|
|
|
|
|
|
488
|
0
|
|
0
|
|
|
|
return join("\t", (map {$ai2s[$_][$x[$_]//0]//''} (0..$#ai2s)), $x[$#x]//0); |
|
|
0
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
489
|
0
|
|
|
|
|
|
}; |
|
490
|
|
|
|
|
|
|
} |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
##-- dump: xenum: raw |
|
493
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting raw tuple-enum file $outdir/xenum.dat"); |
|
494
|
0
|
0
|
|
|
|
|
$coldb->{xenum}->saveTextFile("$outdir/xenum.dat", pack_s=>$pack_x) |
|
495
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/xenum.dat"); |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
##-- dump: xenum: stringified |
|
498
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
|
499
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting stringified tuple-enum file $outdir/xenum.sdat"); |
|
500
|
0
|
0
|
|
|
|
|
$coldb->{xenum}->saveTextFile("$outdir/xenum.sdat", pack_s=>$xs2txt) |
|
501
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/xenum.sdat"); |
|
502
|
|
|
|
|
|
|
} |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
##-- dump: by attribute: enum |
|
505
|
0
|
|
|
|
|
|
foreach (@$adata) { |
|
506
|
|
|
|
|
|
|
##-- dump: by attribute: enum |
|
507
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting attribute enum file $outdir/$_->{a}_enum.dat"); |
|
508
|
0
|
0
|
|
|
|
|
$_->{enum}->saveTextFile("$outdir/$_->{a}_enum.dat") |
|
509
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/$_->{a}_enum.dat"); |
|
510
|
|
|
|
|
|
|
} |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
##-- dump: by attribute: a2x |
|
513
|
0
|
|
|
|
|
|
foreach (@$adata) { |
|
514
|
|
|
|
|
|
|
##-- dump: by attribute: a2x: raw |
|
515
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting attribute expansion multimap $outdir/$_->{a}_2x.dat (raw)"); |
|
516
|
0
|
0
|
|
|
|
|
$_->{a2x}->saveTextFile("$outdir/$_->{a}_2x.dat") |
|
517
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/$_->{a}_2x.dat"); |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
##-- dump: by attribute: a2x: stringified |
|
520
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
|
521
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting attribute expansion multimap $outdir/$_->{a}_2x.sdat (strings)"); |
|
522
|
0
|
0
|
|
|
|
|
$_->{a2x}->saveTextFile("$outdir/$_->{a}_2x.sdat", a2s=>$_->{i2txt}, b2s=>$xi2txt) |
|
523
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/$_->{a}_2x.sdat"); |
|
524
|
|
|
|
|
|
|
} |
|
525
|
|
|
|
|
|
|
} |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
##-- dump: xf |
|
528
|
0
|
0
|
|
|
|
|
if ($coldb->{xf}) { |
|
529
|
|
|
|
|
|
|
##-- dump: xf: raw |
|
530
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting tuple-frequency index $outdir/xf.dat"); |
|
531
|
0
|
|
|
|
|
|
$coldb->{xf}->setFilters($coldb->{pack_f}); |
|
532
|
0
|
0
|
|
|
|
|
$coldb->{xf}->saveTextFile("$outdir/xf.dat", keys=>1) |
|
533
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/xf.dat"); |
|
534
|
0
|
|
|
|
|
|
$coldb->{xf}->setFilters(); |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
##-- dump: xf: stringified |
|
537
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
|
538
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting stringified tuple-frequency index $outdir/xf.sdat"); |
|
539
|
0
|
0
|
|
|
|
|
$coldb->{xf}->saveTextFile("$outdir/xf.sdat", key2s=>$xi2txt) |
|
540
|
|
|
|
|
|
|
or $coldb->logconfess("dbexport() failed for $outdir/xf.sdat"); |
|
541
|
|
|
|
|
|
|
} |
|
542
|
|
|
|
|
|
|
} |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
##-- dump: cof |
|
545
|
0
|
0
|
0
|
|
|
|
if ($coldb->{cof} && $export_cof) { |
|
546
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting raw co-frequency index $outdir/cof.dat"); |
|
547
|
0
|
0
|
|
|
|
|
$coldb->{cof}->saveTextFile("$outdir/cof.dat") |
|
548
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/cof.dat"); |
|
549
|
|
|
|
|
|
|
|
|
550
|
0
|
0
|
|
|
|
|
if ($export_sdat) { |
|
551
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting stringified co-frequency index $outdir/cof.sdat"); |
|
552
|
0
|
0
|
|
|
|
|
$coldb->{cof}->saveTextFile("$outdir/cof.sdat", i2s=>$xi2txt) |
|
553
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/cof.sdat"); |
|
554
|
|
|
|
|
|
|
} |
|
555
|
|
|
|
|
|
|
} |
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
##-- dump: tdf |
|
558
|
0
|
0
|
0
|
|
|
|
if ($coldb->{tdf} && $coldb->{index_tdf} && $export_tdf) { |
|
|
|
|
0
|
|
|
|
|
|
559
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): exporting term-document index $outdir/tdf.*"); |
|
560
|
0
|
0
|
|
|
|
|
$coldb->{tdf}->export("$outdir/tdf", $coldb) |
|
561
|
|
|
|
|
|
|
or $coldb->logconfess("export failed for $outdir/tdf.*"); |
|
562
|
|
|
|
|
|
|
} |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
##-- all done |
|
565
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logExport}, "dbexport(): export to $outdir complete."); |
|
566
|
0
|
|
|
|
|
|
return $coldb; |
|
567
|
|
|
|
|
|
|
} |
|
568
|
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
## $coldb = $coldb->dbimport() |
|
570
|
|
|
|
|
|
|
## $coldb = $coldb->dbimport($txtdir,%opts) |
|
571
|
|
|
|
|
|
|
## + import ColocDB data from $txtdir |
|
572
|
|
|
|
|
|
|
## + TODO |
|
573
|
|
|
|
|
|
|
sub dbimport {
  ##-- $txtdir and %opts are accepted for interface compatibility only:
  ##   they are never read, since the import itself is unimplemented
  my ($coldb,$txtdir,%opts) = @_;

  ##-- class-method call: instantiate an object so that logconfess() has an invocant
  unless (ref($coldb)) {
    $coldb = $coldb->new();
  }

  ##-- always an error: there is no import code for this compatibility class
  $coldb->logconfess("dbimport(): not yet implemented");
}
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
##============================================================================== |
|
580
|
|
|
|
|
|
|
## Info |
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
## \%info = $coldb->dbinfo() |
|
583
|
|
|
|
|
|
|
## + get db info |
|
584
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
585
|
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
##============================================================================== |
|
588
|
|
|
|
|
|
|
## Profiling |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
591
|
|
|
|
|
|
|
## Profiling: Wrappers |
|
592
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
## $mprf = $coldb->query($rel,%opts) |
|
595
|
|
|
|
|
|
|
## + get a generic DiaColloDB::Profile::Multi object for $rel |
|
596
|
|
|
|
|
|
|
## + calls $coldb->profile() or $coldb->compare() as appropriate |
|
597
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
## $mprf = $coldb->profile1(%opts) |
|
600
|
|
|
|
|
|
|
## + get unigram frequency profile for selected items as a DiaColloDB::Profile::Multi object |
|
601
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('xf', %opts) |
|
602
|
|
|
|
|
|
|
## + %opts: see profile() method |
|
603
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
## $mprf = $coldb->profile2(%opts) |
|
606
|
|
|
|
|
|
|
## + get co-frequency profile for selected items as a DiaColloDB::Profile::Multi object |
|
607
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('cof', %opts) |
|
608
|
|
|
|
|
|
|
## + %opts: see profile() method |
|
609
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
610
|
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
## $mprf = $coldb->compare1(%opts) |
|
612
|
|
|
|
|
|
|
## + get unigram comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
|
613
|
|
|
|
|
|
|
## + really just wraps $coldb->compare('xf', %opts) |
|
614
|
|
|
|
|
|
|
## + %opts: see compare() method |
|
615
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
## $mprf = $coldb->compare2(%opts) |
|
618
|
|
|
|
|
|
|
## + get co-frequency comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
|
619
|
|
|
|
|
|
|
## + really just wraps $coldb->compare('cof', %opts) |
|
620
|
|
|
|
|
|
|
## + %opts: see compare() method |
|
621
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
625
|
|
|
|
|
|
|
## Profiling: Utils |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
628
|
|
|
|
|
|
|
## $relname = $coldb->relname($rel) |
|
629
|
|
|
|
|
|
|
## + returns an appropriate relation name for profile() and friends |
|
630
|
|
|
|
|
|
|
## + returns $rel if $coldb->{$rel} supports a profile() method |
|
631
|
|
|
|
|
|
|
## + otherwise heuristically parses $relationName /xf|f?1|ug/ or /f1?2|c/ |
|
632
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
635
|
|
|
|
|
|
|
## $obj_or_undef = $coldb->relation($rel) |
|
636
|
|
|
|
|
|
|
## + returns an appropriate relation-like object for profile() and friends |
|
637
|
|
|
|
|
|
|
## + wraps $coldb->{$coldb->relname($rel)} |
|
638
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
641
|
|
|
|
|
|
|
## @relnames = $coldb->relations() |
|
642
|
|
|
|
|
|
|
## + gets list of defined relations |
|
643
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
646
|
|
|
|
|
|
|
## \@ids = $coldb->enumIds($enum,$req,%opts) |
|
647
|
|
|
|
|
|
|
## + parses enum IDs for $req, which is one of: |
|
648
|
|
|
|
|
|
|
## - a DDC::Any::CQTokExact, ::CQTokInfl, ::CQTokSet, ::CQTokSetInfl, or ::CQTokRegex : interpreted |
|
649
|
|
|
|
|
|
|
## - an ARRAY-ref : list of literal symbol-values |
|
650
|
|
|
|
|
|
|
## - a Regexp ref : regexp for target strings, passed to $enum->re2i() |
|
651
|
|
|
|
|
|
|
## - a string /REGEX/ : regexp for target strings, passed to $enum->re2i() |
|
652
|
|
|
|
|
|
|
## - another string : space-, comma-, or |-separated list of literal values |
|
653
|
|
|
|
|
|
|
## + %opts: |
|
654
|
|
|
|
|
|
|
## logLevel => $logLevel, ##-- logging level (default=undef) |
|
655
|
|
|
|
|
|
|
## logPrefix => $prefix, ##-- logging prefix (default="enumIds(): fetch ids") |
|
656
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
659
|
|
|
|
|
|
|
## ($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi) = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0) |
|
660
|
|
|
|
|
|
|
## + parses date request and returns limit and filter information as a list (list context) or HASH-ref (scalar context); |
|
661
|
|
|
|
|
|
|
## + %dateRequest = |
|
662
|
|
|
|
|
|
|
## ( |
|
663
|
|
|
|
|
|
|
## dfilter => $dfilter, ##-- filter-sub, called as: $wanted=$dfilter->($date); undef for none |
|
664
|
|
|
|
|
|
|
## slo => $sliceLo, ##-- minimum slice (inclusive) |
|
665
|
|
|
|
|
|
|
## shi => $sliceHi, ##-- maximum slice (inclusive) |
|
666
|
|
|
|
|
|
|
## dlo => $dateLo, ##-- minimum date (inclusive); undef for none, always defined if $fill is true |
|
667
|
|
|
|
|
|
|
## dhi => $dateHi, ##-- maximum date (inclusive); undef for none, always defined if $fill is true |
|
668
|
|
|
|
|
|
|
## ) |
|
669
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
672
|
|
|
|
|
|
|
## \%slice2xids = $coldb->xidsByDate(\@xids, $dateRequest, $sliceRequest, $fill) |
|
673
|
|
|
|
|
|
|
## + parse and filter \@xids by $dateRequest, $sliceRequest |
|
674
|
|
|
|
|
|
|
## + returns a HASH-ref from slice-ids to \@xids in that date-slice |
|
675
|
|
|
|
|
|
|
## + if $fill is true, returned HASH-ref has a key for each date-slice in range |
|
676
|
|
|
|
|
|
|
## + OBSOLETE in DiaColloDB |
|
677
|
|
|
|
|
|
|
sub xidsByDate {
  my ($coldb,$xids,$date,$slice,$fill) = @_;

  ##-- parse date request; only the filter-sub and the slice bounds are used here
  my ($dfilter,$slo,$shi) = $coldb->parseDateRequest($date,$slice,$fill);

  ##-- unpack template extracting a single date value from a packed tuple string:
  ##   '@N' moves to absolute byte offset N = (size of one packed id) * (number of attributes)
  ##   NOTE(review): presumably the date field is stored directly after the attribute ids -- confirm
  my $xenum   = $coldb->{xenum};
  my $nattrs  = scalar(@{$coldb->{attrs}});
  my $pack_xd = '@'.(packsize($coldb->{pack_id}) * $nattrs).$coldb->{pack_date};

  ##-- filter xids and bin them by slice
  my $d2xis = {}; ##-- ($sliceKey => \@xis_in_slice, ...)
  foreach my $xi (@$xids) {
    my $d = unpack($pack_xd, $xenum->i2s($xi));

    ##-- skip dates rejected by the user filter or outside of the db's own date range
    next if ($dfilter && !$dfilter->($d));
    next if ($d < $coldb->{xdmin} || $d > $coldb->{xdmax});

    ##-- slice key: lower bound of the slice containing $d, or 0 for a global (unsliced) profile
    my $key = $slice ? int($d/$slice)*$slice : 0;
    push(@{$d2xis->{$key}}, $xi);
  }

  ##-- force-fill: ensure an (empty) entry for every slice in the requested range
  if ($fill && $slice) {
    my $s = $slo;
    while ($s <= $shi) {
      $d2xis->{$s} //= [];
      $s += $slice;
    }
  }

  return $d2xis;
}
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
707
|
|
|
|
|
|
|
## $compiler = $coldb->qcompiler(); |
|
708
|
|
|
|
|
|
|
## + get DDC::Any::CQueryCompiler for this object (cached in $coldb->{_qcompiler}) |
|
709
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
712
|
|
|
|
|
|
|
## $cquery_or_undef = $coldb->qparse($ddc_query_string) |
|
713
|
|
|
|
|
|
|
## + wraps parse in an eval {...} block and sets $coldb->{error} on failure |
|
714
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
715
|
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
718
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs |
|
719
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery(["$attr1:$val1",...], %opts) ##-- compat: ARRAY-of-requests |
|
720
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery({$attr1=>$val1, ...}, %opts) ##-- compat: HASH |
|
721
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery("$attr1=$val1, ...", %opts) ##-- compat: string |
|
722
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery($ddcQueryString, %opts) ##-- ddc string (with shorthand ","->WITH, "&&"->WITH) |
|
723
|
|
|
|
|
|
|
## + guts for parsing user target and groupby requests |
|
724
|
|
|
|
|
|
|
## + returns a DDC::Any::CQuery object representing the request |
|
725
|
|
|
|
|
|
|
## + index-only items "$l" are mapped to $l=@{} |
|
726
|
|
|
|
|
|
|
## + %opts: |
|
727
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
|
728
|
|
|
|
|
|
|
## logas => $reqtype, ##-- request type for warnings |
|
729
|
|
|
|
|
|
|
## default => $attr, ##-- default attribute (for query requests) |
|
730
|
|
|
|
|
|
|
## mapand => $bool, ##-- map CQAnd to CQWith? (default=true unless '&&' occurs in query string) |
|
731
|
|
|
|
|
|
|
## ddcmode => $bool, ##-- force ddc query mode? (default=false) |
|
732
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
733
|
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
735
|
|
|
|
|
|
|
## \@aqs = $coldb->queryAttributes($cquery,%opts) |
|
736
|
|
|
|
|
|
|
## + utility for decomposing DDC queries into attribute-wise requests |
|
737
|
|
|
|
|
|
|
## + returns an ARRAY-ref [[$attr1,$val1], ...] |
|
738
|
|
|
|
|
|
|
## + each value $vali is empty or undef (all values), a CQTokSet, a CQTokExact, CQTokRegex, or CQTokAny |
|
739
|
|
|
|
|
|
|
## + chokes on unsupported query types or filters |
|
740
|
|
|
|
|
|
|
## + %opts: |
|
741
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
|
742
|
|
|
|
|
|
|
## logas => $reqtype, ##-- request type for warnings |
|
743
|
|
|
|
|
|
|
## default => $attr, ##-- default attribute (for query requests) |
|
744
|
|
|
|
|
|
|
## allowExtra => \@attrs, ##-- allow extra attributes @attrs (may also be HASH-ref) |
|
745
|
|
|
|
|
|
|
## allowUnknown => $bool, ##-- allow unknown attributes? (default: 0) |
|
746
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
749
|
|
|
|
|
|
|
## \@aqs = $coldb->parseRequest($request, %opts) |
|
750
|
|
|
|
|
|
|
## + guts for parsing user target and groupby requests into attribute-wise ARRAY-ref [[$attr1,$val1], ...] |
|
751
|
|
|
|
|
|
|
## + see parseQuery() method for supported $request formats and %opts |
|
752
|
|
|
|
|
|
|
## + wraps $coldb->queryAttributes($coldb->parseQuery($request,%opts)) |
|
753
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
754
|
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
756
|
|
|
|
|
|
|
## \%groupby = $coldb->groupby($groupby_request, %opts) |
|
757
|
|
|
|
|
|
|
## \%groupby = $coldb->groupby(\%groupby, %opts) |
|
758
|
|
|
|
|
|
|
## + $groupby_request : see parseRequest() |
|
759
|
|
|
|
|
|
|
## + returns a HASH-ref: |
|
760
|
|
|
|
|
|
|
## ( |
|
761
|
|
|
|
|
|
|
## req => $request, ##-- save request |
|
762
|
|
|
|
|
|
|
## #x2g => \&x2g, ##-- group-tuple extraction code suitable for e.g. DiaColloDB::Relation::Cofreqs::profile(groupby=>\&x2g) ##--OLD |
|
763
|
|
|
|
|
|
|
## xi2g => \&xi2g, ##-- group-tuple extraction code ($xi => $gtuple) suitable for e.g. DiaColloDB::Relation::Cofreqs::profile(groupby=>\&x2g) ##--OLD |
|
764
|
|
|
|
|
|
|
## xs2g => \&xs2g, ##-- group-tuple extraction code ($xs => $gtuple) |
|
765
|
|
|
|
|
|
|
## g2s => \&g2s, ##-- stringification object suitable for DiaColloDB::Profile::stringify() [CODE,enum, or undef] |
|
766
|
|
|
|
|
|
|
## g2txt => \&g2txt, ##-- compatible join()-string stringification sub |
|
767
|
|
|
|
|
|
|
## xpack => \@xpack, ##-- group-attribute-wise pack-templates, given @xtuple |
|
768
|
|
|
|
|
|
|
## gpack => \@gpack, ##-- group-attribute-wise pack-templates, given @gtuple |
|
769
|
|
|
|
|
|
|
## areqs => \@areqs, ##-- parsed attribute requests ([$attr,$ahaving],...) |
|
770
|
|
|
|
|
|
|
## attrs => \@attrs, ##-- like $coldb->attrs($groupby_request), modulo "having" parts |
|
771
|
|
|
|
|
|
|
## titles => \@titles, ##-- like map {$coldb->attrTitle($_)} @attrs |
|
772
|
|
|
|
|
|
|
## ) |
|
773
|
|
|
|
|
|
|
## + %opts: |
|
774
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
|
775
|
|
|
|
|
|
|
## relax => $bool, ##-- allow unsupported attributes (default=0) |
|
776
|
|
|
|
|
|
|
## xenum => $xenum, ##-- enum to use for \&x2g and \&g2s (default: $coldb->{xenum}) |
|
777
|
|
|
|
|
|
|
## + OVERRIDES DiaColloDB |
|
778
|
|
|
|
|
|
|
sub groupby {
  my ($coldb,$gbreq,%opts) = @_;

  ##-- a HASH-ref argument is returned as-is
  if (UNIVERSAL::isa($gbreq,'HASH')) {
    return $gbreq;
  }

  ##-- parse request into attribute-wise pairs ([$attr,$having],...); collect attribute names and titles
  my $gbareqs = $coldb->parseRequest($gbreq, %opts, logas=>'groupby');
  my $gbattrs = [map {$_->[0]} @$gbareqs];
  my $gb = {
	    req    => $gbreq,
	    areqs  => $gbareqs,
	    attrs  => $gbattrs,
	    titles => [map {$coldb->attrTitle($_)} @$gbattrs],
	   };

  ##-- pack templates
  ##   + NOTE: $xenum is not used directly below, but the generated xi2g code refers to it BY NAME
  my $xenum    = $opts{xenum} // $coldb->{xenum};
  my $pack_id  = $coldb->{pack_id};
  my $pack_ids = "($pack_id)*";
  my $len_id   = packsize($pack_id);
  $gb->{xpack} = [map {$coldb->{"pack_x$_"}} @$gbattrs];
  $gb->{gpack} = [map {'@'.($_*$len_id).$pack_id} (0..$#$gbattrs)];
  my $gbxpack  = join('', @{$gb->{xpack}});

  ##-- having-filters: one entry per groupby attribute:
  ##   + undef if the attribute has no restriction (no value, or a CQTokAny)
  ##   + otherwise a HASH-ref ($id=>undef, ...) of the ids returned by enumIds()
  my @gbids;
  foreach my $areq (@$gbareqs) {
    my ($attr,$having) = @$areq;
    if ($having && !UNIVERSAL::isa($having,'DDC::Any::CQTokAny')) {
      my $ids = $coldb->enumIds($coldb->{"${attr}enum"}, $having, logLevel=>$coldb->{logProfile}, logPrefix=>"groupby(): fetch filter ids: $attr");
      my %idset = map {($_=>undef)} @$ids;
      push(@gbids, \%idset);
    }
    else {
      push(@gbids, undef);
    }
  }

  ##-- generated code: tuple-string -> group-key
  ##   + CAVEAT: the code strings compiled below refer to the lexicals @gi, @gbids, $xenum, $genum, and @genums
  ##     BY NAME; these must keep their names and remain in scope at every string-eval in this sub
  my @gi;
  my $xs2g_code;
  if (grep {$_} @gbids) {
    ##-- group-by code: with having-filters: return undef for tuples failing any filter
    my $having_cond = join(' || ', map {"!exists(\$gbids[$_]{\$gi[$_]})"} grep {defined($gbids[$_])} (0..$#gbids));
    $xs2g_code = join('',
		      qq{ \@gi=unpack('$gbxpack',\$_[0]);},
		      qq{ return undef if ($having_cond);},
		      qq{ return pack('$pack_ids',\@gi); },
		     );
  }
  else {
    ##-- group-by code: no filters: just re-pack the selected attribute ids
    $xs2g_code = qq{ pack('$pack_ids', unpack('$gbxpack', \$_[0])) };
  }
  my $xs2g_sub = eval qq{sub {$xs2g_code}};
  unless ($xs2g_sub) {
    $coldb->logconfess($coldb->{error}="groupby(): could not compile tuple-based aggregation code sub {$xs2g_code}: $@");
  }
  $@ = '';
  $gb->{xs2g} = $xs2g_sub;

  ##-- generated code: tuple-id -> group-key: same code, but the argument is first mapped through $xenum->i2s()
  my $xi2g_code = $xs2g_code;
  $xi2g_code =~ s{\$_\[0\]}{\$xenum->i2s(\$_[0])};
  my $xi2g_sub = eval qq{sub {$xi2g_code}};
  unless ($xi2g_sub) {
    $coldb->logconfess($coldb->{error}="groupby(): could not compile id-base aggregation code sub {$xi2g_code}: $@");
  }
  $@ = '';
  $gb->{xi2g} = $xi2g_sub;

  ##-- generated code: group-key -> string (via the attribute enums)
  my ($genum,@genums,$g2scode);
  if (@$gbattrs == 1) {
    ##-- stringify a single attribute
    $genum   = $coldb->{$gbattrs->[0]."enum"};
    $g2scode = qq{ \$genum->i2s(unpack('$pack_id',\$_[0])) };
  }
  else {
    ##-- stringify multiple attributes: TAB-joined strings
    @genums  = map {$coldb->{$_."enum"}} @$gbattrs;
    my $i2s_calls = join(', ', map {"\$genums[$_]->i2s(\$gi[$_])"} (0..$#genums));
    $g2scode = qq{ \@gi=unpack('$pack_ids', \$_[0]); } . q{ join("\t",} . $i2s_calls . q{)};
  }
  my $g2s = eval qq{sub {$g2scode}};
  unless ($g2s) {
    $coldb->logconfess($coldb->{error}="groupby(): could not compile stringification code sub {$g2scode}: $@");
  }
  $@ = '';
  $gb->{g2s} = $g2s;

  ##-- generated code: group-key -> pseudo-string ("\t"-joined decimal integer ids)
  my $g2txt_code = (@$gbattrs == 1
		    ? qq{ unpack('$pack_id',\$_[0]) }
		    : qq{ join("\t",unpack('$pack_ids', \$_[0])); });
  my $g2txt = eval qq{sub {$g2txt_code}};
  unless ($g2txt) {
    $coldb->logconfess($coldb->{error}="groupby(): could not compile pseudo-stringification code sub {$g2txt_code}: $@");
  }
  $@ = '';
  $gb->{g2txt} = $g2txt;

  return $gb;
}
|
873
|
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
875
|
|
|
|
|
|
|
## $cqfilter = $coldb->query2filter($attr,$cquery,%opts) |
|
876
|
|
|
|
|
|
|
## + converts a CQToken to a CQFilter, for ddc parsing |
|
877
|
|
|
|
|
|
|
## + %opts: |
|
878
|
|
|
|
|
|
|
## logas => $logas, ##-- log-prefix for warnings |
|
879
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
880
|
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
882
|
|
|
|
|
|
|
## ($CQCountKeyExprs,\$CQRestrict,\@CQFilters) = $coldb->parseGroupBy($groupby_string_or_request,%opts) |
|
883
|
|
|
|
|
|
|
## + for ddc-mode parsing |
|
884
|
|
|
|
|
|
|
## + %opts: |
|
885
|
|
|
|
|
|
|
## date => $date, |
|
886
|
|
|
|
|
|
|
## slice => $slice, |
|
887
|
|
|
|
|
|
|
## matchid => $matchid, ##-- default match-id |
|
888
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
889
|
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
891
|
|
|
|
|
|
|
## Profiling: Generic |
|
892
|
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
## $mprf = $coldb->profile($relation, %opts) |
|
894
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
|
895
|
|
|
|
|
|
|
## + %opts: |
|
896
|
|
|
|
|
|
|
## ( |
|
897
|
|
|
|
|
|
|
## ##-- selection parameters |
|
898
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
|
899
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
|
900
|
|
|
|
|
|
|
## ## |
|
901
|
|
|
|
|
|
|
## ##-- aggregation parameters |
|
902
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
|
903
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
|
904
|
|
|
|
|
|
|
## ## |
|
905
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
|
906
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
|
907
|
|
|
|
|
|
|
## score => $func, ##-- scoring function (f|fm|lf|lfm|mi|ld) : default="f" |
|
908
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
|
909
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
|
910
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
|
911
|
|
|
|
|
|
|
## ## |
|
912
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
|
913
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify (default=do) |
|
914
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
|
915
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs profiling only) |
|
916
|
|
|
|
|
|
|
## ) |
|
917
|
|
|
|
|
|
|
## + sets default %opts and wraps $coldb->relation($rel)->profile($coldb, %opts) |
|
918
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
919
|
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
## \%opts = $CLASS_OR_OBJECT->profileOptions(\%opts) |
|
921
|
|
|
|
|
|
|
## + instantiates default options for profile() method |
|
922
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
925
|
|
|
|
|
|
|
## Profiling: Comparison (diff) |
|
926
|
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
## $mprf = $coldb->compare($relation, %opts) |
|
928
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
|
929
|
|
|
|
|
|
|
## + %opts: |
|
930
|
|
|
|
|
|
|
## ( |
|
931
|
|
|
|
|
|
|
## ##-- selection parameters |
|
932
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
|
933
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
|
934
|
|
|
|
|
|
|
## ## |
|
935
|
|
|
|
|
|
|
## ##-- aggregation parameters |
|
936
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
|
937
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
|
938
|
|
|
|
|
|
|
## ## |
|
939
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
|
940
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
|
941
|
|
|
|
|
|
|
## score => $func, ##-- scoring function (f|fm|lf|lfm|mi|ld) : default="f" |
|
942
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
|
943
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score (UNUSED for comparison profiles) |
|
944
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
|
945
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (diff|adiff|sum|min|max|avg|havg); default='adiff' |
|
946
|
|
|
|
|
|
|
## ## |
|
947
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
|
948
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify (default=do) |
|
949
|
|
|
|
|
|
|
## ) |
|
950
|
|
|
|
|
|
|
## + sets default %opts and wraps $coldb->relation($rel)->compare($coldb, %opts) |
|
951
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
952
|
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
## \%opts = $CLASS_OR_OBJECT->compareOptions(\%opts) |
|
954
|
|
|
|
|
|
|
## + instantiates default options for compare() method |
|
955
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
##============================================================================== |
|
958
|
|
|
|
|
|
|
## Footer |
|
959
|
|
|
|
|
|
|
1; |
|
960
|
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
__END__ |