line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, top-level |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB; |
7
|
2
|
|
|
2
|
|
213646
|
use 5.010; ##-- v5.10.0: for // operator |
|
2
|
|
|
|
|
8
|
|
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
896
|
use DiaColloDB::Compat; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
74
|
|
10
|
2
|
|
|
2
|
|
1316
|
use DiaColloDB::Client; |
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
99
|
|
11
|
1
|
|
|
1
|
|
12
|
use DiaColloDB::Logger; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
12
|
1
|
|
|
1
|
|
862
|
use DiaColloDB::EnumFile; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
58
|
|
13
|
|
|
|
|
|
|
#use DiaColloDB::EnumFile::Identity; |
14
|
1
|
|
|
1
|
|
761
|
use DiaColloDB::EnumFile::FixedLen; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
59
|
|
15
|
1
|
|
|
1
|
|
698
|
use DiaColloDB::EnumFile::FixedMap; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
51
|
|
16
|
1
|
|
|
1
|
|
676
|
use DiaColloDB::EnumFile::MMap; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
67
|
|
17
|
1
|
|
|
1
|
|
678
|
use DiaColloDB::EnumFile::Tied; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
43
|
|
18
|
1
|
|
|
1
|
|
654
|
use DiaColloDB::MultiMapFile; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
78
|
|
19
|
1
|
|
|
1
|
|
723
|
use DiaColloDB::MultiMapFile::MMap; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
41
|
|
20
|
1
|
|
|
1
|
|
564
|
use DiaColloDB::PackedFile; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
51
|
|
21
|
1
|
|
|
1
|
|
748
|
use DiaColloDB::PackedFile::MMap; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
62
|
|
22
|
1
|
|
|
1
|
|
724
|
use DiaColloDB::Relation; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
46
|
|
23
|
1
|
|
|
1
|
|
1229
|
use DiaColloDB::Relation::Unigrams; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
52
|
|
24
|
1
|
|
|
1
|
|
769
|
use DiaColloDB::Relation::Cofreqs; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
60
|
|
25
|
1
|
|
|
1
|
|
837
|
use DiaColloDB::Relation::DDC; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
50
|
|
26
|
|
|
|
|
|
|
#use DiaColloDB::Relation::TDF; ##-- loaded on-demand |
27
|
1
|
|
|
1
|
|
9
|
use DiaColloDB::Profile; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
26
|
|
28
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Profile::Multi; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
22
|
|
29
|
1
|
|
|
1
|
|
736
|
use DiaColloDB::Profile::MultiDiff; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
36
|
|
30
|
1
|
|
|
1
|
|
512
|
use DiaColloDB::Corpus; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
53
|
|
31
|
1
|
|
|
1
|
|
702
|
use DiaColloDB::Corpus::Compiled; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
69
|
|
32
|
1
|
|
|
1
|
|
15
|
use DiaColloDB::Corpus::Filters qw(:defaults); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
90
|
|
33
|
1
|
|
|
1
|
|
351
|
use DiaColloDB::Persistent; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
30
|
|
34
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::Utils qw(:math :fcntl :json :sort :pack :regex :file :si :run :env :temp :jobs); |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
37
|
|
35
|
|
|
|
|
|
|
#use DiaColloDB::Temp::Vec; |
36
|
1
|
|
|
1
|
|
1229
|
use DiaColloDB::Timer; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
36
|
|
37
|
1
|
|
|
1
|
|
1387
|
use DDC::Any; ##-- for query parsing |
|
1
|
|
|
|
|
26068
|
|
|
1
|
|
|
|
|
8
|
|
38
|
1
|
|
|
1
|
|
90174
|
use Fcntl; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
85
|
|
39
|
1
|
|
|
1
|
|
393
|
use File::Path qw(make_path remove_tree); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
26
|
|
40
|
1
|
|
|
1
|
|
106
|
use version; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
11
|
|
41
|
1
|
|
|
1
|
|
82
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
573
|
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
##============================================================================== |
45
|
|
|
|
|
|
|
## Globals & Constants |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
our $VERSION = "0.12.020"; |
48
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Client); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
## $TDF_MGOOD_DEFAULT |
51
|
|
|
|
|
|
|
## + default positive meta-field regex for document parsing (tdf only) |
52
|
|
|
|
|
|
|
## + don't use qr// here, since Storable doesn't like pre-compiled Regexps |
53
|
|
|
|
|
|
|
our $TDF_MGOOD_DEFAULT = q/^(?:author|pnd|title|basename|collection|flags|textClass|genre|country|region|party|role)$/; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
## $TDF_MBAD_DEFAULT |
56
|
|
|
|
|
|
|
## + default negative meta-field regex for document parsing (tdf only) |
57
|
|
|
|
|
|
|
## + don't use qr// here, since Storable doesn't like pre-compiled Regexps. |
58
|
|
|
|
|
|
|
our $TDF_MBAD_DEFAULT = q/_$/; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
## $ECLASS |
61
|
|
|
|
|
|
|
## + enum class |
62
|
|
|
|
|
|
|
#our $ECLASS = 'DiaColloDB::EnumFile'; |
63
|
|
|
|
|
|
|
our $ECLASS = 'DiaColloDB::EnumFile::MMap'; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
## $XECLASS |
66
|
|
|
|
|
|
|
## + fixed-length enum class |
67
|
|
|
|
|
|
|
#our $XECLASS = 'DiaColloDB::EnumFile::FixedLen'; |
68
|
|
|
|
|
|
|
our $XECLASS = 'DiaColloDB::EnumFile::FixedLen::MMap'; |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
## $MMCLASS |
71
|
|
|
|
|
|
|
## + multimap class |
72
|
|
|
|
|
|
|
#our $MMCLASS = 'DiaColloDB::MultiMapFile'; |
73
|
|
|
|
|
|
|
our $MMCLASS = 'DiaColloDB::MultiMapFile::MMap'; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
## %TDF_OPTS : tdf: default options for DiaColloDB::Relation::TDF->new() |
76
|
|
|
|
|
|
|
our %TDF_OPTS = ( |
77
|
|
|
|
|
|
|
mgood => $TDF_MGOOD_DEFAULT, ##-- positive filter regex for metadata attributes |
78
|
|
|
|
|
|
|
mbad => $TDF_MBAD_DEFAULT, ##-- negative filter regex for metadata attributes |
79
|
|
|
|
|
|
|
## |
80
|
|
|
|
|
|
|
minFreq=>undef, ##-- minimum total term-frequency for model inclusion (default=from $coldb->{tfmin}) |
81
|
|
|
|
|
|
|
minDocFreq=>4, ##-- minimim "doc-frequency" (#/docs per term) for model inclusion |
82
|
|
|
|
|
|
|
minDocSize=>4, ##-- minimum doc size (#/tokens per doc) for model inclusion (default=8; formerly $coldb->{vbnmin}) |
83
|
|
|
|
|
|
|
## + for kern[page?] (n:%sigs,%toks): 1:0%,0%, 2:5.1%,0.5%, 4:18%,1.6%, 5:22%,2.3%, 8:34%,4.6%, 10:40%,6.5%, 16:54%,12.8% |
84
|
|
|
|
|
|
|
maxDocSize=>'inf', ##-- maximum doc size (#/tokens per doc) for model inclusion (default=inf; formerly $coldb->{vbnmax}) |
85
|
|
|
|
|
|
|
## |
86
|
|
|
|
|
|
|
#smoothf=>1, ##-- smoothing constant |
87
|
|
|
|
|
|
|
#saveMem=>1, ##-- slower but memory-friendlier compilation |
88
|
|
|
|
|
|
|
## |
89
|
|
|
|
|
|
|
vtype=>'float', ##-- store compiled values as 32-bit floats |
90
|
|
|
|
|
|
|
itype=>'long', ##-- store compiled indices as 32-bit integers |
91
|
|
|
|
|
|
|
); |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
## $NJOBS |
94
|
|
|
|
|
|
|
## + number of parallel jobs for various operations |
95
|
|
|
|
|
|
|
## + setting this to 0 (zero) will run in pure serial |
96
|
|
|
|
|
|
|
## + on unix/linux, setting this to "-1" will use the total number of cores on your system, |
97
|
|
|
|
|
|
|
## otherwise behaves like 0 |
98
|
|
|
|
|
|
|
our $NJOBS = -1; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
##============================================================================== |
101
|
|
|
|
|
|
|
## Constructors etc. |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
## $coldb = CLASS_OR_OBJECT->new(%args) |
104
|
|
|
|
|
|
|
## + %args, object structure: |
105
|
|
|
|
|
|
|
## ( |
106
|
|
|
|
|
|
|
## ##-- options |
107
|
|
|
|
|
|
|
## dbdir => $dbdir, ##-- database directory; REQUIRED |
108
|
|
|
|
|
|
|
## flags => $fcflags, ##-- fcntl flags or open()-style mode string; default='r' |
109
|
|
|
|
|
|
|
## attrs => \@attrs, ##-- index attributes (input as space-separated or array; compiled to array); default=undef (==>['l']) |
110
|
|
|
|
|
|
|
## ## + each attribute can be token-attribute qw(w p l) or a document metadata attribute "doc.ATTR" |
111
|
|
|
|
|
|
|
## ## + document "date" attribute is always indexed |
112
|
|
|
|
|
|
|
## info => \%info, ##-- additional data to return in info() method (e.g. collection, maintainer) |
113
|
|
|
|
|
|
|
## #bos => $bos, ##-- special string to use for BOS, undef or empty for none (default=undef) DISABLED |
114
|
|
|
|
|
|
|
## #eos => $eos, ##-- special string to use for EOS, undef or empty for none (default=undef) DISABLED |
115
|
|
|
|
|
|
|
## pack_id => $fmt, ##-- pack-format for IDs (default='N') |
116
|
|
|
|
|
|
|
## pack_f => $fmt, ##-- pack-format for frequencies (default='N') |
117
|
|
|
|
|
|
|
## pack_date => $fmt, ##-- pack-format for dates (default='n') |
118
|
|
|
|
|
|
|
## pack_off => $fmt, ##-- pack-format for file offsets (default='N') |
119
|
|
|
|
|
|
|
## pack_len => $len, ##-- pack-format for string lengths (default='n') |
120
|
|
|
|
|
|
|
## dmax => $dmax, ##-- maximum distance for collocation-frequencies and implicit ddc near() queries (default=5) |
121
|
|
|
|
|
|
|
## cfmin => $cfmin, ##-- minimum co-occurrence frequency for Cofreqs and ddc queries (default=2) |
122
|
|
|
|
|
|
|
## tfmin => $tfmin, ##-- minimum global term-frequency WITHOUT date component (default=2) |
123
|
|
|
|
|
|
|
## fmin_${a} => $fmin, ##-- minimum independent frequency for value of attribute ${a} (default=undef:from $tfmin) |
124
|
|
|
|
|
|
|
## keeptmp => $bool, ##-- keep temporary files? (default=0) |
125
|
|
|
|
|
|
|
## mmap => $bool, ##-- use mmap() subclasses if available? (default: true) |
126
|
|
|
|
|
|
|
## debug => $bool, ##-- enable painful debugging code? (default: false) |
127
|
|
|
|
|
|
|
## index_tdf => $bool, ##-- tdf: create/use (term x document) frequency matrix index? (default=undef: if available) |
128
|
|
|
|
|
|
|
## index_cof => $bool, ##-- cof: create/use co-frequency index (default=1) |
129
|
|
|
|
|
|
|
## index_xf => $bool, ##-- xf: create/use unigram index (default=1) |
130
|
|
|
|
|
|
|
## dbreak => $dbreak, ##-- tdf: use break-type $break for tdf index (default=undef: files) |
131
|
|
|
|
|
|
|
## tdfopts=>\%tdfopts, ##-- tdf: options for DiaColloDB::Relation::TDF->new(); default=undef (all inherited from %TDF_OPTS) |
132
|
|
|
|
|
|
|
## ## |
133
|
|
|
|
|
|
|
## ##-- runtime ddc relation options |
134
|
|
|
|
|
|
|
## ddcServer => "$host:$port", ##-- server for ddc relation |
135
|
|
|
|
|
|
|
## ddcTimeout => $seconds, ##-- timeout for ddc relation |
136
|
|
|
|
|
|
|
## ## |
137
|
|
|
|
|
|
|
## ##-- source filtering (for create()) |
138
|
|
|
|
|
|
|
## pgood => $regex, ##-- positive filter regex for part-of-speech tags |
139
|
|
|
|
|
|
|
## pbad => $regex, ##-- negative filter regex for part-of-speech tags |
140
|
|
|
|
|
|
|
## wgood => $regex, ##-- positive filter regex for word text |
141
|
|
|
|
|
|
|
## wbad => $regex, ##-- negative filter regex for word text |
142
|
|
|
|
|
|
|
## lgood => $regex, ##-- positive filter regex for lemma text |
143
|
|
|
|
|
|
|
## lbad => $regex, ##-- negative filter regex for lemma text |
144
|
|
|
|
|
|
|
## ## |
145
|
|
|
|
|
|
|
## ##-- logging |
146
|
|
|
|
|
|
|
## logOpen => $level, ##-- log-level for open/close (default='info') |
147
|
|
|
|
|
|
|
## logCreate => $level, ##-- log-level for create messages (default='info') |
148
|
|
|
|
|
|
|
## logThread => $level, ##-- log-level for multithreading operations (default='debug') |
149
|
|
|
|
|
|
|
## logCorpusFile => $level, ##-- log-level for corpus file-parsing (default='info') |
150
|
|
|
|
|
|
|
## logCorpusFileN => $N, ##-- log corpus file-parsing only for every N files (0 for none; default:undef ~ $corpus->size()/100) |
151
|
|
|
|
|
|
|
## logExport => $level, ##-- log-level for export messages (default='info') |
152
|
|
|
|
|
|
|
## logProfile => $level, ##-- log-level for verbose profiling messages (default='trace') |
153
|
|
|
|
|
|
|
## logRequest => $level, ##-- log-level for request-level profiling messages (default='debug') |
154
|
|
|
|
|
|
|
## logCompat => $level, ##-- log-level for compatibility warnings (default='warn') |
155
|
|
|
|
|
|
|
## ## |
156
|
|
|
|
|
|
|
## ##-- runtime limits |
157
|
|
|
|
|
|
|
## maxExpand => $size, ##-- maximum number of elements in query expansions (default=65535) |
158
|
|
|
|
|
|
|
## ## |
159
|
|
|
|
|
|
|
## ##-- administrivia |
160
|
|
|
|
|
|
|
## version=>$version, ##-- DiaColloDB version of stored db (==$DiaColloDB::VERSION) |
161
|
|
|
|
|
|
|
## upgraded=>\@upgraded, ##-- optional administrative information about auto-magic upgrades |
162
|
|
|
|
|
|
|
## ## |
163
|
|
|
|
|
|
|
## ##-- attribute data |
164
|
|
|
|
|
|
|
## ${a}enum => $aenum, ##-- attribute enum: $aenum : ($dbdir/${a}_enum.*) : $astr<=>$ai : A*<=>N |
165
|
|
|
|
|
|
|
## ## e.g. lemmata: $lenum : ($dbdir/l_enum.* ) : $lstr<=>$li : A*<=>N |
166
|
|
|
|
|
|
|
## ${a}2t => $a2t, ##-- attribute multimap: $a2t : ($dbdir/${a}_2t.*) : $ai=>@tis : N=>N* |
167
|
|
|
|
|
|
|
## pack_t$a => $fmt ##-- pack format: extract attribute-id $ai from a packed tuple-string $ts ; $ai=unpack($coldb->{"pack_t$a"},$ts) |
168
|
|
|
|
|
|
|
## ## |
169
|
|
|
|
|
|
|
## ##-- tuple data (-dates) |
170
|
|
|
|
|
|
|
## ## + as of v0.10.000, packed term tuples EXCLUDING dates ("t-tuples") are mapped by $coldb->{tenum} |
171
|
|
|
|
|
|
|
## ## + prior to v0.10.000, term tuples INCLUDING dates ("x-tuples") were mapped by $coldb->{xenum}, now obsolete |
172
|
|
|
|
|
|
|
## tenum => $tenum, ##-- enum: tuples ($dbdir/tenum.*) : \@ais<=>$ti : N*<=>N |
173
|
|
|
|
|
|
|
## pack_t => $fmt, ##-- symbol pack-format for $tenum : "${pack_id}[Nattrs]" |
174
|
|
|
|
|
|
|
## xdmin => $xdmin, ##-- minimum date (>= v0.04) |
175
|
|
|
|
|
|
|
## xdmax => $xdmax, ##-- maximum date (>= v0.04) |
176
|
|
|
|
|
|
|
## ## |
177
|
|
|
|
|
|
|
## ##-- relation data |
178
|
|
|
|
|
|
|
## #xf => $xf, ##-- ug: $xi => $f($xi) : N=>N |
179
|
|
|
|
|
|
|
## #cof => $cof, ##-- cf: [$xi1,$xi2] => $f12 |
180
|
|
|
|
|
|
|
## xf => $xf, ##-- ug: [$ti,$date] => f($ti,$date) |
181
|
|
|
|
|
|
|
## cof => $cof, ##-- cf: [$ti1,$date,$ti2] => f($ti1,$date,$ti2) |
182
|
|
|
|
|
|
|
## ddc => $ddc, ##-- ddc: ddc client relation |
183
|
|
|
|
|
|
|
## tdf => $tdf, ##-- tdf: (term x document) frequency matrix relation |
184
|
|
|
|
|
|
|
## ) |
185
|
|
|
|
|
|
|
sub new { |
186
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
187
|
|
|
|
|
|
|
my $coldb = bless({ |
188
|
|
|
|
|
|
|
##-- options |
189
|
|
|
|
|
|
|
dbdir => undef, |
190
|
|
|
|
|
|
|
flags => 'r', |
191
|
|
|
|
|
|
|
attrs => undef, |
192
|
|
|
|
|
|
|
#bos => undef, |
193
|
|
|
|
|
|
|
#eos => undef, |
194
|
|
|
|
|
|
|
pack_id => 'N', |
195
|
|
|
|
|
|
|
pack_f => 'N', |
196
|
|
|
|
|
|
|
pack_date => 'n', |
197
|
|
|
|
|
|
|
pack_off => 'N', |
198
|
|
|
|
|
|
|
pack_len =>'n', |
199
|
|
|
|
|
|
|
dmax => 5, |
200
|
|
|
|
|
|
|
cfmin => 2, |
201
|
|
|
|
|
|
|
tfmin => 2, |
202
|
|
|
|
|
|
|
#keeptmp => 0, |
203
|
|
|
|
|
|
|
#mmap => 1, |
204
|
|
|
|
|
|
|
#debug => 0, |
205
|
|
|
|
|
|
|
index_tdf => undef, |
206
|
|
|
|
|
|
|
index_cof => 1, |
207
|
|
|
|
|
|
|
index_xf => 1, |
208
|
|
|
|
|
|
|
dbreak => undef, |
209
|
|
|
|
|
|
|
tdfopts => {}, |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
##-- filters (pgood, pbad, etc. now in DiaColloDB::Corpus::Filters; default value see below) |
212
|
0
|
|
0
|
|
|
|
%{DiaColloDB::Corpus::Filters->new}, |
|
0
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
#vsmgood => $TDF_MGOOD_DEFAULT, |
214
|
|
|
|
|
|
|
#vsmbad => $TDF_MBAD_DEFAULT, |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
##-- logging |
217
|
|
|
|
|
|
|
logOpen => 'info', |
218
|
|
|
|
|
|
|
logCreate => 'info', |
219
|
|
|
|
|
|
|
logThread => 'debug', |
220
|
|
|
|
|
|
|
logCorpusFile => 'info', |
221
|
|
|
|
|
|
|
logCorpusFileN => undef, |
222
|
|
|
|
|
|
|
logExport => 'info', |
223
|
|
|
|
|
|
|
logProfile => 'trace', |
224
|
|
|
|
|
|
|
logRequest => 'debug', |
225
|
|
|
|
|
|
|
logCompat => 'warn', |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
##-- limits |
228
|
|
|
|
|
|
|
maxExpand => 65535, |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
##-- administrivia |
231
|
|
|
|
|
|
|
version => "$VERSION", |
232
|
|
|
|
|
|
|
#upgraded=>[], |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
##-- attributes |
235
|
|
|
|
|
|
|
#lenum => undef, #$ECLASS->new(pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len}), |
236
|
|
|
|
|
|
|
#l2t => undef, #$MMCLASS->new(pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_id}), |
237
|
|
|
|
|
|
|
#pack_tl => 'N', |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
##-- tuples (-dates) |
240
|
|
|
|
|
|
|
#tenum => undef, #$XECLASS::FixedLen->new(pack_i=>$coldb->{pack_id}, pack_s=>$coldb->{pack_t}, pack_d=>$coldb->{pack_date}), |
241
|
|
|
|
|
|
|
#pack_t => 'N', |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
##-- relations |
244
|
|
|
|
|
|
|
#xf => undef, #DiaColloDB::Relation::Unigrams->new(pack_i=>$pack_i, pack_f=>$pack_f, pack_d=>$pack_date), |
245
|
|
|
|
|
|
|
#cof => undef, #DiaColloDB::Relation::Cofreqs->new(pack_f=>$pack_f, pack_i=>$pack_i, pack_d=>$pack_date, dmax=>$dmax, fmin=>$cfmin), |
246
|
|
|
|
|
|
|
#ddc => undef, #DiaColloDB::Relation::DDC->new(), |
247
|
|
|
|
|
|
|
#tdf => undef, #DiaColloDB::Relation::TDF->new(), |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
@_, ##-- user arguments |
250
|
|
|
|
|
|
|
}, |
251
|
|
|
|
|
|
|
ref($that)||$that); |
252
|
0
|
|
|
|
|
|
$coldb->{class} = ref($coldb); |
253
|
0
|
|
|
|
|
|
$coldb->{pack_t} = $coldb->{pack_id}; |
254
|
0
|
0
|
|
|
|
|
if (defined($coldb->{dbdir})) { |
255
|
|
|
|
|
|
|
##-- avoid initial close() if called with dbdir=>$dbdir argument |
256
|
0
|
|
|
|
|
|
my $dbdir = $coldb->{dbdir}; |
257
|
0
|
|
|
|
|
|
delete $coldb->{dbdir}; |
258
|
0
|
|
|
|
|
|
return $coldb->open($dbdir); |
259
|
|
|
|
|
|
|
} |
260
|
0
|
|
|
|
|
|
return $coldb; |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
## undef = $obj->DESTROY |
264
|
|
|
|
|
|
|
## + destructor calls close() if necessary |
265
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
## $cli_or_undef = $cli->promote($class,%opts) |
268
|
|
|
|
|
|
|
## + DiaColloDB::Client method override |
269
|
|
|
|
|
|
|
sub promote { |
270
|
0
|
|
|
0
|
1
|
|
$_[0]->logconfess("promote(): not supported"); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
##======================================================================== |
274
|
|
|
|
|
|
|
## Create/compile |
275
|
|
|
|
|
|
|
our (%ATTR_ALIAS,%ATTR_RALIAS,%ATTR_TITLE,%ATTR_CBEXPR); |
276
|
1
|
|
|
1
|
|
854
|
use DiaColloDB::methods::compile; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
1792
|
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
##============================================================================== |
279
|
|
|
|
|
|
|
## I/O: open/close |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open($dbdir,%opts) |
282
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->open() |
283
|
|
|
|
|
|
|
sub open { |
284
|
0
|
|
|
0
|
1
|
|
my ($coldb,$dbdir,%opts) = @_; |
285
|
0
|
|
|
|
|
|
DiaColloDB::Logger->ensureLog(); |
286
|
0
|
0
|
|
|
|
|
$coldb = $coldb->new() if (!ref($coldb)); |
287
|
|
|
|
|
|
|
#@$coldb{keys %opts} = values %opts; ##-- clobber options after loadHeader() |
288
|
0
|
|
0
|
|
|
|
$dbdir //= $coldb->{dbdir}; |
289
|
0
|
|
|
|
|
|
$dbdir =~ s{/$}{}; |
290
|
0
|
0
|
|
|
|
|
$coldb->close() if ($coldb->opened); |
291
|
0
|
|
|
|
|
|
$coldb->{dbdir} = $dbdir; |
292
|
0
|
|
0
|
|
|
|
my $flags = fcflags($opts{flags} // $coldb->{flags}); |
293
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logOpen}, "open($dbdir)"); |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
##-- open: truncate |
296
|
0
|
0
|
|
|
|
|
if (fctrunc($flags)) { |
297
|
0
|
|
|
|
|
|
$flags |= O_CREAT; |
298
|
0
|
0
|
0
|
|
|
|
!-d $dbdir |
299
|
|
|
|
|
|
|
or remove_tree($dbdir) |
300
|
|
|
|
|
|
|
or $coldb->logconfess("open(): could not remove old $dbdir: $!"); |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
##-- open: create |
304
|
0
|
0
|
|
|
|
|
if (!-d $dbdir) { |
305
|
0
|
0
|
|
|
|
|
$coldb->logconfess("open(): no such directory '$dbdir'") if (!fccreat($flags)); |
306
|
0
|
0
|
|
|
|
|
make_path($dbdir) |
307
|
|
|
|
|
|
|
or $coldb->logconfess("open(): could not create DB directory '$dbdir': $!"); |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
##-- open: header |
311
|
0
|
|
|
|
|
|
my ($hdr); |
312
|
0
|
0
|
0
|
|
|
|
if (fcread($flags) && !fctrunc($flags)) { |
313
|
0
|
0
|
|
|
|
|
$hdr = $coldb->readHeader() |
314
|
|
|
|
|
|
|
or $coldb->logconfess("open(): failed to read header file '", $coldb->headerFile, "': $!"); |
315
|
0
|
0
|
|
|
|
|
$coldb->loadHeaderData($hdr) |
316
|
|
|
|
|
|
|
or $coldb->logconess("failed to instantiate header from '", $coldb->headerFile, "': $!"); |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
##-- clobber header options with user-supplied values |
320
|
0
|
|
|
|
|
|
@$coldb{keys %opts} = values %opts; |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
##-- open: check compatiblity |
323
|
0
|
|
|
|
|
|
my $min_version_compat = '0.10.000'; |
324
|
0
|
0
|
0
|
|
|
|
if (!$coldb->{version} || version->parse($coldb->{version}) < version->parse($min_version_compat)) { |
|
|
0
|
0
|
|
|
|
|
325
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCompat}, "using compatibility mode for DB directory '$dbdir'; consider running \`dcdb-upgrade.perl $dbdir\'"); |
326
|
0
|
|
|
|
|
|
DiaColloDB::Compat->usecompat('v0_09'); |
327
|
0
|
|
|
|
|
|
bless($coldb, 'DiaColloDB::Compat::v0_09::DiaColloDB'); |
328
|
0
|
|
|
|
|
|
$coldb->{version} = $hdr->{version}; |
329
|
0
|
|
|
|
|
|
delete $coldb->{dbdir}; |
330
|
0
|
|
|
|
|
|
return $coldb->open($dbdir,%opts); |
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
elsif (!defined($coldb->{xdmin}) || !defined($coldb->{xdmax})) { |
333
|
0
|
|
|
|
|
|
$coldb->logconfess("open(): no date-range keys {xdmin,xdmax} found in header; try running \`dcdb-upgrade.perl $dbdir'"); |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
##-- open: tdf: require |
337
|
0
|
0
|
|
|
|
|
$coldb->{index_tdf} = 0 if (!-r "$dbdir/tdf.hdr"); |
338
|
0
|
0
|
|
|
|
|
if ($coldb->{index_tdf}) { |
339
|
0
|
0
|
|
|
|
|
if (!require "DiaColloDB/Relation/TDF.pm") { |
340
|
0
|
0
|
|
|
|
|
$coldb->logwarn("open(): require failed for DiaColloDB/Relation/TDF.pm ; (term x document) matrix modelling disabled", ($@ ? "\n: $@" : '')); |
341
|
0
|
|
|
|
|
|
$coldb->{index_tdf} = 0; |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
##-- open: common options |
346
|
0
|
|
|
|
|
|
my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len}); |
347
|
0
|
|
|
|
|
|
my %mmopts = (flags=>$flags, pack_i=>$coldb->{pack_id}); |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
##-- open: attributes |
350
|
0
|
|
|
|
|
|
my $attrs = $coldb->{attrs} = $coldb->attrs(undef,['l']); |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
##-- open: by attribute |
353
|
0
|
|
|
|
|
|
my $atat = 0; |
354
|
0
|
|
|
|
|
|
foreach my $attr (@$attrs) { |
355
|
|
|
|
|
|
|
##-- open: ${attr}* |
356
|
0
|
0
|
|
|
|
|
my $abase = (-r "$dbdir/${attr}_enum.hdr" ? "$dbdir/${attr}_" : "$dbdir/${attr}"); ##-- v0.03-compatibility hack |
357
|
0
|
0
|
|
|
|
|
$coldb->{"${attr}enum"} = $coldb->mmclass($ECLASS)->new(base=>"${abase}enum", %efopts) |
358
|
|
|
|
|
|
|
or $coldb->logconfess("open(): failed to open enum ${abase}enum.*: $!"); |
359
|
0
|
0
|
|
|
|
|
$coldb->{"${attr}2t"} = $coldb->mmclass($MMCLASS)->new(base=>"${abase}2t", %mmopts) |
360
|
|
|
|
|
|
|
or $coldb->logconfess("open(): failed to open expansion multimap ${abase}2x.*: $!"); |
361
|
0
|
|
0
|
|
|
|
$coldb->{"pack_t$attr"} //= "\@${atat}$coldb->{pack_id}"; |
362
|
0
|
|
|
|
|
|
$atat += packsize($coldb->{pack_id}); |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
##-- open: tenum |
366
|
|
|
|
|
|
|
$coldb->{tenum} = $coldb->mmclass($XECLASS)->new(base=>"$dbdir/tenum", %efopts, pack_s=>$coldb->{pack_t}) |
367
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("open(): failed to open tuple-enum $dbdir/tenum.*: $!"); |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
##-- open: xf |
370
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_xf}//1) { |
371
|
|
|
|
|
|
|
$coldb->{xf} = DiaColloDB::Relation::Unigrams->new(base=>"$dbdir/xf", flags=>$flags, mmap=>$coldb->{mmap}, |
372
|
|
|
|
|
|
|
pack_i=>$coldb->{pack_id}, pack_f=>$coldb->{pack_f}, pack_d=>$coldb->{pack_date} |
373
|
|
|
|
|
|
|
) |
374
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("open(): failed to open tuple-unigrams $dbdir/xf.*: $!"); |
375
|
0
|
0
|
0
|
|
|
|
$coldb->{xf}{N} = $coldb->{xN} if ($coldb->{xN} && !$coldb->{xf}{N}); ##-- compat |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
##-- open: cof |
379
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_cof}//1) { |
380
|
|
|
|
|
|
|
$coldb->{cof} = DiaColloDB::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags, mmap=>$coldb->{mmap}, |
381
|
|
|
|
|
|
|
pack_i=>$coldb->{pack_id}, pack_f=>$coldb->{pack_f}, pack_d=>$coldb->{pack_date}, |
382
|
|
|
|
|
|
|
dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin}, |
383
|
|
|
|
|
|
|
) |
384
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("open(): failed to open co-frequency file $dbdir/cof.*: $!"); |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
##-- open: ddc (undef if ddcServer isn't set in ddc.hdr or $coldb) |
388
|
0
|
0
|
0
|
|
|
|
$coldb->{ddc} = DiaColloDB::Relation::DDC->new(-r "$dbdir/ddc.hdr" ? (base=>"$dbdir/ddc") : qw())->fromDB($coldb) |
389
|
|
|
|
|
|
|
// 'DiaColloDB::Relation::DDC'; |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
##-- open: tdf (if available) |
392
|
0
|
0
|
|
|
|
|
if ($coldb->{index_tdf}) { |
393
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= {}; |
394
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts}{$_} //= $TDF_OPTS{$_} foreach (keys %TDF_OPTS); ##-- tdf: default options |
395
|
|
|
|
|
|
|
$coldb->{tdf} = DiaColloDB::Relation::TDF->new((-r "$dbdir/tdf.hdr" ? (base=>"$dbdir/tdf") : qw()), |
396
|
|
|
|
|
|
|
dbreak => $coldb->{dbreak}, |
397
|
0
|
0
|
|
|
|
|
%{$coldb->{tdfopts}}, |
|
0
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
); |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
##-- all done |
402
|
0
|
|
|
|
|
|
return $coldb; |
403
|
|
|
|
|
|
|
} |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
## @dbkeys = $coldb->dbkeys() |
407
|
|
|
|
|
|
|
sub dbkeys { |
408
|
|
|
|
|
|
|
return ( |
409
|
0
|
0
|
|
0
|
1
|
|
(ref($_[0]) ? (map {($_."enum",$_."2t")} @{$_[0]->attrs}) : qw()), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
qw(tenum xf cof tdf), |
411
|
|
|
|
|
|
|
); |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
## $coldb_or_undef = $coldb->close() |
415
|
|
|
|
|
|
|
sub close { |
416
|
0
|
|
|
0
|
1
|
|
my $coldb = shift; |
417
|
0
|
0
|
|
|
|
|
return $coldb if (!ref($coldb)); |
418
|
0
|
|
0
|
|
|
|
$coldb->vlog($coldb->{logOpen}, "close(".($coldb->{dbdir}//'').")"); |
419
|
0
|
|
|
|
|
|
foreach ($coldb->dbkeys) { |
420
|
0
|
0
|
|
|
|
|
next if (!defined($coldb->{$_})); |
421
|
0
|
0
|
|
|
|
|
return undef if (!$coldb->{$_}->close()); |
422
|
0
|
|
|
|
|
|
delete $coldb->{$_}; |
423
|
|
|
|
|
|
|
} |
424
|
0
|
|
|
|
|
|
$coldb->{dbdir} = undef; |
425
|
0
|
|
|
|
|
|
return $coldb; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
## $bool = $coldb->opened() |
429
|
|
|
|
|
|
|
sub opened { |
430
|
0
|
|
|
0
|
1
|
|
my $coldb = shift; |
431
|
|
|
|
|
|
|
return (defined($coldb->{dbdir}) |
432
|
0
|
|
0
|
|
|
|
&& !grep {!$_->opened} grep {defined($_)} @$coldb{$coldb->dbkeys} |
433
|
|
|
|
|
|
|
); |
434
|
|
|
|
|
|
|
} |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
## @files = $obj->diskFiles() |
437
|
|
|
|
|
|
|
## + returns list of dist files for this db |
438
|
|
|
|
|
|
|
sub diskFiles { |
439
|
0
|
|
|
0
|
1
|
|
my $coldb = shift; |
440
|
0
|
|
|
|
|
|
return ("$coldb->{dbdir}/header.json", map {$_->diskFiles} grep {UNIVERSAL::can(ref($_),'diskFiles')} values %$coldb); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
##============================================================================== |
444
|
|
|
|
|
|
|
## I/O: header |
445
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
## @keys = $coldb->headerKeys() |
448
|
|
|
|
|
|
|
## + keys to save as header |
449
|
|
|
|
|
|
|
sub headerKeys { |
450
|
0
|
|
0
|
0
|
1
|
|
return (qw(attrs upgraded), grep {!ref($_[0]{$_}) && $_ !~ m{^(?:dbdir$|flags$|njobs$|perms$|info$|tdfopts$|log|debug)}} keys %{$_[0]}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
## $bool = $coldb->loadHeaderData() |
454
|
|
|
|
|
|
|
## $bool = $coldb->loadHeaderData($data) |
455
|
|
|
|
|
|
|
sub loadHeaderData { |
456
|
0
|
|
|
0
|
1
|
|
my ($coldb,$hdr) = @_; |
457
|
0
|
0
|
0
|
|
|
|
if (!defined($hdr) && !fccreat($coldb->{flags})) { |
|
|
0
|
|
|
|
|
|
458
|
0
|
|
|
|
|
|
$coldb->logconfess("loadHeader() failed to load header data from ", $coldb->headerFile, ": $!"); |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
elsif (defined($hdr)) { |
461
|
0
|
|
|
|
|
|
$coldb->{version} = undef; |
462
|
0
|
|
|
|
|
|
return $coldb->SUPER::loadHeaderData($hdr); |
463
|
|
|
|
|
|
|
} |
464
|
0
|
|
|
|
|
|
return $coldb; |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
## $bool = $coldb->saveHeader() |
468
|
|
|
|
|
|
|
## $bool = $coldb->saveHeader($headerFile) |
469
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
##======================================================================== |
472
|
|
|
|
|
|
|
## export/import |
473
|
1
|
|
|
1
|
|
742
|
use DiaColloDB::methods::export; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
7513
|
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
##============================================================================== |
477
|
|
|
|
|
|
|
## Info |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
## \%info = $coldb->dbinfo() |
480
|
|
|
|
|
|
|
## + get db info |
481
|
|
|
|
|
|
|
sub dbinfo { |
482
|
0
|
|
|
0
|
1
|
|
my $coldb = shift; |
483
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData(); |
484
|
0
|
|
|
|
|
|
my $du = $coldb->du(); |
485
|
|
|
|
|
|
|
my $info = { |
486
|
|
|
|
|
|
|
##-- literals |
487
|
0
|
0
|
|
|
|
|
(map {exists($coldb->{$_}) ? ($_=>$coldb->{$_}) : qw()} |
488
|
|
|
|
|
|
|
qw(dbdir bos eos dmax cfmin xdmin xdmax version upgraded label collection maintainer)), |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
##-- disk usage |
491
|
|
|
|
|
|
|
du_b => $du, |
492
|
|
|
|
|
|
|
du_h => si_str($du), |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
##-- timestamp |
495
|
|
|
|
|
|
|
timestamp => $coldb->timestamp, |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
##-- attributes |
498
|
|
|
|
|
|
|
attrs => [map { |
499
|
|
|
|
|
|
|
{( |
500
|
|
|
|
|
|
|
name => $_->{a}, |
501
|
|
|
|
|
|
|
title => $coldb->attrTitle($_->{a}), |
502
|
|
|
|
|
|
|
size => $_->{enum}->size, |
503
|
|
|
|
|
|
|
alias => $ATTR_RALIAS{$_->{a}}, |
504
|
0
|
|
|
|
|
|
)} |
505
|
|
|
|
|
|
|
} @$adata], |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
##-- relations |
508
|
|
|
|
|
|
|
#relations => [$coldb->relations], |
509
|
0
|
|
|
|
|
|
relations => { map {($_=>$coldb->{$_}->dbinfo($coldb))} $coldb->relations }, |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
##-- overrides |
512
|
0
|
|
0
|
|
|
|
%{$coldb->{info}//{}}, |
|
0
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
}; |
514
|
0
|
|
|
|
|
|
return $info; |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
##============================================================================== |
519
|
|
|
|
|
|
|
## Profiling |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
522
|
|
|
|
|
|
|
## Profiling: Wrappers |
523
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
## $mprf = $coldb->query($rel,%opts) |
526
|
|
|
|
|
|
|
## + get a generic DiaColloDB::Profile::Multi object for $rel |
527
|
|
|
|
|
|
|
## + calls $coldb->profile() or $coldb->compare() as appropriate |
528
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
## $mprf = $coldb->profile1(%opts) |
531
|
|
|
|
|
|
|
## + get unigram frequency profile for selected items as a DiaColloDB::Profile::Multi object |
532
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('xf', %opts) |
533
|
|
|
|
|
|
|
## + %opts: see profile() method |
534
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
## $mprf = $coldb->profile2(%opts) |
537
|
|
|
|
|
|
|
## + get co-frequency profile for selected items as a DiaColloDB::Profile::Multi object |
538
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('cof', %opts) |
539
|
|
|
|
|
|
|
## + %opts: see profile() method |
540
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
## $mprf = $coldb->compare1(%opts) |
543
|
|
|
|
|
|
|
## + get unigram comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
544
|
|
|
|
|
|
|
## + really just wraps $coldb->compare('xf', %opts) |
545
|
|
|
|
|
|
|
## + %opts: see compare() method |
546
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
## $mprf = $coldb->compare2(%opts) |
549
|
|
|
|
|
|
|
## + get co-frequency comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
550
|
|
|
|
|
|
|
## + really just wraps $coldb->profile('cof', %opts) |
551
|
|
|
|
|
|
|
## + %opts: see compare() method |
552
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Client |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
556
|
|
|
|
|
|
|
## Profiling: Utils |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
559
|
|
|
|
|
|
|
## $relname = $coldb->relname($rel) |
560
|
|
|
|
|
|
|
## + returns an appropriate relation name for profile() and friends |
561
|
|
|
|
|
|
|
## + returns $rel if $coldb->{$rel} supports a profile() method |
562
|
|
|
|
|
|
|
## + otherwise heuristically parses $relationName /xf|f?1|ug/ or /f1?2|c/ |
563
|
|
|
|
|
|
|
sub relname { |
564
|
0
|
|
|
0
|
1
|
|
my ($coldb,$rel) = @_; |
565
|
0
|
0
|
|
|
|
|
if (UNIVERSAL::can($coldb->{$rel},'profile')) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
566
|
0
|
|
|
|
|
|
return $rel; |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
elsif ($rel =~ m/^(?:[ux]|f?1$)/) { |
569
|
0
|
|
|
|
|
|
return 'xf'; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
elsif ($rel =~ m/^(?:c|f?1?2$)/) { |
572
|
0
|
|
|
|
|
|
return 'cof'; |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
elsif ($rel =~ m/^(?:v|vec|vs|vsem|sem|td[mf])$/) { |
575
|
0
|
|
|
|
|
|
return 'tdf'; |
576
|
|
|
|
|
|
|
} |
577
|
0
|
|
|
|
|
|
return $rel; |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
581
|
|
|
|
|
|
|
## $obj_or_undef = $coldb->relation($rel) |
582
|
|
|
|
|
|
|
## + returns an appropriate relation-like object for profile() and friends |
583
|
|
|
|
|
|
|
## + wraps $coldb->{$coldb->relname($rel)} |
584
|
|
|
|
|
|
|
sub relation { |
585
|
0
|
|
|
0
|
1
|
|
return $_[0]->{$_[0]->relname($_[1])}; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
589
|
|
|
|
|
|
|
## @relnames = $coldb->relations() |
590
|
|
|
|
|
|
|
## + gets list of defined relations |
591
|
|
|
|
|
|
|
sub relations { |
592
|
0
|
|
|
0
|
1
|
|
return grep {UNIVERSAL::isa(ref($_[0]{$_}),'DiaColloDB::Relation')} keys %{$_[0]}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
} |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
596
|
|
|
|
|
|
|
## \@ids = $coldb->enumIds($enum,$req,%opts) |
597
|
|
|
|
|
|
|
## + parses enum IDs for $req, which is one of: |
598
|
|
|
|
|
|
|
## - a DDC::Any::CQTokExact, ::CQTokInfl, ::CQTokSet, ::CQTokSetInfl, or ::CQTokRegex : interpreted |
599
|
|
|
|
|
|
|
## - an ARRAY-ref : list of literal symbol-values |
600
|
|
|
|
|
|
|
## - a Regexp ref : regexp for target strings, passed to $enum->re2i() |
601
|
|
|
|
|
|
|
## - a string /REGEX/ : regexp for target strings, passed to $enum->re2i() |
602
|
|
|
|
|
|
|
## - another string : space-, comma-, or |-separated list of literal values |
603
|
|
|
|
|
|
|
## + %opts: |
604
|
|
|
|
|
|
|
## logLevel => $logLevel, ##-- logging level (default=undef) |
605
|
|
|
|
|
|
|
## logPrefix => $prefix, ##-- logging prefix (default="enumIds(): fetch ids") |
606
|
|
|
|
|
|
|
sub enumIds { |
607
|
0
|
|
|
0
|
1
|
|
my ($coldb,$enum,$req,%opts) = @_; |
608
|
0
|
|
0
|
|
|
|
$opts{logPrefix} //= "enumIds(): fetch ids"; |
609
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($req,'DDC::Any::CQTokInfl') || UNIVERSAL::isa($req,'DDC::Any::CQTokExact')) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
610
|
|
|
|
|
|
|
##-- CQuery: CQTokExact |
611
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (", ref($req), ")"); |
612
|
0
|
|
|
|
|
|
return [$enum->s2i($req->getValue)]; |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($req,'DDC::Any::CQTokSet') || UNIVERSAL::isa($req,'DDC::Any::CQTokSetInfl')) { |
615
|
|
|
|
|
|
|
##-- CQuery: CQTokSet |
616
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (", ref($req), ")"); |
617
|
0
|
|
|
|
|
|
return [map {$enum->s2i($_)} @{$req->getValues}]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
} |
619
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($req,'DDC::Any::CQTokRegex')) { |
620
|
|
|
|
|
|
|
##-- CQuery: CQTokRegex |
621
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (", ref($req), ")"); |
622
|
0
|
|
|
|
|
|
return $enum->re2i($req->getValue); |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($req,'DDC::Any::CQTokAny')) { |
625
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (", ref($req), ")"); |
626
|
0
|
|
|
|
|
|
return undef; |
627
|
|
|
|
|
|
|
} |
628
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($req,'ARRAY')) { |
629
|
|
|
|
|
|
|
##-- compat: array |
630
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (ARRAY)"); |
631
|
0
|
|
|
|
|
|
return [map {$enum->s2i($_)} @$req]; |
|
0
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($req,'Regexp') || $req =~ m{^/}) { |
634
|
|
|
|
|
|
|
##-- compat: regex |
635
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (REGEX)"); |
636
|
0
|
|
|
|
|
|
return $enum->re2i($req); |
637
|
|
|
|
|
|
|
} |
638
|
|
|
|
|
|
|
elsif (!ref($req)) { |
639
|
|
|
|
|
|
|
##-- compat: space-, comma-, or |-separated literals |
640
|
0
|
|
|
|
|
|
$coldb->vlog($opts{logLevel}, $opts{logPrefix}, " (STRINGS)"); |
641
|
0
|
|
0
|
|
|
|
return [grep {defined($_)} map {$enum->s2i($_)} grep {($_//'') ne ''} map {s{\\(.)}{$1}g; $_} split(/(?:(?<!\\)[\,\s\|])+/,$req)]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
} |
643
|
|
|
|
|
|
|
else { |
644
|
|
|
|
|
|
|
##-- reference: unhandled |
645
|
0
|
|
|
|
|
|
$coldb->logconfess($coldb->{error}="$opts{logPrefix}: can't handle request of type ".ref($req)); |
646
|
|
|
|
|
|
|
} |
647
|
0
|
|
|
|
|
|
return []; |
648
|
|
|
|
|
|
|
} |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
651
|
|
|
|
|
|
|
## ($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi) = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0) |
652
|
|
|
|
|
|
|
## \%dateRequest = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0) |
653
|
|
|
|
|
|
|
## + parses date request and returns limit and filter information as a list (list context) or HASH-ref (scalar context); |
654
|
|
|
|
|
|
|
## + %dateRequest = |
655
|
|
|
|
|
|
|
## ( |
656
|
|
|
|
|
|
|
## dfilter => $dfilter, ##-- filter-sub, called as: $wanted=$dfilter->($date); undef for none |
657
|
|
|
|
|
|
|
## slo => $sliceLo, ##-- minimum slice (inclusive) |
658
|
|
|
|
|
|
|
## shi => $sliceHi, ##-- maximum slice (inclusive) |
659
|
|
|
|
|
|
|
## dlo => $dateLo, ##-- minimum date (inclusive); undef for none, always defined if $fill is true |
660
|
|
|
|
|
|
|
## dhi => $dateHi, ##-- maximum date (inclusive); undef for none, always defined if $fill is true |
661
|
|
|
|
|
|
|
## ) |
662
|
|
|
|
|
|
|
sub parseDateRequest { |
663
|
0
|
|
|
0
|
1
|
|
my ($coldb,$date,$slice,$fill,$ddcmode) = @_; |
664
|
0
|
|
|
|
|
|
my ($dfilter,$slo,$shi,$dlo,$dhi); |
665
|
0
|
|
0
|
|
|
|
$date //= ''; |
666
|
0
|
0
|
0
|
|
|
|
if ($date =~ /^[\s\*]*$/) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
667
|
|
|
|
|
|
|
##-- empty date request or universal wildcard: ignore |
668
|
0
|
|
|
|
|
|
$dlo = $dhi = undef; |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($date,'Regexp') || $date =~ /^\//) { |
671
|
|
|
|
|
|
|
##-- date request: regex string |
672
|
0
|
0
|
|
|
|
|
$coldb->logconfess("parseDateRequest(): can't handle date regex '$date' in ddc mode") if ($ddcmode); |
673
|
0
|
|
|
|
|
|
my $dre = regex($date); |
674
|
0
|
|
|
0
|
|
|
$dfilter = sub { $_[0] =~ $dre }; |
|
0
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
} |
676
|
|
|
|
|
|
|
elsif ($date =~ /^\s*((?:[0-9]+|\*?))\s*[\-\:]+\s*((?:[0-9]+|\*?))\s*$/) { |
677
|
|
|
|
|
|
|
##-- date request: range MIN:MAX (inclusive) |
678
|
0
|
|
|
|
|
|
($dlo,$dhi) = ($1,$2); |
679
|
0
|
0
|
0
|
|
|
|
$dlo = $coldb->{xdmin} if (($dlo//'') =~ /^\*?$/); |
680
|
0
|
0
|
0
|
|
|
|
$dhi = $coldb->{xdmax} if (($dhi//'') =~ /^\*?$/); |
681
|
0
|
|
|
|
|
|
$dlo += 0; |
682
|
0
|
|
|
|
|
|
$dhi += 0; |
683
|
0
|
0
|
|
0
|
|
|
$dfilter = sub { $_[0]>=$dlo && $_[0]<=$dhi }; |
|
0
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
} |
685
|
|
|
|
|
|
|
elsif ($date =~ /[\s\,\|]+/) { |
686
|
|
|
|
|
|
|
##-- date request: list |
687
|
0
|
0
|
|
|
|
|
$coldb->logconfess("parseDateRequest(): can't handle date list '$date' in ddc mode") if ($ddcmode); |
688
|
0
|
|
0
|
|
|
|
my %dwant = map {($_=>undef)} grep {($_//'') ne ''} split(/[\s\,\|]+/,$date); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
689
|
0
|
|
|
0
|
|
|
$dfilter = sub { exists($dwant{$_[0]}) }; |
|
0
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
} |
691
|
|
|
|
|
|
|
else { |
692
|
|
|
|
|
|
|
##-- date request: single value |
693
|
0
|
|
|
|
|
|
$dlo = $dhi = $date; |
694
|
0
|
|
|
0
|
|
|
$dfilter = sub { $_[0] == $date }; |
|
0
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
} |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
##-- force-fill? |
698
|
0
|
0
|
|
|
|
|
if ($fill) { |
699
|
0
|
0
|
0
|
|
|
|
$dlo = $coldb->{xdmin} if (!$dlo || $dlo < $coldb->{xdmin}); |
700
|
0
|
0
|
0
|
|
|
|
$dhi = $coldb->{xdmax} if (!$dhi || $dhi > $coldb->{xdmax}); |
701
|
|
|
|
|
|
|
} |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
##-- slice-range |
704
|
0
|
0
|
0
|
|
|
|
($slo,$shi) = map {$slice ? ($slice*int($_/$slice)) : 0} (($dlo//$coldb->{xdmin}),($dhi//$coldb->{xdmax})); |
|
0
|
|
0
|
|
|
|
|
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
return wantarray |
707
|
0
|
0
|
|
|
|
|
? ($dfilter,$slo,$shi,$dlo,$dhi) |
708
|
|
|
|
|
|
|
: { dfilter=>$dfilter, slo=>$slo, shi=>$shi, dlo=>$dlo, dhi=>$dhi }; |
709
|
|
|
|
|
|
|
} |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
712
|
|
|
|
|
|
|
## $compiler = $coldb->qcompiler(); |
713
|
|
|
|
|
|
|
## + get DDC::Any::CQueryCompiler for this object (cached in $coldb->{_qcompiler}) |
714
|
|
|
|
|
|
|
sub qcompiler { |
715
|
0
|
|
0
|
0
|
1
|
|
return $_[0]{_qcompiler} ||= DDC::Any::CQueryCompiler->new(); |
716
|
|
|
|
|
|
|
} |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
719
|
|
|
|
|
|
|
## $cquery_or_undef = $coldb->qparse($ddc_query_string) |
720
|
|
|
|
|
|
|
## + wraps parse in an eval {...} block and sets $coldb->{error} on failure |
721
|
|
|
|
|
|
|
sub qparse { |
722
|
0
|
|
|
0
|
1
|
|
my ($coldb,$qstr) = @_; |
723
|
0
|
|
|
|
|
|
my ($q); |
724
|
0
|
|
|
|
|
|
eval { $q=$coldb->qcompiler->ParseQuery($qstr); }; |
|
0
|
|
|
|
|
|
|
725
|
0
|
0
|
0
|
|
|
|
if ($@ || !defined($q)) { |
726
|
0
|
|
|
|
|
|
$coldb->{error}="$@"; |
727
|
0
|
|
|
|
|
|
return undef; |
728
|
|
|
|
|
|
|
} |
729
|
0
|
|
|
|
|
|
return $q; |
730
|
|
|
|
|
|
|
} |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
734
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs |
735
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery(["$attr1:$val1",...], %opts) ##-- compat: ARRAY-of-requests |
736
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery({$attr1=>$val1, ...}, %opts) ##-- compat: HASH |
737
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery("$attr1=$val1, ...", %opts) ##-- compat: string |
738
|
|
|
|
|
|
|
## $cquery = $coldb->parseQuery($ddcQueryString, %opts) ##-- ddc string (with shorthand ","->WITH, "&&"->WITH) |
739
|
|
|
|
|
|
|
## + guts for parsing user target and groupby requests |
740
|
|
|
|
|
|
|
## + returns a DDC::Any::CQuery object representing the request |
741
|
|
|
|
|
|
|
## + index-only items "$l" are mapped to $l=@{} |
742
|
|
|
|
|
|
|
## + if query request is wrapped in "(...)" or "[...]", native parsing is NOT attempted |
743
|
|
|
|
|
|
|
## + %opts: |
744
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
745
|
|
|
|
|
|
|
## logas => $reqtype, ##-- request type for warnings |
746
|
|
|
|
|
|
|
## parseas => $reqtype, ##-- request type for parsing ('groupby' or 'query' (default)) |
747
|
|
|
|
|
|
|
## qref => \$qref, ##-- store parsed query in SCALAR-ref $qref |
748
|
|
|
|
|
|
|
## default => $attr, ##-- default attribute (for query requests) |
749
|
|
|
|
|
|
|
## mapand => $bool, ##-- map CQAnd to CQWith? (default=true unless '&&' occurs in query string) |
750
|
|
|
|
|
|
|
## ddcmode => $bool, ##-- force ddc query parsing? (0:no:default, >0:always, <0:fallback) |
751
|
|
|
|
|
|
|
sub parseQuery { |
752
|
0
|
|
|
0
|
1
|
|
my ($coldb,$req,%opts) = @_; |
753
|
0
|
|
|
|
|
|
my $req0 = $req; |
754
|
0
|
|
0
|
|
|
|
my $wlevel = $opts{warn} // 'warn'; |
755
|
0
|
|
|
|
|
|
my $defaultIndex = $opts{default}; |
756
|
0
|
|
0
|
|
|
|
my $logas = $opts{logas}//''; |
757
|
0
|
|
0
|
|
|
|
my $parseas = $opts{parseas} || $logas || 'query'; |
758
|
0
|
|
0
|
|
|
|
my $ddcmode = $opts{ddcmode} || 0; |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
##-- compat: accept ARRAY or HASH requests |
761
|
0
|
0
|
|
|
|
|
my $areqs = (UNIVERSAL::isa($req,'ARRAY') ? [@$req] |
|
|
0
|
|
|
|
|
|
762
|
|
|
|
|
|
|
: (UNIVERSAL::isa($req,'HASH') ? [%$req] |
763
|
|
|
|
|
|
|
: undef)); |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
##-- ddcmode: detect "[...]" queries |
766
|
0
|
0
|
|
|
|
|
$ddcmode = 1 if ($req =~ s{^\s*\[(.*)\]\s*$}{$1}); |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
##-- compat: parse into attribute-local requests $areqs=[[$attr1,$areq1],...] |
769
|
0
|
|
|
|
|
|
my $sepre = qr{[\s\,]}; |
770
|
0
|
|
|
|
|
|
my $charre = qr{(?:\\[^ux0-9]|[\w\x{80}-\x{ffff}])}; |
771
|
0
|
|
|
|
|
|
my $attrre = qr{(?:\$?(?:doc\.)?${charre}+)}; |
772
|
0
|
|
|
|
|
|
my $orre = qr{(?:\s*\|?\s*)}; |
773
|
0
|
|
|
|
|
|
my $setre = qr{(?:(?:${charre}+)(?:${orre}${charre}+)*)}; ##-- value: |-separated barewords |
774
|
0
|
|
|
|
|
|
my $regre = qr{(?:/(?:\\/|[^/]*)/(?:[gimsadlux]*))}; ##-- value regexes |
775
|
0
|
|
|
|
|
|
my $valre = qr{(?:${setre}|${regre})}; |
776
|
0
|
0
|
|
|
|
|
my $reqre = ($parseas eq 'groupby' |
777
|
|
|
|
|
|
|
? qr{(?:(?:${attrre}(?:[:=]${valre})?)|${valre})} ##-- groupby mode: require ATTRIBUTES |
778
|
|
|
|
|
|
|
: qr{(?:(?:${attrre}[:=])?${valre})} ##-- query mode: reqzure VALUES |
779
|
|
|
|
|
|
|
); |
780
|
0
|
0
|
0
|
|
|
|
if (!$areqs |
|
|
|
0
|
|
|
|
|
781
|
|
|
|
|
|
|
&& ($ddcmode <= 0) ##-- allow native parsing? |
782
|
|
|
|
|
|
|
&& $req =~ m/^${sepre}* ##-- initial separators (optional) |
783
|
|
|
|
|
|
|
(?:${reqre}${sepre}+)* ##-- separated components |
784
|
|
|
|
|
|
|
(?:${reqre}) ##-- final component |
785
|
|
|
|
|
|
|
${sepre}* ##-- final separators (optional) |
786
|
|
|
|
|
|
|
$/x) { |
787
|
0
|
0
|
|
|
|
|
$coldb->debug("parseQuery($logas): parsing native query request [ddcmode=$ddcmode]") if ($coldb->{debug}); |
788
|
0
|
|
|
|
|
|
$areqs = [grep {defined($_)} ($req =~ m/${sepre}*(${reqre})/g)]; |
|
0
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
##-- construct DDC query $q |
792
|
0
|
|
|
|
|
|
my ($q); |
793
|
0
|
0
|
|
|
|
|
if ($areqs) { |
794
|
|
|
|
|
|
|
##-- compat: diacollo<=v0.06-style attribute-wise request in @$areqs; construct DDC query by hand |
795
|
0
|
|
|
|
|
|
my ($attr,$areq,$aq); |
796
|
0
|
|
|
|
|
|
foreach (@$areqs) { |
797
|
0
|
0
|
|
|
|
|
if (UNIVERSAL::isa($_,'ARRAY')) { |
|
|
0
|
|
|
|
|
|
798
|
|
|
|
|
|
|
##-- compat: attribute request: ARRAY |
799
|
0
|
|
|
|
|
|
($attr,$areq) = @$_; |
800
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($_,'HASH')) { |
801
|
|
|
|
|
|
|
##-- compat: attribute request: HASH |
802
|
0
|
|
|
|
|
|
($attr,$areq) = %$_; |
803
|
|
|
|
|
|
|
} else { |
804
|
|
|
|
|
|
|
##-- compat: attribute request: STRING (native) |
805
|
0
|
0
|
|
|
|
|
next if (uc($_) eq 'WITH'); ##-- avoid ddc keyword |
806
|
0
|
0
|
|
|
|
|
($attr,$areq) = m{^(${attrre})[:=](${valre})$} ? ($1,$2) : ($_,undef); |
807
|
0
|
|
|
|
|
|
$attr =~ s/\\(.)/$1/g; |
808
|
0
|
0
|
|
|
|
|
$areq =~ s/\\(.)/$1/g if (defined($areq)); |
809
|
|
|
|
|
|
|
} |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
##-- compat: parse defaults |
812
|
0
|
0
|
0
|
|
|
|
($attr,$areq) = ('',$attr) if (defined($defaultIndex) && !defined($areq)); |
813
|
0
|
0
|
0
|
|
|
|
$attr = $defaultIndex//'' if (($attr//'') eq ''); |
|
|
|
0
|
|
|
|
|
814
|
0
|
|
|
|
|
|
$attr =~ s/^\$//; |
815
|
|
|
|
|
|
|
|
816
|
0
|
0
|
0
|
|
|
|
$coldb->debug("parseQuery($logas): parsing native request clause: (".($attr//'')." = ".($areq//'').")") if ($coldb->{debug}); |
|
|
|
0
|
|
|
|
|
817
|
|
|
|
|
|
|
|
818
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($areq,'DDC::Any::CQuery')) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
819
|
|
|
|
|
|
|
##-- compat: value: ddc query object |
820
|
0
|
|
|
|
|
|
$aq = $areq; |
821
|
0
|
0
|
0
|
|
|
|
$aq->setIndexName($attr) if ($aq->can('setIndexName') && $attr ne ''); |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($areq,'ARRAY')) { |
824
|
|
|
|
|
|
|
##-- compat: value: array --> CQTokSet @{VAL1,...,VALN} |
825
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokSet->new($attr, '', $areq); |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($areq,'RegExp') || (($opts{ddcmode}||0)<1 && $areq && $areq =~ m{^${regre}$})) { |
828
|
|
|
|
|
|
|
##-- compat: value: regex --> CQTokRegex /REGEX/ |
829
|
0
|
|
|
|
|
|
my $re = regex($areq).""; |
830
|
0
|
|
|
|
|
|
$re =~ s{\G(.*?\(\?\^[^adlu:]*)[adlu]*}{$1}g; ##-- trim perl-5.14 character-set modifiers: they break KWIC-links, since DDC (PCRE) doesn't support them! |
831
|
0
|
|
|
|
|
|
$re =~ s{^\(\?\^[adlu]*\:(.*)\)$}{$1}; ##-- trim redundant top-level grouping inserted by qr{}-stringification |
832
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($attr, $re); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
elsif (!$areq || $areq =~ /^\s*${reqre}\s*$/) { |
835
|
|
|
|
|
|
|
##-- compat: value: space- or |-separated literals --> CQTokExact $a=@VAL or CQTokSet $a=@{VAL1,...VALN} or CQTokAny $a=* |
836
|
|
|
|
|
|
|
## + also applies to empty $areq, e.g. in groupby clauses |
837
|
0
|
|
0
|
|
|
|
my $vals = [grep {($_//'') ne ''} map {s{\\(.)}{$1}g; $_} split(/(?:(?<!\\)[\,\s\|])+/,($areq//''))]; |
|
0
|
|
0
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
838
|
0
|
0
|
0
|
|
|
|
$aq = (@$vals<=1 |
|
|
0
|
0
|
|
|
|
|
839
|
|
|
|
|
|
|
? (($vals->[0]//'*') eq '*' |
840
|
|
|
|
|
|
|
? DDC::Any::CQTokAny->new($attr,'*') |
841
|
|
|
|
|
|
|
: DDC::Any::CQTokExact->new($attr,$vals->[0])) |
842
|
|
|
|
|
|
|
: DDC::Any::CQTokSet->new($attr,($areq//''),$vals)); |
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
elsif ($ddcmode && ($areq//'') ne '') { |
845
|
|
|
|
|
|
|
##-- compat: ddcmode: parse requests as ddc queries |
846
|
|
|
|
|
|
|
$aq = $coldb->qparse($areq) |
847
|
0
|
0
|
|
|
|
|
or $coldb->logconfess($coldb->{error}="parseQuery(): failed to parse request \`$areq': $coldb->{error}"); |
848
|
|
|
|
|
|
|
} |
849
|
|
|
|
|
|
|
##-- push request to query |
850
|
0
|
0
|
|
|
|
|
$q = $q ? DDC::Any::CQWith->new($q,$aq) : $aq; |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
} |
853
|
|
|
|
|
|
|
else { |
854
|
|
|
|
|
|
|
##-- ddc: diacollo>=v0.06: ddc request parsing: allow shorthands (',' --> 'WITH'), ('INDEX=VAL' --> '$INDEX=VAL'), and ($INDEX --> $INDEX=@{}) |
855
|
0
|
|
|
|
|
|
my $compiler = $coldb->qcompiler(); |
856
|
0
|
|
|
|
|
|
my ($err); |
857
|
0
|
|
|
|
|
|
while (!defined($q)) { |
858
|
|
|
|
|
|
|
#$coldb->trace("req=$req\n"); |
859
|
0
|
|
|
|
|
|
undef $@; |
860
|
0
|
|
|
|
|
|
eval { $q=$compiler->ParseQuery($req); }; |
|
0
|
|
|
|
|
|
|
861
|
0
|
0
|
0
|
|
|
|
last if (!($err=$@) && defined($q)); |
862
|
0
|
0
|
|
|
|
|
if ($err =~ /syntax error/) { |
863
|
0
|
0
|
|
|
|
|
if ($err =~ /unexpected ','/) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
864
|
|
|
|
|
|
|
##-- (X Y) --> (X WITH Y) |
865
|
0
|
|
|
|
|
|
$req =~ s/(?!<\\)\s*,\s*/ WITH /; |
866
|
0
|
|
|
|
|
|
next; |
867
|
|
|
|
|
|
|
} |
868
|
|
|
|
|
|
|
elsif ($err =~ /expecting '='/) { |
869
|
|
|
|
|
|
|
##-- ($INDEX) --> ($INDEX=*) (for group-by) |
870
|
0
|
|
|
|
|
|
$req =~ s/(\$\w+)(?!\s*\=)/$1=*/; |
871
|
0
|
|
|
|
|
|
next; |
872
|
|
|
|
|
|
|
} |
873
|
|
|
|
|
|
|
elsif ($err =~ /unexpected SYMBOL, expecting INTEGER at line \d+, near token \`([^\']*)\'/) { |
874
|
|
|
|
|
|
|
##-- (INDEX=) --> ($INDEX=) |
875
|
0
|
|
|
|
|
|
my $tok = $1; |
876
|
0
|
|
|
|
|
|
$req =~ s/(?!<\$)(\S+)\s*=\s*\Q$tok\E/\$$1=$tok/; |
877
|
0
|
|
|
|
|
|
next; |
878
|
|
|
|
|
|
|
} |
879
|
|
|
|
|
|
|
} |
880
|
0
|
|
0
|
|
|
|
$coldb->logconfess("parseQuery(): could not parse request '$req0': ", ($err//'')); |
881
|
|
|
|
|
|
|
} |
882
|
|
|
|
|
|
|
} |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
##-- tweak query: map CQAnd to CQWith |
885
|
|
|
|
|
|
|
$q = $q->mapTraverse(sub { |
886
|
0
|
0
|
|
0
|
|
|
return UNIVERSAL::isa($_[0],'DDC::Any::CQAnd') ? DDC::Any::CQWith->new($_[0]->getDtr1,$_[0]->getDtr2) : $_[0]; |
887
|
|
|
|
|
|
|
}) |
888
|
0
|
0
|
0
|
|
|
|
if ($opts{mapand} || (!defined($opts{mapand}) && $req0 !~ /\&\&/)); |
|
|
|
0
|
|
|
|
|
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
##-- maybe store parsed query |
891
|
0
|
0
|
|
|
|
|
${$opts{qref}} = $q if (ref($opts{qref})); |
|
0
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
|
893
|
0
|
0
|
|
|
|
|
$coldb->debug("parseQuery($logas): parsed query: ", $q->toString) if ($coldb->{debug}); |
894
|
|
|
|
|
|
|
|
895
|
0
|
|
|
|
|
|
return $q; |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
899
|
|
|
|
|
|
|
## \@aqs = $coldb->queryAttributes($cquery,%opts) |
900
|
|
|
|
|
|
|
## + utility for decomposing DDC queries into attribute-wise requests |
901
|
|
|
|
|
|
|
## + returns an ARRAY-ref [[$attr1,$val1], ...] |
902
|
|
|
|
|
|
|
## + each value $vali is empty or undef (all values), a CQTokSet, a CQTokExact, CQTokRegex, or CQTokAny |
903
|
|
|
|
|
|
|
## + chokes on unsupported query types or filters |
904
|
|
|
|
|
|
|
## + %opts: |
905
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
906
|
|
|
|
|
|
|
## logas => $reqtype, ##-- request type for warnings |
907
|
|
|
|
|
|
|
## default => $attr, ##-- default attribute (for query requests) |
908
|
|
|
|
|
|
|
## allowExtra => \@attrs, ##-- allow extra attributes @attrs (may also be HASH-ref) |
909
|
|
|
|
|
|
|
## allowUnknown => $bool, ##-- allow unknown attributes? (default: 0) |
910
|
|
|
|
|
|
|
sub queryAttributes { |
911
|
0
|
|
|
0
|
1
|
|
my ($coldb,$cquery,%opts) = @_; |
912
|
0
|
|
0
|
|
|
|
my $wlevel = $opts{warn} // 'warn'; |
913
|
0
|
|
|
|
|
|
my $default = $opts{default}; |
914
|
0
|
|
0
|
|
|
|
my $logas = $opts{logas}//''; |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
my $warnsub = sub { |
917
|
0
|
0
|
|
0
|
|
|
$coldb->logconfess($coldb->{error}="queryAttributes(): can't handle ".join('',@_)) if (!$opts{relax}); |
918
|
0
|
|
|
|
|
|
$coldb->vlog($wlevel, "queryAttributes(): ignoring ", @_); |
919
|
0
|
|
|
|
|
|
}; |
920
|
|
|
|
|
|
|
|
921
|
0
|
|
|
|
|
|
my $areqs = []; |
922
|
0
|
|
|
|
|
|
my ($q,$attr,$aq); |
923
|
0
|
|
|
|
|
|
foreach $q (@{$cquery->Descendants}) { |
|
0
|
|
|
|
|
|
|
924
|
0
|
0
|
|
|
|
|
if (!defined($q)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
925
|
|
|
|
|
|
|
##-- NULL: ignore |
926
|
0
|
|
|
|
|
|
next; |
927
|
|
|
|
|
|
|
} |
928
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQWith')) { |
929
|
|
|
|
|
|
|
##-- CQWith: ignore (just recurse) |
930
|
0
|
|
|
|
|
|
next; |
931
|
|
|
|
|
|
|
} |
932
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQueryOptions')) { |
933
|
|
|
|
|
|
|
##-- CQueryOptions: check for nontrivial user requests |
934
|
0
|
0
|
|
|
|
|
$warnsub->("#WITHIN clause") if (@{$q->getWithin}); |
|
0
|
|
|
|
|
|
|
935
|
0
|
0
|
|
|
|
|
$warnsub->("#CNTXT clause") if ($q->getContextSentencesCount); |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQToken')) { |
938
|
|
|
|
|
|
|
##-- CQToken: create attribute clause |
939
|
0
|
0
|
|
|
|
|
$warnsub->("negated query clause in native $logas request (".$q->toString.")") if ($q->getNegated); |
940
|
0
|
0
|
0
|
|
|
|
$warnsub->("explicit term-expansion chain in native $logas request (".$q->toString.")") if ($q->can('getExpanders') && @{$q->getExpanders}); |
|
0
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
|
942
|
0
|
|
0
|
|
|
|
my $attr = $q->getIndexName || $default; |
943
|
0
|
0
|
|
|
|
|
if (ref($q) =~ /^DDC::\w+::CQTok(?:Exact|Set|Regex|Any)$/) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
944
|
0
|
|
|
|
|
|
$aq = $q; |
945
|
|
|
|
|
|
|
} elsif (ref($q) =~ /^DDC::\w+::CQTokInfl$/) { |
946
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokExact->new($q->getIndexName, $q->getValue); |
947
|
|
|
|
|
|
|
} elsif (ref($q) =~ /^DDC::\w+::CQTokSetInfl$/) { |
948
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokSet->new($q->getIndexName, $q->getValue, $q->getValues); |
949
|
|
|
|
|
|
|
} elsif (ref($q) =~ /^DDC::\w+::CQTokPrefix$/) { |
950
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($q->getIndexName, '^'.quotemeta($q->getValue)); |
951
|
|
|
|
|
|
|
} elsif (ref($q) =~ /^DDC::\w+::CQTokSuffix$/) { |
952
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($q->getIndexName, quotemeta($q->getValue).'$'); |
953
|
|
|
|
|
|
|
} elsif (ref($q) =~ /^DDC::\w+::CQTokInfix$/) { |
954
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($q->getIndexName, quotemeta($q->getValue)); |
955
|
|
|
|
|
|
|
} else { |
956
|
0
|
|
|
|
|
|
$warnsub->("token query clause of type ".ref($q)." in native $logas request (".$q->toString.")"); |
957
|
|
|
|
|
|
|
} |
958
|
0
|
0
|
0
|
|
|
|
$aq=undef if ($aq && $aq->isa('DDC::Any::CQTokAny')); ##-- empty value, e.g. for groupby |
959
|
0
|
|
|
|
|
|
push(@$areqs, [$attr,$aq]); |
960
|
|
|
|
|
|
|
} |
961
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFilter')) { |
962
|
|
|
|
|
|
|
##-- CQFilter |
963
|
0
|
0
|
0
|
|
|
|
if ($q->isa('DDC::Any::CQFHasField')) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
964
|
|
|
|
|
|
|
##-- CQFilter: CQFHasField |
965
|
0
|
|
|
|
|
|
my $attr = $q->getArg0; |
966
|
0
|
0
|
|
|
|
|
if ($q->isa('DDC::Any::CQFHasFieldValue')) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
967
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokExact->new($attr, $q->getArg1); |
968
|
|
|
|
|
|
|
} |
969
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFHasFieldSet')) { |
970
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokSet->new($attr, $q->getArg1, $q->getValues); |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFHasFieldRegex')) { |
973
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($attr, $q->getArg1); |
974
|
|
|
|
|
|
|
} |
975
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFHasFieldPrefix')) { |
976
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($attr, '^'.quotemeta($q->getArg1)); |
977
|
|
|
|
|
|
|
} |
978
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFHasFieldSuffix')) { |
979
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($attr, quotemeta($q->getArg1).'$'); |
980
|
|
|
|
|
|
|
} |
981
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFHasFieldInfix')) { |
982
|
0
|
|
|
|
|
|
$aq = DDC::Any::CQTokRegex->new($attr, quotemeta($q->getArg1)); |
983
|
|
|
|
|
|
|
} |
984
|
|
|
|
|
|
|
else { |
985
|
0
|
|
|
|
|
|
$warnsub->("filter of type ".ref($q)." unsupported in native $logas request (".$q->toString.")"); |
986
|
|
|
|
|
|
|
} |
987
|
0
|
0
|
0
|
|
|
|
$aq=undef if ($aq && $aq->isa('DDC::Any::CQTokAny')); ##-- empty value, e.g. for groupby |
988
|
0
|
|
|
|
|
|
push(@$areqs, [$attr,$aq]); |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFRandomSort') || $q->isa('DDC::Any::CQFRankSort')) { |
991
|
|
|
|
|
|
|
##-- CQFilter: CQFRandomSort, CQFRanksort: ignore |
992
|
0
|
|
|
|
|
|
next; |
993
|
|
|
|
|
|
|
} |
994
|
|
|
|
|
|
|
elsif ($q->isa('DDC::Any::CQFSort') && ($q->getArg1 ne '' || $q->getArg2 ne '')) { |
995
|
|
|
|
|
|
|
##-- CQFilter: CQFSort: other |
996
|
0
|
|
|
|
|
|
$warnsub->("filter of type ".ref($q)." with nontrivial bounds in native $logas request (".$q->toString.")"); |
997
|
|
|
|
|
|
|
} |
998
|
|
|
|
|
|
|
} |
999
|
|
|
|
|
|
|
else { |
1000
|
|
|
|
|
|
|
##-- something else |
1001
|
0
|
|
|
|
|
|
$warnsub->("query clause of type ".ref($q)." in native $logas request (".$q->toString.")"); |
1002
|
|
|
|
|
|
|
} |
1003
|
|
|
|
|
|
|
} |
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
##-- check for unsupported attributes & normalize attribute names |
1006
|
0
|
|
|
|
|
|
my $allowExtra = $opts{allowExtra}; |
1007
|
0
|
0
|
|
|
|
|
$allowExtra = { map {($_=>undef)} @$allowExtra } if (!UNIVERSAL::isa($allowExtra,'HASH')); |
|
0
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
@$areqs = grep { |
1009
|
0
|
|
|
|
|
|
$attr = $coldb->attrName($_->[0]); |
|
0
|
|
|
|
|
|
|
1010
|
0
|
0
|
0
|
|
|
|
if ( !$opts{allowUnknown} && !$coldb->hasAttr($attr) && !($allowExtra && exists($allowExtra->{$attr})) ) { |
1011
|
0
|
|
0
|
|
|
|
$warnsub->("unsupported attribute '".($_->[0]//'(undef)')."' in $logas request"); |
1012
|
0
|
|
|
|
|
|
0 |
1013
|
|
|
|
|
|
|
} else { |
1014
|
0
|
|
|
|
|
|
$_->[0] = $attr; |
1015
|
0
|
|
|
|
|
|
1 |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
} @$areqs; |
1018
|
|
|
|
|
|
|
|
1019
|
0
|
|
|
|
|
|
return $areqs; |
1020
|
|
|
|
|
|
|
} |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
1023
|
|
|
|
|
|
|
## \@aqs = $coldb->parseRequest($request, %opts) |
1024
|
|
|
|
|
|
|
## + guts for parsing user target and groupby requests into attribute-wise ARRAY-ref [[$attr1,$val1], ...] |
1025
|
|
|
|
|
|
|
## + see parseQuery() method for supported $request formats and %opts |
1026
|
|
|
|
|
|
|
## + wraps $coldb->queryAttributes($coldb->parseQuery($request,%opts)) |
1027
|
|
|
|
|
|
|
sub parseRequest { |
1028
|
0
|
|
|
0
|
1
|
|
my ($coldb,$req,%opts) = @_; |
1029
|
0
|
|
|
|
|
|
return $coldb->queryAttributes($coldb->parseQuery($req,%opts),%opts); |
1030
|
|
|
|
|
|
|
} |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
1033
|
|
|
|
|
|
|
## \%groupby = $coldb->groupby($groupby_request, %opts) |
1034
|
|
|
|
|
|
|
## \%groupby = $coldb->groupby(\%groupby, %opts) |
1035
|
|
|
|
|
|
|
## + $grouby_request : see parseRequest() |
1036
|
|
|
|
|
|
|
## + returns a HASH-ref: |
1037
|
|
|
|
|
|
|
## ( |
1038
|
|
|
|
|
|
|
## req => $request, ##-- save request |
1039
|
|
|
|
|
|
|
## ti2g => \&ti2g, ##-- group-tuple extraction code ($ti => $gtuple) : $g_packed = $ti2g->($ti) |
1040
|
|
|
|
|
|
|
## ts2g => \&ts2g, ##-- group-tuple extraction code ($ts => $gtuple) : $g_packed = $ts2g->($ts) |
1041
|
|
|
|
|
|
|
## g2s => \&g2s, ##-- stringification object suitable for DiaColloDB::Profile::stringify() [CODE,enum, or undef] : $g_str = $g2s->($g_packed) |
1042
|
|
|
|
|
|
|
## s2g => \&s2g, ##-- inverse-stringification object (for 2nd-pass processing) |
1043
|
|
|
|
|
|
|
## s2gx => \&s2gx, ##-- inverse-stringification object (for use with extend(), returns undef for unknown string components) |
1044
|
|
|
|
|
|
|
## g2txt => \&g2txt, ##-- compatible join()-string stringifcation sub (decimal numeric strings) |
1045
|
|
|
|
|
|
|
## txt2g => \&txt2g, ##-- compatible inverse-string stringifcation sub (decimal numeric strings) |
1046
|
|
|
|
|
|
|
## tpack => \@tpack, ##-- group-attribute-wise pack-templates, given @ttuple |
1047
|
|
|
|
|
|
|
## gpack => \@gpack, ##-- group-attribute-wise pack-templates, given @gtuple |
1048
|
|
|
|
|
|
|
## areqs => \@areqs, ##-- parsed attribute requests ([$attr,$ahaving],...) |
1049
|
|
|
|
|
|
|
## attrs => \@attrs, ##-- like $coldb->attrs($groupby_request), modulo "having" parts |
1050
|
|
|
|
|
|
|
## titles => \@titles, ##-- like map {$coldb->attrTitle($_)} @attrs |
1051
|
|
|
|
|
|
|
## ) |
1052
|
|
|
|
|
|
|
## + %opts: |
1053
|
|
|
|
|
|
|
## warn => $level, ##-- log-level for unknown attributes (default: 'warn') |
1054
|
|
|
|
|
|
|
## relax => $bool, ##-- allow unsupported attributes (default=0) |
1055
|
|
|
|
|
|
|
## tenum => $tenum, ##-- enum to use for \&t2g and \&t2s (default: $coldb->{tenum}) |
1056
|
|
|
|
|
|
|
sub groupby { |
1057
|
0
|
|
|
0
|
1
|
|
my ($coldb,$gbreq,%opts) = @_; |
1058
|
0
|
0
|
|
|
|
|
return $gbreq if (UNIVERSAL::isa($gbreq,'HASH')); |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
##-- get data |
1061
|
0
|
|
0
|
|
|
|
my $wlevel = $opts{warn} // 'warn'; |
1062
|
0
|
|
|
|
|
|
my $gb = { req=>$gbreq }; |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
##-- get attribute requests |
1065
|
0
|
|
|
|
|
|
my $gbareqs = $gb->{areqs} = $coldb->parseRequest($gb->{req}, %opts,parseas=>'groupby',logas=>'groupby'); |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
##-- get attribute names (compat) |
1068
|
0
|
|
|
|
|
|
my $gbattrs = $gb->{attrs} = [map {$_->[0]} @$gbareqs]; |
|
0
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
##-- get attribute titles |
1071
|
0
|
|
|
|
|
|
$gb->{titles} = [map {$coldb->attrTitle($_)} @$gbattrs]; |
|
0
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
##-- get groupby-sub |
1074
|
0
|
|
0
|
|
|
|
my $tenum = $opts{tenum} // $coldb->{tenum}; |
1075
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
1076
|
0
|
|
|
|
|
|
my $pack_ids = "($pack_id)*"; |
1077
|
0
|
|
|
|
|
|
my $len_id = packsize($pack_id); |
1078
|
0
|
|
|
|
|
|
my @gbtpack = @{$gb->{tpack} = [map {$coldb->{"pack_t$_"}} @$gbattrs]}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1079
|
0
|
|
|
|
|
|
my $gbtpack = join('',@gbtpack); |
1080
|
0
|
|
|
|
|
|
my @gbgpack = @{$gb->{gpack} = [map {'@'.($_*$len_id).$pack_id} (0..$#$gbattrs)]}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1081
|
0
|
|
|
|
|
|
my ($ids); |
1082
|
|
|
|
|
|
|
my @gbids = ( |
1083
|
|
|
|
|
|
|
map { |
1084
|
0
|
|
|
|
|
|
($_->[1] && !UNIVERSAL::isa($_->[1],'DDC::Any::CQTokAny') |
1085
|
|
|
|
|
|
|
? { |
1086
|
0
|
|
|
|
|
|
map {($_=>undef)} |
1087
|
0
|
0
|
0
|
|
|
|
@{$coldb->enumIds($coldb->{"$_->[0]enum"}, $_->[1], logLevel=>$coldb->{logProfile}, logPrefix=>"groupby(): fetch filter ids: $_->[0]")} |
|
0
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
} |
1089
|
|
|
|
|
|
|
: undef) |
1090
|
|
|
|
|
|
|
} @$gbareqs); |
1091
|
|
|
|
|
|
|
|
1092
|
0
|
|
|
|
|
|
my (@gi,$gi,$ti2g_code,$ts2g_code); |
1093
|
0
|
0
|
|
|
|
|
if (grep {$_} @gbids) { |
|
0
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
##-- group-by code: with having-filters |
1095
|
|
|
|
|
|
|
$ts2g_code = ('' |
1096
|
|
|
|
|
|
|
.qq{ \@gi=unpack('$gbtpack',\$_[0]);} |
1097
|
0
|
|
|
|
|
|
.qq{ return undef if (}.join(' || ', map {"!exists(\$gbids[$_]{\$gi[$_]})"} grep {defined($gbids[$_])} (0..$#gbids)).qq{);} |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
.qq{ return pack('$pack_ids',\@gi); } |
1099
|
|
|
|
|
|
|
); |
1100
|
|
|
|
|
|
|
} |
1101
|
|
|
|
|
|
|
else { |
1102
|
|
|
|
|
|
|
##-- group-by code: no filters |
1103
|
0
|
|
|
|
|
|
$ts2g_code = qq{ pack('$pack_ids', unpack('$gbtpack', \$_[0])) }; |
1104
|
|
|
|
|
|
|
} |
1105
|
0
|
|
|
|
|
|
my $ts2g_sub = eval qq{sub {$ts2g_code}}; |
1106
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile tuple-based aggregation code sub {$ts2g_code}: $@") if (!$ts2g_sub); |
1107
|
0
|
|
|
|
|
|
$@=''; |
1108
|
0
|
|
|
|
|
|
$gb->{ts2g} = $ts2g_sub; |
1109
|
|
|
|
|
|
|
|
1110
|
0
|
|
|
|
|
|
($ti2g_code = $ts2g_code) =~ s{\$_\[0\]}{\$tenum->i2s(\$_[0])}; |
1111
|
0
|
|
|
|
|
|
my $ti2g_sub = eval qq{sub {$ti2g_code}}; |
1112
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile id-based aggregation code sub {$ti2g_code}: $@") if (!$ti2g_sub); |
1113
|
0
|
|
|
|
|
|
$@=''; |
1114
|
0
|
|
|
|
|
|
$gb->{ti2g} = $ti2g_sub; |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
##-- get stringification sub(s) |
1117
|
0
|
|
|
|
|
|
my ($genum,@genums,$g2scode,$s2gcode,$s2gxcode); |
1118
|
0
|
0
|
|
|
|
|
if (@$gbattrs == 1) { |
1119
|
|
|
|
|
|
|
##-- stringify a single attribute |
1120
|
0
|
|
|
|
|
|
$genum = $coldb->{$gbattrs->[0]."enum"}; |
1121
|
0
|
|
|
|
|
|
$g2scode = qq{ \$genum->i2s(unpack('$pack_id',\$_[0])) }; |
1122
|
0
|
|
|
|
|
|
$s2gcode = qq{ pack('$pack_id', \$genum->s2i(\$_[0]) // 0) }; |
1123
|
0
|
|
|
|
|
|
$s2gxcode = qq{ return undef if (!defined(\$gi=\$genum->s2i(\$_[0]))); return pack('$pack_id',\$gi); }; |
1124
|
|
|
|
|
|
|
} |
1125
|
|
|
|
|
|
|
else { |
1126
|
0
|
|
|
|
|
|
@genums = map {$coldb->{$_."enum"}} @$gbattrs; |
|
0
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
$g2scode = ('' |
1128
|
|
|
|
|
|
|
.qq{ \@gi=unpack('$pack_ids', \$_[0]); } |
1129
|
0
|
|
|
|
|
|
.q{ join("\t",}.join(', ', map {"\$genums[$_]->i2s(\$gi[$_])"} (0..$#genums)).q{)} |
|
0
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
); |
1131
|
|
|
|
|
|
|
$s2gcode = ('' |
1132
|
|
|
|
|
|
|
.qq{ \@gi=split(/\\t/, \$_[0]); } |
1133
|
0
|
|
|
|
|
|
.qq{ pack('$pack_ids',}.join(', ', map {"\$genums[$_]->s2i(\$gi[$_]) // 0"} (0..$#genums)).q{)} |
|
0
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
); |
1135
|
|
|
|
|
|
|
$s2gxcode = ('' |
1136
|
|
|
|
|
|
|
.qq{ \@gi=split(/\\t/, \$_[0]); } |
1137
|
0
|
|
|
|
|
|
.qq{ return undef if (}.join(' || ', map {"!defined(\$gi[$_]=\$genums[$_]->s2i(\$gi[$_]))"} (0..$#genums)).q{);} |
|
0
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
.qq{ return pack('$pack_ids',\@gi); } |
1139
|
|
|
|
|
|
|
); |
1140
|
|
|
|
|
|
|
} |
1141
|
0
|
|
|
|
|
|
my $g2s = eval qq{sub {$g2scode}}; |
1142
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile stringification code sub {$g2scode}: $@") if (!$g2s); |
1143
|
0
|
|
|
|
|
|
$@=''; |
1144
|
0
|
|
|
|
|
|
$gb->{g2s} = $g2s; |
1145
|
|
|
|
|
|
|
|
1146
|
0
|
|
|
|
|
|
my $s2g = eval qq{sub {$s2gcode}}; |
1147
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile inverse-stringification code sub {$s2gcode}: $@") if (!$s2g); |
1148
|
0
|
|
|
|
|
|
$@=''; |
1149
|
0
|
|
|
|
|
|
$gb->{s2g} = $s2g; |
1150
|
|
|
|
|
|
|
|
1151
|
0
|
|
|
|
|
|
my $s2gx = eval qq{sub {$s2gxcode}}; |
1152
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile inverse-stringification code sub for extend {$s2gxcode}: $@") if (!$s2gx); |
1153
|
0
|
|
|
|
|
|
$@=''; |
1154
|
0
|
|
|
|
|
|
$gb->{s2gx} = $s2gx; |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
##-- get pseudo-stringification sub ("\t"-joined decimal integer ids) |
1157
|
0
|
|
|
|
|
|
my ($g2txt_code,$txt2g_code); |
1158
|
0
|
0
|
|
|
|
|
if (@$gbattrs == 1) { |
1159
|
|
|
|
|
|
|
##-- stringify a single attribute |
1160
|
0
|
|
|
|
|
|
$g2txt_code = qq{ unpack('$pack_id',\$_[0]) }; |
1161
|
0
|
|
|
|
|
|
$txt2g_code = qq{ pack('$pack_id',\$_[0] // 0) }; |
1162
|
|
|
|
|
|
|
} |
1163
|
|
|
|
|
|
|
else { |
1164
|
0
|
|
|
|
|
|
$g2txt_code = qq{ join("\t",unpack('$pack_ids', \$_[0])); }; |
1165
|
0
|
|
|
|
|
|
$txt2g_code = qq{ pack('$pack_ids', split(/\t/, \$_[0] // 0)); }; |
1166
|
|
|
|
|
|
|
} |
1167
|
0
|
|
|
|
|
|
my $g2txt = eval qq{sub {$g2txt_code}}; |
1168
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile pseudo-stringification code sub {$g2txt_code}: $@") if (!$g2txt); |
1169
|
0
|
|
|
|
|
|
$@=''; |
1170
|
0
|
|
|
|
|
|
$gb->{g2txt} = $g2txt; |
1171
|
|
|
|
|
|
|
|
1172
|
0
|
|
|
|
|
|
my $txt2g = eval qq{sub {$txt2g_code}}; |
1173
|
0
|
0
|
|
|
|
|
$coldb->logconfess($coldb->{error}="groupby(): could not compile inverse pseudo-stringification code sub {$txt2g_code}: $@") if (!$txt2g); |
1174
|
0
|
|
|
|
|
|
$@=''; |
1175
|
0
|
|
|
|
|
|
$gb->{txt2g} = $txt2g; |
1176
|
|
|
|
|
|
|
|
1177
|
0
|
|
|
|
|
|
return $gb; |
1178
|
|
|
|
|
|
|
} |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
1181
|
|
|
|
|
|
|
## $cqfilter = $coldb->query2filter($attr,$cquery,%opts) |
1182
|
|
|
|
|
|
|
## + converts a CQToken to a CQFilter, for ddc parsing |
1183
|
|
|
|
|
|
|
## + %opts: |
1184
|
|
|
|
|
|
|
## logas => $logas, ##-- log-prefix for warnings |
1185
|
|
|
|
|
|
|
sub query2filter { |
1186
|
0
|
|
|
0
|
1
|
|
my ($coldb,$attr,$q,%opts) = @_; |
1187
|
0
|
0
|
|
|
|
|
return undef if (!defined($q)); |
1188
|
0
|
|
0
|
|
|
|
my $logas = $opts{logas} || 'query2filter'; |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
##-- document attribute ("doc.ATTR" convention) |
1191
|
0
|
|
0
|
|
|
|
my $field = $coldb->attrName( $attr // $q->getIndexName ); |
1192
|
0
|
0
|
|
|
|
|
$field = $1 if ($field =~ /^doc\.(.*)$/); |
1193
|
0
|
0
|
0
|
|
|
|
if (UNIVERSAL::isa($q, 'DDC::Any::CQTokAny')) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1194
|
0
|
|
|
|
|
|
return undef; |
1195
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($q, 'DDC::Any::CQTokExact') || UNIVERSAL::isa($q, 'DDC::Any::CQTokInfl')) { |
1196
|
0
|
|
|
|
|
|
return DDC::Any::CQFHasField->new($field, $q->getValue, $q->getNegated); |
1197
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($q, 'DDC::Any::CQTokSet') || UNIVERSAL::isa($q, 'DDC::Any::CQTokSetInfl')) { |
1198
|
0
|
|
|
|
|
|
return DDC::Any::CQFHasFieldSet->new($field, $q->getValues, $q->getNegated); |
1199
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($q, 'DDC::Any::CQTokRegex')) { |
1200
|
0
|
|
|
|
|
|
return DDC::Any::CQFHasFieldRegex->new($field, $q->getValue, $q->getNegated); |
1201
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($q, 'DDC::Any::CQTokPrefix')) { |
1202
|
0
|
|
|
|
|
|
return DDC::Any::CQFHasFieldPrefix->new($field, $q->getValue, $q->getNegated); |
1203
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($q, 'DDC::Any::CQTokSuffix')) { |
1204
|
0
|
|
|
|
|
|
return DDC::Any::CQFHasFieldSuffix->new($field, $q->getValue, $q->getNegated); |
1205
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($q, 'DDC::Any::CQTokInfix')) { |
1206
|
0
|
|
|
|
|
|
return DDC::Any::CQFHasFieldInfix->new($field, $q->getValue, $q->getNegated); |
1207
|
|
|
|
|
|
|
} else { |
1208
|
0
|
|
|
|
|
|
$coldb->logconfess("can't handle metadata restriction of type ", ref($q), " in $logas request: \`", $q->toString, "'"); |
1209
|
|
|
|
|
|
|
} |
1210
|
|
|
|
|
|
|
} |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
1213
|
|
|
|
|
|
|
## ($CQCountKeyExprs,\$CQRestrict,\@CQFilters) = $coldb->parseGroupBy($groupby_string_or_request,%opts) |
1214
|
|
|
|
|
|
|
## + for ddc-mode parsing |
1215
|
|
|
|
|
|
|
## + %opts: |
1216
|
|
|
|
|
|
|
## date => $date, |
1217
|
|
|
|
|
|
|
## slice => $slice, |
1218
|
|
|
|
|
|
|
## matchid => $matchid, ##-- default match-id |
1219
|
|
|
|
|
|
|
sub parseGroupBy { |
1220
|
0
|
|
|
0
|
1
|
|
my ($coldb,$req,%opts) = @_; |
1221
|
0
|
|
0
|
|
|
|
$req //= $coldb->attrs; |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
##-- groupby clause: date |
1224
|
|
|
|
|
|
|
my $gbdate = ($opts{slice}<=0 |
1225
|
|
|
|
|
|
|
? DDC::Any::CQCountKeyExprConstant->new($opts{slice}||'0') |
1226
|
0
|
0
|
0
|
|
|
|
: DDC::Any::CQCountKeyExprDateSlice->new('date',$opts{slice})); |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
##-- groupby clause: user request |
1229
|
0
|
|
|
|
|
|
my $gbexprs = [$gbdate]; |
1230
|
0
|
|
|
|
|
|
my $gbfilters = []; |
1231
|
0
|
|
|
|
|
|
my ($gbrestr); |
1232
|
0
|
0
|
0
|
|
|
|
if (!ref($req) && $req =~ m{^\s*(?:\#by)?\s*\[(.*)\]\s*$}) { |
1233
|
|
|
|
|
|
|
##-- ddc-style request; no restriction-clauses are allowed |
1234
|
0
|
|
|
|
|
|
my $cbstr = $1; |
1235
|
|
|
|
|
|
|
my $gbq = $coldb->qparse("count(*) #by[$cbstr]") |
1236
|
0
|
0
|
|
|
|
|
or $coldb->logconfess($coldb->{error}="failed to parse DDC groupby request \`$req': $coldb->{error}"); |
1237
|
0
|
|
|
|
|
|
push(@$gbexprs, @{$gbq->getKeys->getExprs}); |
|
0
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
$_->setMatchId($opts{matchid}//0) |
1239
|
0
|
0
|
0
|
|
|
|
foreach (grep {UNIVERSAL::isa($_,'DDC::Any::CQCountKeyExprToken') && !$_->HasMatchId} @$gbexprs); |
|
0
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
} |
1241
|
|
|
|
|
|
|
else { |
1242
|
|
|
|
|
|
|
##-- native-style request with optional restrictions |
1243
|
0
|
|
|
|
|
|
my $gbreq = $coldb->parseRequest($req, parseas=>'groupby', logas=>'groupby', default=>undef, relax=>1, allowUnknown=>1); |
1244
|
0
|
|
|
|
|
|
my ($filter); |
1245
|
0
|
|
|
|
|
|
foreach (@$gbreq) { |
1246
|
0
|
|
|
|
|
|
push(@$gbexprs, $coldb->attrCountBy($_->[0], 2)); |
1247
|
0
|
0
|
|
|
|
|
if ($_->[0] =~ /^doc\./) { |
1248
|
|
|
|
|
|
|
##-- document attribute ("doc.ATTR" convention) |
1249
|
0
|
0
|
|
|
|
|
push(@$gbfilters, $filter) if (defined($filter=$coldb->query2filter($_->[0], $_->[1]))); |
1250
|
|
|
|
|
|
|
} |
1251
|
|
|
|
|
|
|
else { |
1252
|
|
|
|
|
|
|
##-- token attribute |
1253
|
0
|
0
|
0
|
|
|
|
if (defined($_->[1]) && !UNIVERSAL::isa($_->[1], 'DDC::Any::CQTokAny')) { |
1254
|
0
|
0
|
|
|
|
|
$gbrestr = (defined($gbrestr) ? DDC::Any::CQWith->new($gbrestr,$_->[1]) : $_->[1]); |
1255
|
|
|
|
|
|
|
} |
1256
|
|
|
|
|
|
|
} |
1257
|
|
|
|
|
|
|
} |
1258
|
|
|
|
|
|
|
} |
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
##-- finalize: expression list |
1261
|
0
|
|
|
|
|
|
my $xlist = DDC::Any::CQCountKeyExprList->new; |
1262
|
0
|
|
|
|
|
|
$xlist->setExprs($gbexprs); |
1263
|
|
|
|
|
|
|
|
1264
|
0
|
|
|
|
|
|
return ($xlist,$gbrestr,$gbfilters); |
1265
|
|
|
|
|
|
|
} |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
1268
|
|
|
|
|
|
|
## Profiling: Generic |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
## $mprf = $coldb->profile($relation, %opts) |
1271
|
|
|
|
|
|
|
## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object |
1272
|
|
|
|
|
|
|
## + %opts: |
1273
|
|
|
|
|
|
|
## ( |
1274
|
|
|
|
|
|
|
## ##-- selection parameters |
1275
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... |
1276
|
|
|
|
|
|
|
## date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
1277
|
|
|
|
|
|
|
## ## |
1278
|
|
|
|
|
|
|
## ##-- aggregation parameters |
1279
|
|
|
|
|
|
|
## slice => $slice, ##-- date slice (default=1, 0 for global profile) |
1280
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
1281
|
|
|
|
|
|
|
## ## |
1282
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
1283
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
1284
|
|
|
|
|
|
|
## score => $func, ##-- scoring function (f|fm|lf|lfm|mi|ld) : default="f" |
1285
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
1286
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score |
1287
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
1288
|
|
|
|
|
|
|
## ## |
1289
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
1290
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify output profile(s) (default=do) |
1291
|
|
|
|
|
|
|
## fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices |
1292
|
|
|
|
|
|
|
## onepass => $bool, ##-- if true, use fast but incorrect 1-pass method (Cofreqs profiling only, >= v0.09.001) |
1293
|
|
|
|
|
|
|
## ) |
1294
|
|
|
|
|
|
|
## + sets default %opts and wraps $coldb->relation($rel)->profile($coldb, %opts) |
1295
|
|
|
|
|
|
|
sub profile { |
1296
|
0
|
|
|
0
|
1
|
|
my ($coldb,$rel,%opts) = @_; |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
##-- defaults |
1299
|
0
|
|
|
|
|
|
$coldb->profileOptions(\%opts); |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
##-- debug |
1302
|
|
|
|
|
|
|
$coldb->vlog($coldb->{logRequest}, |
1303
|
|
|
|
|
|
|
"profile(" |
1304
|
|
|
|
|
|
|
.join(', ', |
1305
|
0
|
|
0
|
|
|
|
map {"$_->[0]='".quotemeta($_->[1]//'')."'"} |
1306
|
|
|
|
|
|
|
([rel=>$rel], |
1307
|
|
|
|
|
|
|
[query=>$opts{query}], |
1308
|
0
|
|
|
|
|
|
[groupby=>UNIVERSAL::isa($opts{groupby},'ARRAY') ? join(',', @{$opts{groupby}}) : $opts{groupby}], |
1309
|
0
|
0
|
|
|
|
|
(map {[$_=>$opts{$_}]} qw(date slice score eps kbest cutoff global onepass)), |
|
0
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
)) |
1311
|
|
|
|
|
|
|
.")"); |
1312
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
##-- relation |
1314
|
0
|
|
|
|
|
|
my ($reldb); |
1315
|
0
|
0
|
0
|
|
|
|
if (!defined($reldb=$coldb->relation($rel||'cof'))) { |
1316
|
0
|
|
0
|
|
|
|
$coldb->logwarn($coldb->{error}="profile(): unknown relation '".($rel//'-undef-')."'"); |
1317
|
0
|
|
|
|
|
|
return undef; |
1318
|
|
|
|
|
|
|
} |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
##-- delegate |
1321
|
0
|
|
|
|
|
|
return $reldb->profile($coldb,%opts); |
1322
|
|
|
|
|
|
|
} |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
## \%opts = $CLASS_OR_OBJECT->profileOptions(\%opts) |
1325
|
|
|
|
|
|
|
## + instantiates default options for profile() method |
1326
|
|
|
|
|
|
|
sub profileOptions { |
1327
|
0
|
|
|
0
|
1
|
|
my ($that,$opts) = @_; |
1328
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
##-- defaults |
1330
|
0
|
|
0
|
|
|
|
$opts->{query} = (grep {defined($_)} @$opts{qw(query q lemma lem l)})[0] // ''; |
|
0
|
|
|
|
|
|
|
1331
|
0
|
|
0
|
|
|
|
$opts->{date} //= ''; |
1332
|
0
|
|
0
|
|
|
|
$opts->{slice} //= 1; |
1333
|
0
|
0
|
0
|
|
|
|
$opts->{groupby} ||= join(',', map {quotemeta($_)} @{$that->attrs}) if (ref($that)); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1334
|
0
|
|
0
|
|
|
|
$opts->{score} //= 'f'; |
1335
|
0
|
|
0
|
|
|
|
$opts->{eps} //= 0; #0.5; |
1336
|
0
|
|
0
|
|
|
|
$opts->{kbest} //= -1; |
1337
|
0
|
|
0
|
|
|
|
$opts->{cutoff} //= ''; |
1338
|
0
|
|
0
|
|
|
|
$opts->{global} //= 0; |
1339
|
0
|
|
0
|
|
|
|
$opts->{strings} //= 1; |
1340
|
0
|
|
0
|
|
|
|
$opts->{fill} //= 0; |
1341
|
0
|
|
0
|
|
|
|
$opts->{onepass} //= 0; |
1342
|
|
|
|
|
|
|
|
1343
|
0
|
|
|
|
|
|
return $opts; |
1344
|
|
|
|
|
|
|
} |
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
1347
|
|
|
|
|
|
|
## Profiling: extend (pass-2 for multi-clients) |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
## $mprf = $coldb->extend($relation, %opts) |
1350
|
|
|
|
|
|
|
## + get independent f2 frequencies for $opts{slice2keys}, which is EITHER |
1351
|
|
|
|
|
|
|
## - a HASH-ref {$slice1=>\@keys1, ...}, |
1352
|
|
|
|
|
|
|
## OR |
1353
|
|
|
|
|
|
|
## - a JSON-string encoding a such a HASH-ref |
1354
|
|
|
|
|
|
|
## + %opts, as for profile(), except: |
1355
|
|
|
|
|
|
|
## ( |
1356
|
|
|
|
|
|
|
## ##-- selection parameters |
1357
|
|
|
|
|
|
|
## query => $query, ##-- target request ATTR:REQ... : mostly IGNORED (but used e.g. by ddc back-end) |
1358
|
|
|
|
|
|
|
## slice2keys => \%slice2keys, ##-- target f2-items or JSON-string |
1359
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters : IGNORED |
1360
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters: IGNORED |
1361
|
|
|
|
|
|
|
## ) |
1362
|
|
|
|
|
|
|
## + returns a DiaColloDB::Profile::Multi containing the appropriate f2 entries |
1363
|
|
|
|
|
|
|
sub extend { |
1364
|
0
|
|
|
0
|
1
|
|
my ($coldb,$rel,%opts) = @_; |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
##-- defaults |
1367
|
0
|
|
|
|
|
|
$coldb->profileOptions(\%opts); |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
##-- items |
1370
|
0
|
|
0
|
|
|
|
$opts{slice2keys} //= ''; |
1371
|
|
|
|
|
|
|
$opts{slice2keys} = DiaColloDB::Utils::loadJsonString($opts{slice2keys}) |
1372
|
0
|
0
|
0
|
|
|
|
if ($opts{slice2keys} && !ref($opts{slice2keys})); |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
##-- debug |
1375
|
|
|
|
|
|
|
$coldb->vlog($coldb->{logRequest}, |
1376
|
|
|
|
|
|
|
"extend(" |
1377
|
|
|
|
|
|
|
.join(', ', |
1378
|
0
|
|
0
|
|
|
|
map {"$_->[0]='".quotemeta($_->[1]//'')."'"} |
1379
|
|
|
|
|
|
|
([rel=>$rel], |
1380
|
|
|
|
|
|
|
[query=>$opts{query}], |
1381
|
0
|
|
|
|
|
|
[groupby=>UNIVERSAL::isa($opts{groupby},'ARRAY') ? join(',', @{$opts{groupby}}) : $opts{groupby}], |
1382
|
0
|
0
|
|
|
|
|
(map {[$_=>$opts{$_}]} qw(date slice)), |
|
0
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
)) |
1384
|
|
|
|
|
|
|
.")"); |
1385
|
|
|
|
|
|
|
|
1386
|
|
|
|
|
|
|
##-- relation |
1387
|
0
|
|
|
|
|
|
my ($reldb); |
1388
|
0
|
0
|
0
|
|
|
|
if (!defined($reldb=$coldb->relation($rel||'cof'))) { |
1389
|
0
|
|
0
|
|
|
|
$coldb->logwarn($coldb->{error}="extend(): unknown relation '".($rel//'-undef-')."'"); |
1390
|
0
|
|
|
|
|
|
return undef; |
1391
|
|
|
|
|
|
|
} |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
##-- delegate |
1394
|
0
|
|
|
|
|
|
return $reldb->extend($coldb,%opts); |
1395
|
|
|
|
|
|
|
} |
1396
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
|
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
1400
|
|
|
|
|
|
|
## Profiling: Comparison (diff) |
1401
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
## $mprf = $coldb->compare($relation, %opts) |
1403
|
|
|
|
|
|
|
## + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object |
1404
|
|
|
|
|
|
|
## + %opts: |
1405
|
|
|
|
|
|
|
## ( |
1406
|
|
|
|
|
|
|
## ##-- selection parameters |
1407
|
|
|
|
|
|
|
## (a|b)?query => $query, ##-- target query as for parseRequest() |
1408
|
|
|
|
|
|
|
## (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all |
1409
|
|
|
|
|
|
|
## ## |
1410
|
|
|
|
|
|
|
## ##-- aggregation parameters |
1411
|
|
|
|
|
|
|
## groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method |
1412
|
|
|
|
|
|
|
## (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) |
1413
|
|
|
|
|
|
|
## ## |
1414
|
|
|
|
|
|
|
## ##-- scoring and trimming parameters |
1415
|
|
|
|
|
|
|
## eps => $eps, ##-- smoothing constant (default=0) |
1416
|
|
|
|
|
|
|
## score => $func, ##-- scoring function (f|fm|lf|lfm|mi|ld) : default="f" |
1417
|
|
|
|
|
|
|
## kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all |
1418
|
|
|
|
|
|
|
## cutoff => $cutoff, ##-- minimum score (UNUSED for comparison profiles) |
1419
|
|
|
|
|
|
|
## global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) |
1420
|
|
|
|
|
|
|
## diff => $diff, ##-- low-level score-diff operation (diff|adiff|sum|min|max|avg|havg); default='adiff' |
1421
|
|
|
|
|
|
|
## ## |
1422
|
|
|
|
|
|
|
## ##-- profiling and debugging parameters |
1423
|
|
|
|
|
|
|
## strings => $bool, ##-- do/don't stringify (default=do) |
1424
|
|
|
|
|
|
|
## ) |
1425
|
|
|
|
|
|
|
## + sets default %opts and wraps $coldb->relation($rel)->compare($coldb, %opts) |
1426
|
1
|
|
|
1
|
|
637
|
BEGIN { *diff = \&compare; } |
1427
|
|
|
|
|
|
|
sub compare { |
1428
|
0
|
|
|
0
|
1
|
|
my ($coldb,$rel,%opts) = @_; |
1429
|
0
|
|
0
|
|
|
|
$rel //= 'cof'; |
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
##-- defaults and '[ab]OPTION' parsing |
1432
|
0
|
|
|
|
|
|
$coldb->compareOptions(\%opts); |
1433
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
##-- debug |
1435
|
|
|
|
|
|
|
$coldb->vlog($coldb->{logRequest}, |
1436
|
|
|
|
|
|
|
"compare(" |
1437
|
|
|
|
|
|
|
.join(', ', |
1438
|
0
|
|
0
|
|
|
|
map {"$_->[0]=".quotemeta($_->[1]//'')."'"} |
1439
|
|
|
|
|
|
|
([rel=>$rel], |
1440
|
0
|
|
|
|
|
|
(map {["a$_"=>$opts{"a$_"}]} (qw(query date slice))), |
1441
|
0
|
|
|
|
|
|
(map {["b$_"=>$opts{"b$_"}]} (qw(query date slice))), |
1442
|
0
|
|
|
|
|
|
[groupby=>(UNIVERSAL::isa($opts{groupby},'ARRAY') ? join(',',@{$opts{groupby}}) : $opts{groupby})], |
1443
|
0
|
0
|
|
|
|
|
(map {[$_=>$opts{$_}]} qw(score eps kbest cutoff global diff)), |
|
0
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
)) |
1445
|
|
|
|
|
|
|
.")"); |
1446
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
##-- relation |
1448
|
0
|
|
|
|
|
|
my ($reldb); |
1449
|
0
|
0
|
0
|
|
|
|
if (!defined($reldb=$coldb->relation($rel||'cof'))) { |
1450
|
0
|
|
0
|
|
|
|
$coldb->logwarn($coldb->{error}="profile(): unknown relation '".($rel//'-undef-')."'"); |
1451
|
0
|
|
|
|
|
|
return undef; |
1452
|
|
|
|
|
|
|
} |
1453
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
##-- delegate |
1455
|
0
|
|
|
|
|
|
return $reldb->compare($coldb,%opts); |
1456
|
|
|
|
|
|
|
} |
1457
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
## \%opts = $CLASS_OR_OBJECT->compareOptions(\%opts) |
1459
|
|
|
|
|
|
|
## + instantiates default options for compare() method |
1460
|
|
|
|
|
|
|
sub compareOptions { |
1461
|
0
|
|
|
0
|
1
|
|
my ($that,$opts) = @_; |
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
##-- defaults and '[ab]OPTION' parsing |
1464
|
0
|
|
|
|
|
|
foreach my $ab (qw(a b)) { |
1465
|
0
|
|
|
|
|
|
$opts->{"${ab}query"} = ((grep {defined($_)} @$opts{map {"${ab}$_"} qw(query q lemma lem l)}), |
|
0
|
|
|
|
|
|
|
1466
|
0
|
|
0
|
|
|
|
(grep {defined($_)} @$opts{qw(query q lemma lem l)}), |
|
0
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
)[0]//''; |
1468
|
|
|
|
|
|
|
} |
1469
|
0
|
|
|
|
|
|
foreach my $attr (qw(date slice)) { |
1470
|
0
|
0
|
|
|
|
|
$opts->{"a$attr"} = ((map {defined($opts->{"a$_"}) ? $opts->{"a$_"} : qw()} @{$ATTR_RALIAS{$attr}}), |
|
0
|
|
|
|
|
|
|
1471
|
0
|
0
|
0
|
|
|
|
(map {defined($opts->{$_}) ? $opts->{$_} : qw()} @{$ATTR_RALIAS{$attr}}), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
)[0]//''; |
1473
|
0
|
0
|
|
|
|
|
$opts->{"b$attr"} = ((map {defined($opts->{"b$_"}) ? $opts->{"b$_"} : qw()} @{$ATTR_RALIAS{$attr}}), |
|
0
|
|
|
|
|
|
|
1474
|
0
|
0
|
0
|
|
|
|
(map {defined($opts->{$_}) ? $opts->{$_} : qw()} @{$ATTR_RALIAS{$attr}}), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
)[0]//''; |
1476
|
|
|
|
|
|
|
} |
1477
|
0
|
|
|
|
|
|
delete @$opts{keys %ATTR_ALIAS}; |
1478
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
##-- diff defaults |
1480
|
0
|
|
0
|
|
|
|
$opts->{diff} //= 'adiff'; |
1481
|
0
|
|
0
|
|
|
|
$opts->{fill} //= 1; |
1482
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
##-- common defaults |
1484
|
0
|
|
|
|
|
|
$that->profileOptions($opts); |
1485
|
|
|
|
|
|
|
|
1486
|
0
|
|
|
|
|
|
return $opts; |
1487
|
|
|
|
|
|
|
} |
1488
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
##============================================================================== |
1490
|
|
|
|
|
|
|
## Footer |
1491
|
|
|
|
|
|
|
1; |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
__END__ |