line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::methods::compile.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, top-level compile-time methods (create, union, etc.) |
5
|
|
|
|
|
|
|
## + really just adds methods to top-level DiaColloDB package |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
##-- dummy package |
8
|
|
|
|
|
|
|
package DiaColloDB::methods::compile; |
9
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
10
|
|
|
|
|
|
|
1; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
package DiaColloDB; |
13
|
1
|
|
|
1
|
|
5
|
use vars qw($MMCLASS $ECLASS $XECLASS %TDF_OPTS $NJOBS); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
52
|
|
14
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
##============================================================================== |
17
|
|
|
|
|
|
|
## DiaColloDB: create/compile |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
20
|
|
|
|
|
|
|
## create: utils |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## \%line2undef = $coldb->loadFilterFile($filename_or_undef) |
23
|
|
|
|
|
|
|
## + now in DiaColloDB::Corpus::Filters (since v0.12.012_01); alias retained for compatibility |
24
|
1
|
|
|
1
|
|
720
|
BEGIN { *loadFilterFile = \&DiaColloDB::Corpus::Filters::loadListFile; } |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
## $filters = $coldb->corpusFilters() |
27
|
|
|
|
|
|
|
## + DiaColloDB::Corpus::Filters object from $coldb options |
28
|
|
|
|
|
|
|
sub corpusFilters { |
29
|
0
|
|
|
0
|
0
|
|
my $coldb = shift; |
30
|
0
|
|
|
|
|
|
return DiaColloDB::Corpus::Filters->new(map {($_=>$coldb->{$_})} |
|
0
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
@DiaColloDB::Corpus::Filters::NAMES, |
32
|
|
|
|
|
|
|
@DiaColloDB::Corpus::Filters::FILES); |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
## $multimap = $coldb->create_multimap($base, \%ts2i, $packfmt, $label="multimap") |
36
|
|
|
|
|
|
|
sub create_multimap { |
37
|
0
|
|
|
0
|
0
|
|
my ($coldb,$base,$ts2i,$packfmt,$label) = @_; |
38
|
0
|
|
0
|
|
|
|
$label //= "multimap"; |
39
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate},"create_multimap(): creating $label $base.*"); |
40
|
|
|
|
|
|
|
|
41
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
42
|
0
|
|
|
|
|
|
my $pack_mmb = "${pack_id}*"; ##-- multimap target-set pack format |
43
|
0
|
|
|
|
|
|
my @v2ti = qw(); |
44
|
0
|
|
|
|
|
|
my ($t,$ti,$vi); |
45
|
0
|
|
|
|
|
|
while (($t,$ti)=each %$ts2i) { |
46
|
0
|
|
|
|
|
|
($vi) = unpack($packfmt,$t); |
47
|
0
|
|
|
|
|
|
$v2ti[$vi] .= pack($pack_id,$ti); |
48
|
|
|
|
|
|
|
} |
49
|
0
|
|
0
|
|
|
|
$_ = pack($pack_mmb, sort {$a<=>$b} unpack($pack_mmb,$_//'')) foreach (@v2ti); ##-- ensure multimap target-sets are sorted |
|
0
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my $v2t = $coldb->mmclass($MMCLASS)->new(base=>$base, flags=>'rw', perms=>$coldb->{perms}, pack_i=>$pack_id, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_id}) |
52
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create_multimap(): failed to create $base.*: $!"); |
53
|
0
|
0
|
|
|
|
|
$v2t->fromArray(\@v2ti) |
54
|
|
|
|
|
|
|
or $coldb->logconfess("create_multimap(): failed to populate $base.*: $!"); |
55
|
0
|
0
|
|
|
|
|
$v2t->flush() |
56
|
|
|
|
|
|
|
or $coldb->logconfess("create_multimap(): failed to flush $base.*: $!"); |
57
|
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
return $v2t; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs() |
62
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs($attrs=$coldb->{attrs}, $default=[]) |
63
|
|
|
|
|
|
|
## + parse attributes in $attrs as array |
64
|
|
|
|
|
|
|
sub attrs { |
65
|
0
|
|
|
0
|
0
|
|
my ($coldb,$attrs,$default) = @_; |
66
|
0
|
|
0
|
|
|
|
$attrs //= $coldb->{attrs} // $default // []; |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
return $attrs if (UNIVERSAL::isa($attrs,'ARRAY')); |
68
|
0
|
0
|
|
|
|
|
return [grep {defined($_) && $_ ne ''} split(/[\s\,]+/, $attrs)]; |
|
0
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
## $aname = $CLASS_OR_OBJECT->attrName($attr) |
72
|
|
|
|
|
|
|
## + returns canonical (short) attribute name for $attr |
73
|
|
|
|
|
|
|
## + supports aliases in %ATTR_ALIAS = ($alias=>$name, ...) |
74
|
|
|
|
|
|
|
## + see also: |
75
|
|
|
|
|
|
|
## %ATTR_RALIAS = ($name=>\@aliases, ...) |
76
|
|
|
|
|
|
|
## %ATTR_CBEXPR = ($name=>$ddcCountByExpr, ...) |
77
|
|
|
|
|
|
|
## %ATTR_TITLE = ($name_or_alias=>$title, ...) |
78
|
|
|
|
|
|
|
our (%ATTR_ALIAS,%ATTR_RALIAS,%ATTR_TITLE,%ATTR_CBEXPR); |
79
|
|
|
|
|
|
|
BEGIN { |
80
|
|
|
|
|
|
|
%ATTR_RALIAS = ( |
81
|
3
|
|
|
|
|
11
|
'l' => [map {(uc($_),ucfirst($_),$_)} qw(lemma lem l)], |
82
|
3
|
|
|
|
|
9
|
'w' => [map {(uc($_),ucfirst($_),$_)} qw(token word w)], |
83
|
5
|
|
|
|
|
26
|
'p' => [map {(uc($_),ucfirst($_),$_)} qw(postag tag pt pos p)], |
84
|
|
|
|
|
|
|
## |
85
|
|
|
|
|
|
|
'doc.collection' => [qw(doc.collection collection doc.corpus corpus)], |
86
|
|
|
|
|
|
|
'doc.textClass' => [qw(doc.textClass textClass textclass tc)], #doc.genre genre |
87
|
|
|
|
|
|
|
'doc.genre' => [qw(doc.genre genre doc.textClass0 textClass0 textclass0 tc0)], |
88
|
|
|
|
|
|
|
'doc.title' => [qw(doc.title title)], |
89
|
|
|
|
|
|
|
'doc.author' => [qw(doc.author author)], |
90
|
|
|
|
|
|
|
'doc.basename' => [qw(doc.basename basename)], |
91
|
|
|
|
|
|
|
'doc.bibl' => [qw(doc.bibl bibl)], |
92
|
|
|
|
|
|
|
'doc.flags' => [qw(doc.flags flags)], |
93
|
|
|
|
|
|
|
## |
94
|
2
|
|
|
|
|
5
|
date => [map {(uc($_),ucfirst($_),$_)} qw(date d)], |
95
|
1
|
|
|
1
|
|
4
|
slice => [map {(uc($_),ucfirst($_),$_)} qw(dslice slice sl ds s)], |
|
5
|
|
|
|
|
31
|
|
96
|
|
|
|
|
|
|
); |
97
|
1
|
|
|
|
|
7
|
%ATTR_ALIAS = (map {my $attr=$_; map {($_=>$attr)} @{$ATTR_RALIAS{$attr}}} keys %ATTR_RALIAS); |
|
13
|
|
|
|
|
14
|
|
|
13
|
|
|
|
|
14
|
|
|
78
|
|
|
|
|
152
|
|
|
13
|
|
|
|
|
18
|
|
98
|
1
|
|
|
|
|
7
|
%ATTR_TITLE = ( |
99
|
|
|
|
|
|
|
'l'=>'lemma', |
100
|
|
|
|
|
|
|
'w'=>'word', |
101
|
|
|
|
|
|
|
'p'=>'pos', |
102
|
|
|
|
|
|
|
); |
103
|
1
|
|
|
|
|
15
|
%ATTR_CBEXPR = ( |
104
|
|
|
|
|
|
|
'doc.textClass' => DDC::Any::CQCountKeyExprRegex->new(DDC::Any::CQCountKeyExprBibl->new('textClass'),':.*$',''), |
105
|
|
|
|
|
|
|
'doc.genre' => DDC::Any::CQCountKeyExprRegex->new(DDC::Any::CQCountKeyExprBibl->new('textClass'),':.*$',''), |
106
|
|
|
|
|
|
|
); |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
sub attrName { |
109
|
0
|
0
|
|
0
|
0
|
|
shift if (UNIVERSAL::isa($_[0],__PACKAGE__)); |
110
|
0
|
|
0
|
|
|
|
return $ATTR_ALIAS{($_[0]//'')} // $_[0]; |
|
|
|
0
|
|
|
|
|
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
## $atitle = $CLASS_OR_OBJECT->attrTitle($attr_or_alias) |
114
|
|
|
|
|
|
|
## + returns an attribute title for $attr_or_alias |
115
|
|
|
|
|
|
|
sub attrTitle { |
116
|
0
|
|
|
0
|
0
|
|
my ($that,$attr) = @_; |
117
|
0
|
|
0
|
|
|
|
$attr = $that->attrName($attr//''); |
118
|
0
|
0
|
|
|
|
|
return $ATTR_TITLE{$attr} if (exists($ATTR_TITLE{$attr})); |
119
|
0
|
|
|
|
|
|
$attr =~ s/^(?:doc|meta)\.//; |
120
|
0
|
|
|
|
|
|
return $attr; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
## $acbexpr = $CLASS_OR_OBJECT->attrCountBy($attr_or_alias,$matchid=0) |
124
|
|
|
|
|
|
|
sub attrCountBy { |
125
|
0
|
|
|
0
|
0
|
|
my ($that,$attr,$matchid) = @_; |
126
|
0
|
|
0
|
|
|
|
$attr = $that->attrName($attr//''); |
127
|
0
|
0
|
|
|
|
|
if (exists($ATTR_CBEXPR{$attr})) { |
128
|
|
|
|
|
|
|
##-- aliased attribute |
129
|
0
|
|
|
|
|
|
return $ATTR_CBEXPR{$attr}; |
130
|
|
|
|
|
|
|
} |
131
|
0
|
0
|
|
|
|
|
if ($attr =~ /^doc\.(.*)$/) { |
132
|
|
|
|
|
|
|
##-- document attribute ("doc.ATTR" convention) |
133
|
0
|
|
|
|
|
|
return DDC::Any::CQCountKeyExprBibl->new($1); |
134
|
|
|
|
|
|
|
} else { |
135
|
|
|
|
|
|
|
##-- token attribute |
136
|
0
|
|
0
|
|
|
|
return DDC::Any::CQCountKeyExprToken->new($attr, ($matchid||0), 0); |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
## $aquery_or_filter_or_undef = $CLASS_OR_OBJECT->attrQuery($attr_or_alias,$cquery) |
141
|
|
|
|
|
|
|
## + returns a CQuery or CQFilter object for condition $cquery on $attr_or_alias |
142
|
|
|
|
|
|
|
sub attrQuery { |
143
|
0
|
|
|
0
|
0
|
|
my ($that,$attr,$cquery) = @_; |
144
|
0
|
0
|
0
|
|
|
|
$attr = $that->attrName( $attr // ($cquery ? $cquery->getIndexName : undef) // '' ); |
|
|
|
0
|
|
|
|
|
145
|
0
|
0
|
|
|
|
|
if ($attr =~ /^doc\./) { |
146
|
|
|
|
|
|
|
##-- document attribute ("doc.ATTR" convention) |
147
|
0
|
|
|
|
|
|
return $that->query2filter($attr,$cquery); |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
##-- token condition (use literal $cquery) |
150
|
0
|
|
|
|
|
|
return $cquery; |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData() |
154
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData(\@attrs=$coldb->attrs) |
155
|
|
|
|
|
|
|
## + get attribute data for \@attrs |
156
|
|
|
|
|
|
|
## + return @attrdata = ({a=>$attr, i=>$i, enum=>$aenum, pack_t=>$pack_xa, a2t=>$a2t, ...}) |
157
|
|
|
|
|
|
|
sub attrData { |
158
|
0
|
|
|
0
|
0
|
|
my ($coldb,$attrs) = @_; |
159
|
0
|
|
0
|
|
|
|
$attrs //= $coldb->attrs; |
160
|
0
|
|
|
|
|
|
my ($attr); |
161
|
|
|
|
|
|
|
return [map { |
162
|
0
|
|
|
|
|
|
$attr = $coldb->attrName($attrs->[$_]); |
|
0
|
|
|
|
|
|
|
163
|
0
|
|
|
|
|
|
{i=>$_, a=>$attr, enum=>$coldb->{"${attr}enum"}, pack_t=>$coldb->{"pack_t$attr"}, a2t=>$coldb->{"${attr}2t"}} |
164
|
|
|
|
|
|
|
} (0..$#$attrs)]; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
## $bool = $coldb->hasAttr($attr) |
168
|
|
|
|
|
|
|
sub hasAttr { |
169
|
0
|
0
|
|
0
|
0
|
|
return 0 if (!defined($_[1])); |
170
|
0
|
|
0
|
|
|
|
return $_[1] ne 'x' && defined($_[0]{$_[0]->attrName($_[1]).'enum'}); |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
175
|
|
|
|
|
|
|
## create: from corpus |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
## $bool = $coldb->create($corpus,%opts) |
178
|
|
|
|
|
|
|
## + %opts: |
179
|
|
|
|
|
|
|
## $key => $val, ##-- clobbers $coldb->{$key} |
180
|
|
|
|
|
|
|
sub create { |
181
|
0
|
|
|
0
|
1
|
|
my ($coldb,$corpus,%opts) = @_; |
182
|
0
|
0
|
|
|
|
|
$coldb = $coldb->new() if (!ref($coldb)); |
183
|
0
|
|
|
|
|
|
@$coldb{keys %opts} = values %opts; |
184
|
0
|
|
|
|
|
|
my $flags = O_RDWR|O_CREAT|O_TRUNC; |
185
|
0
|
|
|
|
|
|
my $debug = $coldb->{debug}; |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
##-- initialize: output directory |
188
|
|
|
|
|
|
|
my $dbdir = $coldb->{dbdir} |
189
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create() called but 'dbdir' key not set!"); |
190
|
0
|
|
|
|
|
|
$dbdir =~ s{/$}{}; |
191
|
0
|
|
|
|
|
|
$coldb->vlog('info', "create($dbdir) v$coldb->{version}"); |
192
|
0
|
0
|
0
|
|
|
|
!-d $dbdir |
193
|
|
|
|
|
|
|
or remove_tree($dbdir) |
194
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not remove stale $dbdir: $!"); |
195
|
0
|
0
|
|
|
|
|
make_path($dbdir) |
196
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not create DB directory $dbdir: $!"); |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
##-- initialize: tdf |
199
|
0
|
|
0
|
|
|
|
$coldb->{index_tdf} //= 1; |
200
|
0
|
0
|
|
|
|
|
if ($coldb->{index_tdf}) { |
201
|
0
|
0
|
|
|
|
|
if (!require "DiaColloDB/Relation/TDF.pm") { |
202
|
0
|
0
|
|
|
|
|
$coldb->logwarn("create(): require failed for DiaColloDB/Relation/TDF.pm ; (term x document) matrix modelling disabled", ($@ ? "\n: $@" : '')); |
203
|
0
|
|
|
|
|
|
$coldb->{index_tdf} = 0; |
204
|
|
|
|
|
|
|
} else { |
205
|
0
|
|
|
|
|
|
$coldb->info("(term x document) matrix modelling via DiaColloDB::Relation::TDF enabled."); |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
##-- initialize: attributes |
210
|
0
|
|
|
|
|
|
my $attrs = $coldb->{attrs} = [map {$coldb->attrName($_)} @{$coldb->attrs(undef,['l'])}]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
##-- pack-formats |
213
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
214
|
0
|
|
|
|
|
|
my $pack_date = $coldb->{pack_date}; |
215
|
0
|
|
|
|
|
|
my $pack_f = $coldb->{pack_f}; |
216
|
0
|
|
|
|
|
|
my $pack_off = $coldb->{pack_off}; |
217
|
0
|
|
|
|
|
|
my $pack_len = $coldb->{pack_len}; |
218
|
0
|
|
|
|
|
|
my $pack_t = $coldb->{pack_t} = $pack_id."[".scalar(@$attrs)."]"; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
##-- initialize: common flags |
221
|
0
|
|
|
|
|
|
my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len}); |
222
|
0
|
|
|
|
|
|
my %mmopts = (flags=>$flags, pack_i=>$coldb->{pack_id}); |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
##-- initialize: attribute enums |
225
|
0
|
|
|
|
|
|
my $aconf = []; ##-- [{a=>$attr, i=>$i, enum=>$aenum, pack_t=>$pack_ta, s2i=>\%s2i, ns=>$nstrings, ?i2j=>$pftmp, ...}, ] |
226
|
0
|
|
|
|
|
|
my $axpos = 0; |
227
|
0
|
|
|
|
|
|
my ($attr,$ac); |
228
|
0
|
|
|
|
|
|
foreach (0..$#$attrs) { |
229
|
0
|
|
|
|
|
|
push(@$aconf,$ac={i=>$_, a=>($attr=$attrs->[$_])}); |
230
|
0
|
|
|
|
|
|
$ac->{enum} = $coldb->{"${attr}enum"} = $coldb->mmclass($ECLASS)->new(%efopts); |
231
|
0
|
|
|
|
|
|
$ac->{pack_t} = $coldb->{"pack_t$attr"} = '@'.$axpos.$pack_id; |
232
|
0
|
|
|
|
|
|
$ac->{s2i} = $ac->{enum}{s2i}; |
233
|
0
|
0
|
|
|
|
|
$ac->{ma} = $1 if ($attr =~ /^(?:meta|doc)\.(.*)$/); |
234
|
0
|
|
|
|
|
|
$axpos += packsize($pack_id); |
235
|
|
|
|
|
|
|
} |
236
|
0
|
|
|
|
|
|
my @aconfm = grep { defined($_->{ma})} @$aconf; ##-- meta-attributes |
|
0
|
|
|
|
|
|
|
237
|
0
|
|
|
|
|
|
my @aconfw = grep {!defined($_->{ma})} @$aconf; ##-- token-attributes |
|
0
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
##-- initialize: tuple enum (+dates) |
240
|
0
|
|
|
|
|
|
my $tenum = $coldb->{tenum} = $coldb->mmclass($XECLASS)->new(%efopts, pack_s=>$pack_t); |
241
|
0
|
|
|
|
|
|
my $ts2i = $tenum->{s2i}; |
242
|
0
|
|
|
|
|
|
my $nt = 0; |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
##-- initialize: corpus token-list (temporary) |
245
|
|
|
|
|
|
|
## + 1 token/line, blank lines ~ EOS, token lines ~ "$a0i $a1i ... $aNi $date" |
246
|
0
|
|
|
|
|
|
my $atokfile = "$dbdir/atokens.dat"; |
247
|
0
|
0
|
|
|
|
|
CORE::open(my $atokfh, ">:raw", $atokfile) |
248
|
|
|
|
|
|
|
or $coldb->logconfess("$0: open failed for $atokfile: $!"); |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
##-- initialize: tdf: doc-data array (temporary) |
251
|
0
|
|
|
|
|
|
my ($docmeta,$docoff); |
252
|
0
|
|
|
|
|
|
my $ndocs = 0; ##-- current size of @$docmeta, @$docoff |
253
|
0
|
|
|
|
|
|
my $index_tdf = $coldb->{index_tdf}; |
254
|
0
|
0
|
|
|
|
|
if ($index_tdf) { |
255
|
0
|
0
|
|
|
|
|
$docmeta = $coldb->{docmeta} = tmparray("$dbdir/docmeta", UNLINK=>!$coldb->{keeptmp}, pack_o=>'J', pack_l=>'J') |
256
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not tie temporary doc-data array to $dbdir/docmeta.*: $!"); |
257
|
|
|
|
|
|
|
$docoff = $coldb->{docoff} = tmparrayp("$dbdir/docoff", 'J', UNLINK=>!$coldb->{keeptmp}) |
258
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create(): could not tie temporary doc-offset array to $dbdir/docoff.*: $!"); |
259
|
|
|
|
|
|
|
} |
260
|
0
|
|
0
|
|
|
|
my $dbreak = ($coldb->{dbreak} // '#file'); |
261
|
0
|
0
|
|
|
|
|
$dbreak = "#$dbreak" if ($dbreak !~ /^#/); |
262
|
0
|
|
|
|
|
|
$coldb->{dbreak} = $dbreak; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
##-- initialize: pre-compile corpus |
265
|
0
|
0
|
|
|
|
|
if (!UNIVERSAL::isa($corpus,'DiaColloDB::Corpus::Compiled')) { |
266
|
0
|
|
|
|
|
|
$coldb->vlog('info', "create(): pre-compiling & filtering corpus to $dbdir/corpus.d/"); |
267
|
|
|
|
|
|
|
$corpus = $corpus->compile("$dbdir/corpus.d", |
268
|
|
|
|
|
|
|
njobs=>$NJOBS, |
269
|
|
|
|
|
|
|
filters=>$coldb->corpusFilters, |
270
|
|
|
|
|
|
|
logFileN=>max2(1,$corpus->size/10), |
271
|
|
|
|
|
|
|
temp=>!$coldb->{keeptmp} |
272
|
|
|
|
|
|
|
) |
273
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("failed to pre-compile corpus to $dbdir/corpus.d/"); |
274
|
|
|
|
|
|
|
} else { |
275
|
0
|
|
|
|
|
|
$coldb->vlog('info', "create(): using pre-compiled corpus ".$corpus->dbdir.'/'); |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
##-- always use pre-compiled corpus filters -- but warn about overrides |
278
|
0
|
|
|
|
|
|
my ($cfilters,$dbfilters) = ($corpus->filters,$coldb->corpusFilters); |
279
|
0
|
|
|
|
|
|
foreach my $key (@DiaColloDB::Corpus::Filters::NAMES,@DiaColloDB::Corpus::Filters::FILES) { |
280
|
0
|
0
|
0
|
|
|
|
if (($dbfilters->{$key}//'') ne ($cfilters->{$key}//'')) { |
|
|
|
0
|
|
|
|
|
281
|
0
|
|
0
|
|
|
|
$coldb->warn("create(): WARNING: pre-compiled corpus filter $key=".($cfilters->{$key}//'(null)')." overrides user request=".($dbfilters->{$key}//'(null)')); |
|
|
|
0
|
|
|
|
|
282
|
0
|
|
|
|
|
|
$coldb->{$key} = $cfilters->{$key}; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
##-- initialize: logging |
288
|
0
|
|
|
|
|
|
my $nfiles = $corpus->size(); |
289
|
0
|
|
0
|
|
|
|
my $logFileN = $coldb->{logCorpusFileN} // max2(1,int($nfiles/20)); |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
##-- initialize: enums, date-range |
292
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate},"create(): processing $nfiles corpus file(s)"); |
293
|
0
|
|
|
|
|
|
my ($xdmin,$xdmax) = ('inf','-inf'); |
294
|
0
|
|
|
|
|
|
my ($doc, $date,$tok,@ais,$aistr,$t,$ti, $nsigs, $filei, $last_was_eos); |
295
|
0
|
|
|
|
|
|
my $docoff_cur = -1; |
296
|
0
|
|
|
|
|
|
my $toki = 0; |
297
|
0
|
|
|
|
|
|
for ($corpus->ibegin(); $corpus->iok; $corpus->inext) { |
298
|
0
|
|
|
|
|
|
$doc = $corpus->idocument(); |
299
|
|
|
|
|
|
|
$coldb->vlog($coldb->{logCorpusFile}, |
300
|
0
|
0
|
0
|
|
|
|
sprintf("create(): processing files [%3.0f%%]: %s", 100*($filei-1)/$nfiles, ($doc->{label} || $corpus->ifile))) |
|
|
|
0
|
|
|
|
|
301
|
|
|
|
|
|
|
if ($logFileN && ($filei++ % $logFileN)==0); |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
##-- initalize tdf data (#/sigs) |
304
|
0
|
|
|
|
|
|
$nsigs = 0; |
305
|
0
|
|
|
|
|
|
$docoff_cur=$toki; |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
##-- get date-range |
308
|
0
|
|
|
|
|
|
$date = $doc->{date}; |
309
|
0
|
0
|
|
|
|
|
$xdmin = $date if ($date < $xdmin); |
310
|
0
|
0
|
|
|
|
|
$xdmax = $date if ($date > $xdmax); |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
##-- get meta-attributes |
313
|
0
|
|
|
|
|
|
@ais = qw(); |
314
|
0
|
|
0
|
|
|
|
$ais[$_->{i}] = ($_->{s2i}{$doc->{meta}{$_->{ma}}} //= ++$_->{ns}) foreach (@aconfm); |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
##-- iterate over tokens, populating initial attribute-enums and writing $atokfile |
317
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
318
|
0
|
|
|
|
|
|
foreach $tok (@{$doc->{tokens}}) { |
|
0
|
|
|
|
|
|
|
319
|
0
|
0
|
0
|
|
|
|
if (ref($tok)) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
320
|
|
|
|
|
|
|
##-- normal token: get attribute value-ids and build tuple |
321
|
0
|
|
0
|
|
|
|
$ais[$_->{i}] = ($_->{s2i}{$tok->{$_->{a}//''}} //= ++$_->{ns}) foreach (@aconfw); |
|
|
|
0
|
|
|
|
|
322
|
0
|
|
|
|
|
|
$aistr = join(' ',@ais); |
323
|
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
$atokfh->print("$aistr $date\n"); |
325
|
0
|
|
|
|
|
|
$last_was_eos = 0; |
326
|
0
|
|
|
|
|
|
++$toki; |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
elsif (!defined($tok) && !$last_was_eos) { |
329
|
|
|
|
|
|
|
##-- eos |
330
|
0
|
|
|
|
|
|
$atokfh->print("\n"); |
331
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
elsif (defined($tok) && $tok eq $dbreak && $docoff && $docoff_cur < $toki) { |
334
|
|
|
|
|
|
|
##-- break:tdf |
335
|
0
|
|
|
|
|
|
++$nsigs; |
336
|
0
|
|
|
|
|
|
push(@$docoff, $docoff_cur); |
337
|
0
|
|
|
|
|
|
$docoff_cur = $toki; |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
##-- store final doc-break (for tdf) |
342
|
0
|
0
|
0
|
|
|
|
if ($docoff && $docoff_cur < $toki) { |
343
|
0
|
|
|
|
|
|
++$nsigs; |
344
|
0
|
|
|
|
|
|
push(@$docoff, $docoff_cur); |
345
|
0
|
|
|
|
|
|
$docoff_cur = $toki; |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
##-- store doc-data (for tdf) |
349
|
0
|
0
|
|
|
|
|
if ($docmeta) { |
350
|
|
|
|
|
|
|
push(@$docmeta, { |
351
|
|
|
|
|
|
|
id => $ndocs++, |
352
|
|
|
|
|
|
|
nsigs => $nsigs, |
353
|
|
|
|
|
|
|
file => $corpus->ifile, |
354
|
0
|
|
|
|
|
|
(map {($_=>$doc->{$_})} qw(meta date label)), |
|
0
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
}) |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
##-- store final pseudo-doc offset (total #/tokens) |
359
|
0
|
0
|
|
|
|
|
push(@$docoff, $toki) if ($docoff); |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
##-- store date-range |
362
|
0
|
|
|
|
|
|
@$coldb{qw(xdmin xdmax)} = ($xdmin,$xdmax); |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
##-- close temporary attribute-token file(s) |
365
|
0
|
0
|
|
|
|
|
CORE::close($atokfh) |
366
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary token storage file '$atokfile': $!"); |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
##-- close/free temporary corpus |
369
|
0
|
0
|
|
|
|
|
undef $corpus if ($corpus->{temp}); |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
##-- filter: by attribute frequency |
372
|
0
|
|
|
|
|
|
my $ibad = unpack($pack_id,pack($pack_id,-1)); |
373
|
0
|
|
|
|
|
|
foreach $ac (@$aconf) { |
374
|
0
|
|
0
|
|
|
|
my $afmin = $coldb->{"fmin_".$ac->{a}} // ''; |
375
|
0
|
0
|
0
|
|
|
|
$afmin = $coldb->{tfmin} // 0 if (($afmin//'') eq ''); |
|
|
|
0
|
|
|
|
|
376
|
0
|
0
|
|
|
|
|
next if ($afmin <= 0); |
377
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): building attribute frequency filter (fmin_$ac->{a}=$afmin)"); |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
##-- filter: by attribute frequency: setup re-numbering map $ac->{i2j} |
380
|
0
|
|
|
|
|
|
my $i2j = $ac->{i2j} = tmparrayp("$dbdir/i2j_$ac->{a}.tmp", 'J', UNLINK=>!$coldb->{keeptmp}); |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
##-- filter: by attribute frequency: populate $ac->{i2j} and update $ac->{s2i} |
383
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
384
|
0
|
|
|
|
|
|
my $ai1 = $ac->{i}+1; |
385
|
0
|
0
|
|
|
|
|
my $cmdfh = opencmd("sort -nk$ai1 $atokfile ".sortJobs()." | cut -d\" \" -f $ai1 | uniq -c |") |
386
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to open pipe from sort for attribute frequency filter (fmin_$ac->{a}=$afmin)"); |
387
|
0
|
|
|
|
|
|
my ($f,$i); |
388
|
0
|
|
|
|
|
|
my $nj = 0; |
389
|
0
|
|
|
|
|
|
while (defined($_=<$cmdfh>)) { |
390
|
0
|
|
|
|
|
|
chomp; |
391
|
0
|
|
|
|
|
|
($f,$i) = split(' ',$_,2); |
392
|
0
|
0
|
|
|
|
|
$i2j->[$i] = ($f >= $afmin ? ++$nj : $ibad) if ($i) |
|
|
0
|
|
|
|
|
|
393
|
|
|
|
|
|
|
} |
394
|
0
|
|
|
|
|
|
$cmdfh->close(); |
395
|
0
|
|
|
|
|
|
env_pop(); |
396
|
|
|
|
|
|
|
|
397
|
0
|
|
|
|
|
|
my $nabad = $ac->{ns} - $nj; |
398
|
0
|
0
|
|
|
|
|
my $pabad = $ac->{ns} ? sprintf("%.2f%%", 100*$nabad/$ac->{ns}) : 'nan%'; |
399
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): filter (fmin_$ac->{a}=$afmin) pruning $nabad of $ac->{ns} attribute value type(s) ($pabad)"); |
400
|
|
|
|
|
|
|
|
401
|
0
|
|
|
|
|
|
tied(@$i2j)->flush; |
402
|
0
|
|
|
|
|
|
my $s2i = $ac->{s2i}; |
403
|
0
|
|
|
|
|
|
my ($s,$j,@badkeys); |
404
|
0
|
|
|
|
|
|
while (($s,$i)=each %$s2i) { |
405
|
0
|
0
|
|
|
|
|
if (($j=$i2j->[$i])==$ibad) { |
406
|
0
|
|
|
|
|
|
delete $s2i->{$s}; |
407
|
|
|
|
|
|
|
} else { |
408
|
0
|
|
|
|
|
|
$s2i->{$s} = $j; |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
} |
411
|
0
|
|
|
|
|
|
$ac->{ns} = $nj; |
412
|
0
|
|
|
|
|
|
tied(@$i2j)->flush; |
413
|
|
|
|
|
|
|
} |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
##-- filter: terms: populate $ts2t (map IDs) |
416
|
|
|
|
|
|
|
## + $ts2t = { join(' ',@ais) => pack($pack_t,i2j(@ais)), ...} |
417
|
|
|
|
|
|
|
## + includes attribute-id re-mappings |
418
|
|
|
|
|
|
|
## + only populated if we have any frequency filters active |
419
|
0
|
|
|
|
|
|
my $ts2t = undef; |
420
|
0
|
|
0
|
|
|
|
my $tfmin = $coldb->{tfmin}//0; |
421
|
0
|
0
|
0
|
|
|
|
if ($tfmin > 0 || grep {defined($_->{i2j})} @$aconf) { |
|
0
|
|
|
|
|
|
|
422
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): populating global term enum (tfmin=$tfmin)"); |
423
|
0
|
0
|
|
|
|
|
my @ai2j = map {defined($_->{i2j}) ? $_->{i2j} : undef} @$aconf; |
|
0
|
|
|
|
|
|
|
424
|
0
|
|
|
|
|
|
my @ai2ji = grep {defined($ai2j[$_])} (0..$#ai2j); |
|
0
|
|
|
|
|
|
|
425
|
0
|
|
|
|
|
|
my $na = scalar(@$attrs); |
426
|
0
|
|
|
|
|
|
my ($nw0,$nw) = (0,0); |
427
|
0
|
|
|
|
|
|
my ($f); |
428
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
429
|
|
|
|
|
|
|
my $cmdfh = |
430
|
0
|
0
|
|
|
|
|
opencmd("sort ".join(' ', map {"-nk$_"} (1..$na))." ".sortJobs()." $atokfile | cut -d\" \" -f -$na | uniq -c |") |
|
0
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to open pipe from sort for global term filter"); |
432
|
|
|
|
|
|
|
FILTER_WTUPLES: |
433
|
0
|
|
|
|
|
|
while (defined($_=<$cmdfh>)) { |
434
|
0
|
|
|
|
|
|
chomp; |
435
|
0
|
|
|
|
|
|
++$nw0; |
436
|
0
|
|
|
|
|
|
($f,$aistr) = split(' ',$_,2); |
437
|
0
|
0
|
0
|
|
|
|
next if (!$aistr || $f < $tfmin); |
438
|
0
|
|
|
|
|
|
@ais = split(' ',$aistr,$na); |
439
|
0
|
|
|
|
|
|
foreach (@ai2ji) { |
440
|
|
|
|
|
|
|
##-- apply attribute-wise re-mappings |
441
|
0
|
|
0
|
|
|
|
$ais[$_] = $ai2j[$_][$ais[$_]//0]; |
442
|
0
|
0
|
|
|
|
|
next FILTER_WTUPLES if ($ais[$_] == $ibad); |
443
|
|
|
|
|
|
|
} |
444
|
0
|
|
|
|
|
|
$ts2t->{$aistr} = pack($pack_t,@ais); |
445
|
0
|
|
|
|
|
|
++$nw; |
446
|
|
|
|
|
|
|
} |
447
|
0
|
|
|
|
|
|
$cmdfh->close(); |
448
|
0
|
|
|
|
|
|
env_pop(); |
449
|
|
|
|
|
|
|
|
450
|
0
|
|
|
|
|
|
my $nwbad = $nw0 - $nw; |
451
|
0
|
0
|
|
|
|
|
my $pwbad = $nw0 ? sprintf("%.2f%%", 100*$nwbad/$nw0) : 'nan%'; |
452
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): will prune $nwbad of $nw0 term tuple type(s) ($pwbad)"); |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
##-- compile: apply filters & assign term-ids |
456
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): filtering corpus tokens & assigning term-IDs"); |
457
|
0
|
|
|
|
|
|
my $tokfile = "$dbdir/tokens.dat"; ##-- v0.10.x: new format: "TID DATE\n" | "\n" |
458
|
0
|
0
|
|
|
|
|
CORE::open(my $tokfh, ">:raw", $tokfile) |
459
|
|
|
|
|
|
|
or $coldb->logconfess("$0: open failed for $tokfile: $!"); |
460
|
0
|
|
|
|
|
|
my $vtokfile = "$dbdir/vtokens.bin"; |
461
|
0
|
0
|
|
|
|
|
CORE::open(my $vtokfh, ">:raw", $vtokfile) ##-- format: pack($pack_t,@ais) |
462
|
|
|
|
|
|
|
or $coldb->logconfess("$0: open failed for $vtokfile: $!"); |
463
|
0
|
0
|
|
|
|
|
CORE::open($atokfh, "<:raw", $atokfile) |
464
|
|
|
|
|
|
|
or $coldb->logconfess("$0: re-open failed for $atokfile: $!"); |
465
|
0
|
|
|
|
|
|
$nt = 0; |
466
|
0
|
|
|
|
|
|
my $ntok_in = $toki; |
467
|
0
|
|
|
|
|
|
my ($toki_in,$toki_out) = (0,0); |
468
|
0
|
|
|
|
|
|
my $doci_cur = 0; |
469
|
0
|
0
|
|
|
|
|
tied(@$docoff)->flush() if ($docoff); |
470
|
0
|
0
|
|
|
|
|
my $docoff_in = $docoff ? $docoff->[$doci_cur] : -1; |
471
|
0
|
|
|
|
|
|
while (defined($_=<$atokfh>)) { |
472
|
0
|
|
|
|
|
|
chomp; |
473
|
0
|
0
|
|
|
|
|
if ($_) { |
474
|
0
|
0
|
|
|
|
|
if ($toki_in == $docoff_in) { |
475
|
|
|
|
|
|
|
##-- update break-indices for tdf |
476
|
|
|
|
|
|
|
|
477
|
0
|
0
|
|
|
|
|
if ($debug) { |
478
|
|
|
|
|
|
|
##-- BUGHUNT/Birmingham: weird errors around here: Tue, 05 Jul 2016 09:27:11 +0200 |
479
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): \$doci_cur not defined at \$atokfh line ", $atokfh->input_line_number) |
480
|
|
|
|
|
|
|
if (!defined($doci_cur)); |
481
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): \$toki_out not defined at \$atokfh line ", $atokfh->input_line_number) |
482
|
|
|
|
|
|
|
if (!defined($toki_out)); |
483
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): \$docoff->[\$doci_cur=$doci_cur] not defined at \$atokfh line ", $atokfh->input_line_number) |
484
|
|
|
|
|
|
|
if (!defined($docoff->[$doci_cur])); |
485
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): next \$docoff_in=\$docoff->[++(\$doci_cur=$doci_cur)] not defined at \$atokfh line ", $atokfh->input_line_number) |
486
|
|
|
|
|
|
|
if (!defined($docoff->[$doci_cur+1])); |
487
|
|
|
|
|
|
|
##--/BUGHUNT |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
0
|
|
|
|
|
|
$docoff->[$doci_cur] = $toki_out; |
491
|
0
|
|
|
|
|
|
$docoff_in = $docoff->[++$doci_cur]; |
492
|
|
|
|
|
|
|
} |
493
|
0
|
|
|
|
|
|
++$toki_in; |
494
|
0
|
0
|
|
|
|
|
$date = $1 if (s/ ([0-9]+)$//); |
495
|
0
|
0
|
|
|
|
|
if (defined($ts2t)) { |
496
|
0
|
0
|
|
|
|
|
next if (!defined($t=$ts2t->{$_})); |
497
|
|
|
|
|
|
|
} else { |
498
|
0
|
|
|
|
|
|
$t = pack($pack_t, split(' ',$_)); |
499
|
|
|
|
|
|
|
} |
500
|
0
|
0
|
|
|
|
|
$ti = $ts2i->{$t} = ++$nt if (!defined($ti=$ts2i->{$t})); |
501
|
0
|
|
|
|
|
|
$tokfh->print($ti, "\t", $date, "\n"); |
502
|
0
|
|
|
|
|
|
$vtokfh->print($t); |
503
|
0
|
|
|
|
|
|
++$toki_out; |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
else { |
506
|
0
|
|
|
|
|
|
$tokfh->print("\n"); |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
##-- update any trailing tdf break indices |
510
|
0
|
0
|
|
|
|
|
if ($docoff) { |
511
|
0
|
|
|
|
|
|
$ndocs = $#$docoff; |
512
|
0
|
|
|
|
|
|
for (; $doci_cur <= $ndocs; ++$doci_cur) { |
513
|
0
|
|
|
|
|
|
$docoff->[$doci_cur] = $toki_out; |
514
|
|
|
|
|
|
|
} |
515
|
0
|
|
|
|
|
|
tied(@$docoff)->flush(); |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
|
518
|
0
|
0
|
|
|
|
|
CORE::close($atokfh) |
519
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary attribute-token-file $atokfile: $!"); |
520
|
0
|
0
|
|
|
|
|
CORE::close($tokfh) |
521
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary token-file $tokfile: $!"); |
522
|
0
|
0
|
|
|
|
|
CORE::close($vtokfh) |
523
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary tdf-token-file $vtokfile: $!"); |
524
|
0
|
|
|
|
|
|
my $ntok_out = $toki_out; |
525
|
0
|
0
|
|
|
|
|
my $ptokbad = $ntok_in ? sprintf("%.2f%%",100*($ntok_in-$ntok_out)/$ntok_in) : 'nan%'; |
526
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): assigned $nt term tuple-IDs to $ntok_out of $ntok_in tokens (pruned $ptokbad)"); |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
##-- cleanup: drop $aconf->[$ai]{i2j} now that we've used it |
529
|
0
|
|
|
|
|
|
delete($_->{i2j}) foreach (@$aconf); |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
##-- compile: tenum |
532
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): creating tuple-enum $dbdir/tenum.*"); |
533
|
0
|
|
|
|
|
|
$tenum->fromHash($ts2i); |
534
|
0
|
0
|
|
|
|
|
$tenum->save("$dbdir/tenum") |
535
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to save $dbdir/tenum.*: $!"); |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
##-- compile: by attribute |
538
|
0
|
|
|
|
|
|
foreach $ac (@$aconf) { |
539
|
|
|
|
|
|
|
##-- compile: by attribte: enum |
540
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate},"create(): creating enum $dbdir/$ac->{a}_enum.*"); |
541
|
0
|
|
|
|
|
|
$ac->{enum}->fromHash($ac->{s2i}); |
542
|
0
|
0
|
|
|
|
|
$ac->{enum}->save("$dbdir/$ac->{a}_enum") |
543
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to save $dbdir/$ac->{a}_enum: $!"); |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
##-- compile: by attribute: expansion multimaps (+dates) |
546
|
0
|
|
|
|
|
|
$coldb->create_multimap("$dbdir/$ac->{a}_2t",$ts2i,$ac->{pack_t},"attribute expansion multimap"); |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
##-- compute unigrams |
550
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_xf}//1) { |
551
|
0
|
|
|
|
|
|
$coldb->info("creating unigram index $dbdir/xf.*"); |
552
|
|
|
|
|
|
|
my $xfdb = $coldb->{xf} = DiaColloDB::Relation::Unigrams->new(base=>"$dbdir/xf", flags=>$flags, mmap=>$coldb->{mmap}, |
553
|
0
|
0
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date) |
554
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not create $dbdir/xf.*: $!"); |
555
|
0
|
0
|
|
|
|
|
$xfdb->create($coldb, $tokfile) |
556
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to create unigram index: $!"); |
557
|
|
|
|
|
|
|
} else { |
558
|
0
|
|
|
|
|
|
$coldb->info("NOT creating unigram index $dbdir/xf.*; set index_xf=1 to enable"); |
559
|
|
|
|
|
|
|
} |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
##-- compute collocation frequencies |
562
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_cof}//1) { |
563
|
0
|
|
|
|
|
|
$coldb->info("creating co-frequency index $dbdir/cof.* [dmax=$coldb->{dmax}, fmin=$coldb->{cfmin}]"); |
564
|
|
|
|
|
|
|
my $cof = $coldb->{cof} = DiaColloDB::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags, mmap=>$coldb->{mmap}, |
565
|
|
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date, |
566
|
|
|
|
|
|
|
dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin}, |
567
|
|
|
|
|
|
|
keeptmp=>$coldb->{keeptmp}, |
568
|
|
|
|
|
|
|
) |
569
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create(): failed to create co-frequency index $dbdir/cof.*: $!"); |
570
|
0
|
0
|
|
|
|
|
$cof->create($coldb, $tokfile) |
571
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to create co-frequency index: $!"); |
572
|
|
|
|
|
|
|
} else { |
573
|
0
|
|
|
|
|
|
$coldb->info("NOT creating co-frequency index $dbdir/cof.*; set index_cof=1 to enable"); |
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
##-- create tdf-model (if requested & available) |
577
|
0
|
0
|
|
|
|
|
if ($coldb->{index_tdf}) { |
578
|
0
|
|
|
|
|
|
$coldb->info("creating (term x document) index $dbdir/tdf* [dbreak=$dbreak]"); |
579
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= {}; |
580
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts}{$_} //= $TDF_OPTS{$_} foreach (keys %TDF_OPTS); ##-- tdf: default options |
581
|
0
|
|
|
|
|
|
$coldb->{tdf} = DiaColloDB::Relation::TDF->create($coldb, undef, base=>"$dbdir/tdf", dbreak=>$dbreak); |
582
|
|
|
|
|
|
|
} else { |
583
|
0
|
|
|
|
|
|
$coldb->info("NOT creating (term x document) index, 'tdf' profiling relation disabled"); |
584
|
|
|
|
|
|
|
} |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
##-- create ddc client relation (no-op if ddcServer option is not set) |
587
|
0
|
0
|
|
|
|
|
if ($coldb->{ddcServer}) { |
588
|
0
|
|
|
|
|
|
$coldb->info("creating ddc client configuration $dbdir/ddc.hdr [ddcServer=$coldb->{ddcServer}]"); |
589
|
0
|
|
|
|
|
|
$coldb->{ddc} = DiaColloDB::Relation::DDC->create($coldb); |
590
|
|
|
|
|
|
|
} else { |
591
|
0
|
|
|
|
|
|
$coldb->info("ddcServer option unset, NOT creating ddc client configuration"); |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
##-- save header |
595
|
0
|
0
|
|
|
|
|
$coldb->saveHeader() |
596
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to save header: $!"); |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
##-- all done |
599
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): DB $dbdir created."); |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
##-- cleanup |
602
|
0
|
0
|
0
|
|
|
|
!$docmeta |
|
|
|
0
|
|
|
|
|
603
|
|
|
|
|
|
|
or !tied(@$docmeta) |
604
|
|
|
|
|
|
|
or untie(@$docmeta) |
605
|
|
|
|
|
|
|
or $coldb->logwarn("create(): could untie temporary doc-data array $dbdir/docmeta.*: $!"); |
606
|
0
|
|
|
|
|
|
delete $coldb->{docmeta}; |
607
|
|
|
|
|
|
|
|
608
|
0
|
0
|
0
|
|
|
|
!$docoff |
|
|
|
0
|
|
|
|
|
609
|
|
|
|
|
|
|
or !tied(@$docoff) |
610
|
|
|
|
|
|
|
or untie(@$docoff) |
611
|
|
|
|
|
|
|
or $coldb->logwarn("create(): could untie temporary doc-offset array $dbdir/docoff.*: $!"); |
612
|
0
|
|
|
|
|
|
delete $coldb->{docoff}; |
613
|
|
|
|
|
|
|
|
614
|
0
|
0
|
|
|
|
|
if (!$coldb->{keeptmp}) { |
615
|
0
|
|
|
|
|
|
foreach ($vtokfile,$tokfile,$atokfile) { |
616
|
0
|
0
|
|
|
|
|
CORE::unlink($_) |
617
|
|
|
|
|
|
|
or $coldb->logwarne("creat(): could not remove temporary file '$_': $!"); |
618
|
|
|
|
|
|
|
} |
619
|
|
|
|
|
|
|
} |
620
|
|
|
|
|
|
|
|
621
|
0
|
|
|
|
|
|
return $coldb; |
622
|
|
|
|
|
|
|
} |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
625
|
|
|
|
|
|
|
## create: union (aka merge) |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
## $coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts) |
628
|
|
|
|
|
|
|
## + populates $coldb as union over @coldbs_or_dbdirs |
629
|
|
|
|
|
|
|
## + clobbers argument dbs {_union_${a}i2u}, {_union_xi2u}, {_union_argi} |
630
|
1
|
|
|
1
|
|
5642
|
BEGIN { *merge = \&union; } |
631
|
|
|
|
|
|
|
sub union { |
632
|
0
|
|
|
0
|
0
|
|
my ($coldb,$args,%opts) = @_; |
633
|
0
|
0
|
|
|
|
|
$coldb = $coldb->new() if (!ref($coldb)); |
634
|
0
|
|
|
|
|
|
@$coldb{keys %opts} = values %opts; |
635
|
0
|
0
|
|
|
|
|
my @dbargs = map {ref($_) ? $_ : $coldb->new(dbdir=>$_)} @$args; |
|
0
|
|
|
|
|
|
|
636
|
0
|
|
|
|
|
|
my $flags = O_RDWR|O_CREAT|O_TRUNC; |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
##-- sanity check(s): version |
639
|
0
|
|
|
|
|
|
my $min_db_version = '0.10.000'; |
640
|
0
|
|
|
|
|
|
foreach (@dbargs) { |
641
|
0
|
|
0
|
|
|
|
my $dbversion = $_->{version} // '0'; |
642
|
0
|
0
|
|
|
|
|
$coldb->logconfess("union(): can't handle v$dbversion index in '$_->{dbdir}'; try running \`dcdb-upgrade.perl $_->{dbdir}'") |
643
|
|
|
|
|
|
|
if (version->parse($dbversion) < $min_db_version); |
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
##-- initialize: output directory |
647
|
|
|
|
|
|
|
my $dbdir = $coldb->{dbdir} |
648
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("union() called but 'dbdir' key not set!"); |
649
|
0
|
|
|
|
|
|
$dbdir =~ s{/$}{}; |
650
|
0
|
|
0
|
|
|
|
$coldb->vlog('info', "union($dbdir) v$coldb->{version}: ", join(' ', map {$_->{dbdir}//''} @dbargs)); |
|
0
|
|
|
|
|
|
|
651
|
0
|
0
|
0
|
|
|
|
!-d $dbdir |
652
|
|
|
|
|
|
|
or remove_tree($dbdir) |
653
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not remove stale $dbdir: $!"); |
654
|
0
|
0
|
|
|
|
|
make_path($dbdir) |
655
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not create DB directory $dbdir: $!"); |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
##-- attributes |
658
|
0
|
|
|
|
|
|
my $attrs = [map {$coldb->attrName($_)} @{$coldb->attrs(undef,[])}]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
659
|
0
|
|
|
|
|
|
my ($db,$dba); |
660
|
0
|
0
|
|
|
|
|
if (!@$attrs) { |
661
|
|
|
|
|
|
|
##-- use intersection of @dbargs attrs |
662
|
0
|
|
|
|
|
|
my @dbakeys = map {$db=$_; scalar {map {($_=>undef)} @{$db->attrs}}} @dbargs; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
663
|
0
|
|
|
|
|
|
my %akeys = qw(); |
664
|
0
|
|
|
|
|
|
foreach $dba (map {@{$_->attrs}} @dbargs) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
665
|
0
|
0
|
0
|
|
|
|
next if (exists($akeys{$dba}) || grep {!exists($_->{$dba})} @dbakeys); |
|
0
|
|
|
|
|
|
|
666
|
0
|
|
|
|
|
|
$akeys{$dba}=undef; |
667
|
0
|
|
|
|
|
|
push(@$attrs, $dba); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
} |
670
|
0
|
|
|
|
|
|
$coldb->{attrs} = $attrs; |
671
|
0
|
0
|
|
|
|
|
$coldb->logconfess("union(): no attributes defined and intersection over db attributes is empty!") if (!@$attrs); |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
##-- pack-formats |
674
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
675
|
0
|
|
|
|
|
|
my $pack_date = $coldb->{pack_date}; |
676
|
0
|
|
|
|
|
|
my $pack_f = $coldb->{pack_f}; |
677
|
0
|
|
|
|
|
|
my $pack_off = $coldb->{pack_off}; |
678
|
0
|
|
|
|
|
|
my $pack_len = $coldb->{pack_len}; |
679
|
0
|
|
|
|
|
|
my $pack_t = $coldb->{pack_t} = $pack_id."[".scalar(@$attrs)."]"; ##-- pack("${pack_id}*${pack_date}", @ais) |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
##-- tuple packing |
682
|
0
|
|
|
|
|
|
$coldb->{"pack_t$attrs->[$_]"} = '@'.($_*packsize($pack_id)).$pack_id foreach (0..$#$attrs); |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
##-- common variables: enums |
685
|
0
|
|
|
|
|
|
my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len}); |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
##-- union: attribute enums; also sets $db->{"_union_${a}i2u"} for each attribute $attr |
688
|
|
|
|
|
|
|
## + $db->{"${a}i2u"} is a PackedFile temporary in $dbdir/"${a}_i2u.tmp${argi}" |
689
|
0
|
|
|
|
|
|
my ($ac,$attr,$aenum,$as2i,$argi); |
690
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData($attrs); |
691
|
0
|
|
|
|
|
|
foreach $ac (@$adata) { |
692
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating attribute enum $dbdir/$ac->{a}_enum.*"); |
693
|
0
|
|
|
|
|
|
$attr = $ac->{a}; |
694
|
0
|
|
|
|
|
|
$aenum = $coldb->{"${attr}enum"} = $ac->{enum} = $coldb->mmclass($ECLASS)->new(%efopts); |
695
|
0
|
|
|
|
|
|
$as2i = $aenum->{s2i}; |
696
|
0
|
|
|
|
|
|
foreach $argi (0..$#dbargs) { |
697
|
|
|
|
|
|
|
##-- enum union: guts |
698
|
0
|
|
|
|
|
|
$db = $dbargs[$argi]; |
699
|
0
|
|
|
|
|
|
my $dbenum = $db->{"${attr}enum"}; |
700
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): processing $dbenum->{base}.*"); |
701
|
0
|
|
|
|
|
|
$aenum->addEnum($dbenum); |
702
|
0
|
|
|
|
|
|
$db->{"_union_argi"} = $argi; |
703
|
|
|
|
|
|
|
$db->{"_union_${attr}i2u"} = (DiaColloDB::PackedFile |
704
|
|
|
|
|
|
|
->new(file=>"$dbdir/${attr}_i2u.tmp${argi}", flags=>'rw', packas=>$coldb->{pack_id}) |
705
|
0
|
0
|
|
|
|
|
->fromArray( [@$as2i{$dbenum ? @{$dbenum->toArray} : ''}] )) |
|
0
|
0
|
|
|
|
|
|
706
|
|
|
|
|
|
|
or $coldb->logconfess("union(): failed to create temporary $dbdir/${attr}_i2u.tmp${argi}"); |
707
|
0
|
|
|
|
|
|
$db->{"_union_${attr}i2u"}->flush(); |
708
|
|
|
|
|
|
|
} |
709
|
0
|
0
|
|
|
|
|
$aenum->save("$dbdir/${attr}_enum") |
710
|
|
|
|
|
|
|
or $coldb->logconfess("union(): failed to save attribute enum $dbdir/${attr}_enum: $!"); |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
##-- union: date-range |
714
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): computing date-range"); |
715
|
0
|
|
|
|
|
|
@$coldb{qw(xdmin xdmax)} = (undef,undef); |
716
|
0
|
|
|
|
|
|
foreach $db (@dbargs) { |
717
|
0
|
0
|
0
|
|
|
|
$coldb->{xdmin} = $db->{xdmin} if (!defined($coldb->{xdmin}) || $db->{xdmin} < $coldb->{xdmin}); |
718
|
0
|
0
|
0
|
|
|
|
$coldb->{xdmax} = $db->{xdmax} if (!defined($coldb->{xdmax}) || $db->{xdmax} > $coldb->{xdmax}); |
719
|
|
|
|
|
|
|
} |
720
|
0
|
|
0
|
|
|
|
$coldb->{xdmin} //= 0; |
721
|
0
|
|
0
|
|
|
|
$coldb->{xdmax} //= 0; |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
##-- union: tenum |
724
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating tuple-enum $dbdir/tenum.*"); |
725
|
0
|
|
|
|
|
|
my $tenum = $coldb->{tenum} = $coldb->mmclass($XECLASS)->new(%efopts, pack_s=>$pack_t); |
726
|
0
|
|
|
|
|
|
my $ts2i = $tenum->{s2i}; |
727
|
0
|
|
|
|
|
|
my $nt = 0; |
728
|
0
|
|
|
|
|
|
foreach $db (@dbargs) { |
729
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): processing $db->{tenum}{base}.*"); |
730
|
0
|
|
|
|
|
|
my $db_pack_t = $db->{pack_t}; |
731
|
0
|
|
|
|
|
|
my $dbattrs = $db->{attrs}; |
732
|
0
|
|
|
|
|
|
my %a2dbti = map { ($dbattrs->[$_]=>$_) } (0..$#$dbattrs); |
|
0
|
|
|
|
|
|
|
733
|
0
|
|
|
|
|
|
my %a2i2u = map { ($_=>$db->{"_union_${_}i2u"}) } @$attrs; |
|
0
|
|
|
|
|
|
|
734
|
0
|
|
|
|
|
|
$argi = $db->{_union_argi}; |
735
|
0
|
|
|
|
|
|
my $ti2u = $db->{_union_ti2u} = DiaColloDB::PackedFile->new(file=>"$dbdir/t_i2u.tmp${argi}", flags=>'rw', packas=>$coldb->{pack_id}); |
736
|
0
|
|
|
|
|
|
my $dbti = 0; |
737
|
0
|
|
|
|
|
|
my (@dbt,@ut,$uts,$uti); |
738
|
0
|
|
|
|
|
|
foreach (@{$db->{tenum}->toArray}) { |
|
0
|
|
|
|
|
|
|
739
|
0
|
|
|
|
|
|
@dbt = unpack($db_pack_t,$_); |
740
|
|
|
|
|
|
|
$uts = pack($pack_t, |
741
|
|
|
|
|
|
|
(map { |
742
|
0
|
|
0
|
|
|
|
(exists($a2dbti{$_}) |
743
|
|
|
|
|
|
|
? $a2i2u{$_}->fetch($dbt[$a2dbti{$_}]//0)//0 |
744
|
0
|
0
|
0
|
|
|
|
: $a2i2u{$_}->fetch(0)//0) |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
745
|
|
|
|
|
|
|
} @$attrs), |
746
|
|
|
|
|
|
|
$dbt[$#dbt]//0); |
747
|
0
|
0
|
|
|
|
|
$uti = $ts2i->{$uts} = $nt++ if (!defined($uti=$ts2i->{$uts})); |
748
|
0
|
|
|
|
|
|
$ti2u->store($dbti++, $uti); |
749
|
|
|
|
|
|
|
} |
750
|
|
|
|
|
|
|
$ti2u->flush() |
751
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("could not flush temporary $dbdir/t_i2u.tmp${argi}"); |
752
|
|
|
|
|
|
|
} |
753
|
0
|
|
|
|
|
|
$tenum->fromHash($ts2i); |
754
|
0
|
0
|
|
|
|
|
$tenum->save("$dbdir/tenum") |
755
|
|
|
|
|
|
|
or $coldb->logconfess("union(): failed to save $dbdir/tenum.*: $!"); |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
##-- union: expansion maps |
758
|
0
|
|
|
|
|
|
foreach (@$adata) { |
759
|
0
|
|
|
|
|
|
$coldb->create_multimap("$dbdir/$_->{a}_2t",$ts2i,$_->{pack_t},"attribute expansion multimap"); |
760
|
|
|
|
|
|
|
} |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
##-- intermediate cleanup: ts2i |
763
|
0
|
|
|
|
|
|
undef $ts2i; |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
##-- unigrams: populate |
766
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_xf}//1) { |
767
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating tuple unigram index $dbdir/xf.*"); |
768
|
|
|
|
|
|
|
$coldb->{xf} = DiaColloDB::Relation::Unigrams->new(base=>"$dbdir/xf", flags=>$flags, mmap=>$coldb->{mmap}, |
769
|
|
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date, |
770
|
|
|
|
|
|
|
keeptmp => $coldb->{keeptmp}, |
771
|
|
|
|
|
|
|
) |
772
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("union(): could not create $dbdir/xf.*: $!"); |
773
|
0
|
0
|
|
|
|
|
$coldb->{xf}->union($coldb, [map {[@$_{qw(xf _union_ti2u)}]} @dbargs]) |
|
0
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not populate unigram index $dbdir/xf.*: $!"); |
775
|
|
|
|
|
|
|
} else { |
776
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): NOT creating unigram index $dbdir/xf.*; set index_xf=1 to enable"); |
777
|
|
|
|
|
|
|
} |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
##-- co-frequencies: populate |
780
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_cof}//1) { |
781
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating co-frequency index $dbdir/cof.* [fmin=$coldb->{cfmin}]"); |
782
|
|
|
|
|
|
|
$coldb->{cof} = DiaColloDB::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags, mmap=>$coldb->{mmap}, |
783
|
|
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date, |
784
|
|
|
|
|
|
|
dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin}, |
785
|
|
|
|
|
|
|
keeptmp=>$coldb->{keeptmp}, |
786
|
|
|
|
|
|
|
) |
787
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create(): failed to open co-frequency index $dbdir/cof.*: $!"); |
788
|
0
|
0
|
|
|
|
|
$coldb->{cof}->union($coldb, [map {[@$_{qw(cof _union_ti2u)}]} @dbargs]) |
|
0
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not populate co-frequency index $dbdir/cof.*: $!"); |
790
|
|
|
|
|
|
|
} else { |
791
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): NOT creating co-frequency index $dbdir/cof.*; set index_cof=1 to enable"); |
792
|
|
|
|
|
|
|
} |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
##-- tdf: populate |
795
|
0
|
|
|
|
|
|
my $db_tdf = !grep {!$_->{index_tdf}} @dbargs; |
|
0
|
|
|
|
|
|
|
796
|
0
|
|
0
|
|
|
|
$coldb->{index_tdf} //= $db_tdf; |
797
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_tdf} && $db_tdf) { |
798
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating (term x document) index $dbdir/tdf.*"); |
799
|
|
|
|
|
|
|
## |
800
|
0
|
|
|
|
|
|
my $tdfopts0 = $dbargs[0]{tdfopts}; |
801
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= {}; |
802
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= $tdfopts0->{$_} foreach (keys %$tdfopts0); ##-- tdf: inherit options |
803
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts}{$_} //= $TDF_OPTS{$_} foreach (keys %TDF_OPTS); ##-- tdf: default options |
804
|
|
|
|
|
|
|
## |
805
|
0
|
|
0
|
|
|
|
my $dbreak = ($coldb->{dbreak} // $dbargs[0]{dbreak} // '#file'); |
|
|
|
0
|
|
|
|
|
806
|
0
|
0
|
|
|
|
|
$dbreak = "#$dbreak" if ($dbreak !~ /^#/); |
807
|
0
|
|
|
|
|
|
$coldb->{dbreak} = $dbreak; |
808
|
|
|
|
|
|
|
## |
809
|
|
|
|
|
|
|
$coldb->{tdf} = DiaColloDB::Relation::TDF->union($coldb, \@dbargs, |
810
|
|
|
|
|
|
|
base => "$dbdir/tdf", |
811
|
|
|
|
|
|
|
flags => $flags, |
812
|
|
|
|
|
|
|
keeptmp => $coldb->{keeptmp}, |
813
|
0
|
0
|
|
|
|
|
%{$coldb->{tdfopts}}, |
|
0
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
) |
815
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to populate (term x document) index $dbdir/tdf.*: $!"); |
816
|
|
|
|
|
|
|
} else { |
817
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): NOT creating (term x document) index $dbdir/tdf.*; set index_tdf=1 on all argument DBs to enable"); |
818
|
|
|
|
|
|
|
} |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
##-- cleanup |
821
|
0
|
0
|
|
|
|
|
if (!$coldb->{keeptmp}) { |
822
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): cleaning up temporary files"); |
823
|
0
|
|
|
|
|
|
foreach $db (@dbargs) { |
824
|
0
|
|
|
|
|
|
foreach my $pfkey ('_union_ti2u', map {"_union_${_}i2u"} @$attrs) { |
|
0
|
|
|
|
|
|
|
825
|
0
|
0
|
|
|
|
|
$db->{$pfkey}->unlink() if ($db->{$pfkey}->can('unlink')); |
826
|
0
|
|
|
|
|
|
delete $db->{$pfkey}; |
827
|
|
|
|
|
|
|
} |
828
|
0
|
|
|
|
|
|
delete $db->{_union_argi}; |
829
|
|
|
|
|
|
|
} |
830
|
|
|
|
|
|
|
} |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
##-- save header |
833
|
|
|
|
|
|
|
$coldb->saveHeader() |
834
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("union(): failed to save header: $!"); |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
##-- all done |
837
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): union DB $dbdir created."); |
838
|
|
|
|
|
|
|
|
839
|
0
|
|
|
|
|
|
return $coldb; |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
1; ##-- be happy |