| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::methods::compile.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, top-level compile-time methods (create, union, etc.) |
|
5
|
|
|
|
|
|
|
## + really just adds methods to top-level DiaColloDB package |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
##-- dummy package |
|
8
|
|
|
|
|
|
|
package DiaColloDB::methods::compile; |
|
9
|
1
|
|
|
1
|
|
8
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
44
|
|
|
10
|
|
|
|
|
|
|
1; |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
package DiaColloDB; |
|
13
|
1
|
|
|
1
|
|
7
|
use vars qw($MMCLASS $ECLASS $XECLASS %TDF_OPTS $NJOBS); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
62
|
|
|
14
|
1
|
|
|
1
|
|
8
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
40
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
##============================================================================== |
|
17
|
|
|
|
|
|
|
## DiaColloDB: create/compile |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
20
|
|
|
|
|
|
|
## create: utils |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## \%line2undef = $coldb->loadFilterFile($filename_or_undef) |
|
23
|
|
|
|
|
|
|
## + now in DiaColloDB::Corpus::Filters (since v0.12.012_01); alias retained for compatibility |
|
24
|
1
|
|
|
1
|
|
835
|
BEGIN { *loadFilterFile = \&DiaColloDB::Corpus::Filters::loadListFile; } |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
## $filters = $coldb->corpusFilters() |
|
27
|
|
|
|
|
|
|
## + DiaColloDB::Corpus::Filters object from $coldb options |
|
28
|
|
|
|
|
|
|
sub corpusFilters { |
|
29
|
0
|
|
|
0
|
0
|
|
my $coldb = shift; |
|
30
|
0
|
|
|
|
|
|
return DiaColloDB::Corpus::Filters->new(map {($_=>$coldb->{$_})} |
|
|
0
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
@DiaColloDB::Corpus::Filters::NAMES, |
|
32
|
|
|
|
|
|
|
@DiaColloDB::Corpus::Filters::FILES); |
|
33
|
|
|
|
|
|
|
} |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
## $multimap = $coldb->create_multimap($base, \%ts2i, $packfmt, $label="multimap") |
|
36
|
|
|
|
|
|
|
sub create_multimap { |
|
37
|
0
|
|
|
0
|
0
|
|
my ($coldb,$base,$ts2i,$packfmt,$label) = @_; |
|
38
|
0
|
|
0
|
|
|
|
$label //= "multimap"; |
|
39
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate},"create_multimap(): creating $label $base.*"); |
|
40
|
|
|
|
|
|
|
|
|
41
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
|
42
|
0
|
|
|
|
|
|
my $pack_mmb = "${pack_id}*"; ##-- multimap target-set pack format |
|
43
|
0
|
|
|
|
|
|
my @v2ti = qw(); |
|
44
|
0
|
|
|
|
|
|
my ($t,$ti,$vi); |
|
45
|
0
|
|
|
|
|
|
while (($t,$ti)=each %$ts2i) { |
|
46
|
0
|
|
|
|
|
|
($vi) = unpack($packfmt,$t); |
|
47
|
0
|
|
|
|
|
|
$v2ti[$vi] .= pack($pack_id,$ti); |
|
48
|
|
|
|
|
|
|
} |
|
49
|
0
|
|
0
|
|
|
|
$_ = pack($pack_mmb, sort {$a<=>$b} unpack($pack_mmb,$_//'')) foreach (@v2ti); ##-- ensure multimap target-sets are sorted |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my $v2t = $coldb->mmclass($MMCLASS)->new(base=>$base, flags=>'rw', perms=>$coldb->{perms}, pack_i=>$pack_id, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_id}) |
|
52
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create_multimap(): failed to create $base.*: $!"); |
|
53
|
0
|
0
|
|
|
|
|
$v2t->fromArray(\@v2ti) |
|
54
|
|
|
|
|
|
|
or $coldb->logconfess("create_multimap(): failed to populate $base.*: $!"); |
|
55
|
0
|
0
|
|
|
|
|
$v2t->flush() |
|
56
|
|
|
|
|
|
|
or $coldb->logconfess("create_multimap(): failed to flush $base.*: $!"); |
|
57
|
|
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
return $v2t; |
|
59
|
|
|
|
|
|
|
} |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs() |
|
62
|
|
|
|
|
|
|
## \@attrs = $coldb->attrs($attrs=$coldb->{attrs}, $default=[]) |
|
63
|
|
|
|
|
|
|
## + parse attributes in $attrs as array |
|
64
|
|
|
|
|
|
|
sub attrs { |
|
65
|
0
|
|
|
0
|
0
|
|
my ($coldb,$attrs,$default) = @_; |
|
66
|
0
|
|
0
|
|
|
|
$attrs //= $coldb->{attrs} // $default // []; |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
return $attrs if (UNIVERSAL::isa($attrs,'ARRAY')); |
|
68
|
0
|
0
|
|
|
|
|
return [grep {defined($_) && $_ ne ''} split(/[\s\,]+/, $attrs)]; |
|
|
0
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
## $aname = $CLASS_OR_OBJECT->attrName($attr) |
|
72
|
|
|
|
|
|
|
## + returns canonical (short) attribute name for $attr |
|
73
|
|
|
|
|
|
|
## + supports aliases in %ATTR_ALIAS = ($alias=>$name, ...) |
|
74
|
|
|
|
|
|
|
## + see also: |
|
75
|
|
|
|
|
|
|
## %ATTR_RALIAS = ($name=>\@aliases, ...) |
|
76
|
|
|
|
|
|
|
## %ATTR_CBEXPR = ($name=>$ddcCountByExpr, ...) |
|
77
|
|
|
|
|
|
|
## %ATTR_TITLE = ($name_or_alias=>$title, ...) |
|
78
|
|
|
|
|
|
|
our (%ATTR_ALIAS,%ATTR_RALIAS,%ATTR_TITLE,%ATTR_CBEXPR); |
|
79
|
|
|
|
|
|
|
BEGIN { |
|
80
|
|
|
|
|
|
|
%ATTR_RALIAS = ( |
|
81
|
3
|
|
|
|
|
15
|
'l' => [map {(uc($_),ucfirst($_),$_)} qw(lemma lem l)], |
|
82
|
3
|
|
|
|
|
9
|
'w' => [map {(uc($_),ucfirst($_),$_)} qw(token word w)], |
|
83
|
5
|
|
|
|
|
26
|
'p' => [map {(uc($_),ucfirst($_),$_)} qw(postag tag pt pos p)], |
|
84
|
|
|
|
|
|
|
## |
|
85
|
|
|
|
|
|
|
'doc.collection' => [qw(doc.collection collection doc.corpus corpus)], |
|
86
|
|
|
|
|
|
|
'doc.textClass' => [qw(doc.textClass textClass textclass tc)], #doc.genre genre |
|
87
|
|
|
|
|
|
|
'doc.genre' => [qw(doc.genre genre doc.textClass0 textClass0 textclass0 tc0)], |
|
88
|
|
|
|
|
|
|
'doc.title' => [qw(doc.title title)], |
|
89
|
|
|
|
|
|
|
'doc.author' => [qw(doc.author author)], |
|
90
|
|
|
|
|
|
|
'doc.basename' => [qw(doc.basename basename)], |
|
91
|
|
|
|
|
|
|
'doc.bibl' => [qw(doc.bibl bibl)], |
|
92
|
|
|
|
|
|
|
'doc.flags' => [qw(doc.flags flags)], |
|
93
|
|
|
|
|
|
|
## |
|
94
|
2
|
|
|
|
|
7
|
date => [map {(uc($_),ucfirst($_),$_)} qw(date d)], |
|
95
|
1
|
|
|
1
|
|
6
|
slice => [map {(uc($_),ucfirst($_),$_)} qw(dslice slice sl ds s)], |
|
|
5
|
|
|
|
|
36
|
|
|
96
|
|
|
|
|
|
|
); |
|
97
|
1
|
|
|
|
|
8
|
%ATTR_ALIAS = (map {my $attr=$_; map {($_=>$attr)} @{$ATTR_RALIAS{$attr}}} keys %ATTR_RALIAS); |
|
|
13
|
|
|
|
|
22
|
|
|
|
13
|
|
|
|
|
17
|
|
|
|
78
|
|
|
|
|
181
|
|
|
|
13
|
|
|
|
|
24
|
|
|
98
|
1
|
|
|
|
|
7
|
%ATTR_TITLE = ( |
|
99
|
|
|
|
|
|
|
'l'=>'lemma', |
|
100
|
|
|
|
|
|
|
'w'=>'word', |
|
101
|
|
|
|
|
|
|
'p'=>'pos', |
|
102
|
|
|
|
|
|
|
); |
|
103
|
1
|
|
|
|
|
18
|
%ATTR_CBEXPR = ( |
|
104
|
|
|
|
|
|
|
'doc.textClass' => DDC::Any::CQCountKeyExprRegex->new(DDC::Any::CQCountKeyExprBibl->new('textClass'),':.*$',''), |
|
105
|
|
|
|
|
|
|
'doc.genre' => DDC::Any::CQCountKeyExprRegex->new(DDC::Any::CQCountKeyExprBibl->new('textClass'),':.*$',''), |
|
106
|
|
|
|
|
|
|
); |
|
107
|
|
|
|
|
|
|
} |
|
108
|
|
|
|
|
|
|
sub attrName { |
|
109
|
0
|
0
|
|
0
|
0
|
|
shift if (UNIVERSAL::isa($_[0],__PACKAGE__)); |
|
110
|
0
|
|
0
|
|
|
|
return $ATTR_ALIAS{($_[0]//'')} // $_[0]; |
|
|
|
|
0
|
|
|
|
|
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
## $atitle = $CLASS_OR_OBJECT->attrTitle($attr_or_alias) |
|
114
|
|
|
|
|
|
|
## + returns an attribute title for $attr_or_alias |
|
115
|
|
|
|
|
|
|
sub attrTitle { |
|
116
|
0
|
|
|
0
|
0
|
|
my ($that,$attr) = @_; |
|
117
|
0
|
|
0
|
|
|
|
$attr = $that->attrName($attr//''); |
|
118
|
0
|
0
|
|
|
|
|
return $ATTR_TITLE{$attr} if (exists($ATTR_TITLE{$attr})); |
|
119
|
0
|
|
|
|
|
|
$attr =~ s/^(?:doc|meta)\.//; |
|
120
|
0
|
|
|
|
|
|
return $attr; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
## $acbexpr = $CLASS_OR_OBJECT->attrCountBy($attr_or_alias,$matchid=0) |
|
124
|
|
|
|
|
|
|
sub attrCountBy { |
|
125
|
0
|
|
|
0
|
0
|
|
my ($that,$attr,$matchid) = @_; |
|
126
|
0
|
|
0
|
|
|
|
$attr = $that->attrName($attr//''); |
|
127
|
0
|
0
|
|
|
|
|
if (exists($ATTR_CBEXPR{$attr})) { |
|
128
|
|
|
|
|
|
|
##-- aliased attribute |
|
129
|
0
|
|
|
|
|
|
return $ATTR_CBEXPR{$attr}; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
0
|
0
|
|
|
|
|
if ($attr =~ /^doc\.(.*)$/) { |
|
132
|
|
|
|
|
|
|
##-- document attribute ("doc.ATTR" convention) |
|
133
|
0
|
|
|
|
|
|
return DDC::Any::CQCountKeyExprBibl->new($1); |
|
134
|
|
|
|
|
|
|
} else { |
|
135
|
|
|
|
|
|
|
##-- token attribute |
|
136
|
0
|
|
0
|
|
|
|
return DDC::Any::CQCountKeyExprToken->new($attr, ($matchid||0), 0); |
|
137
|
|
|
|
|
|
|
} |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
## $aquery_or_filter_or_undef = $CLASS_OR_OBJECT->attrQuery($attr_or_alias,$cquery) |
|
141
|
|
|
|
|
|
|
## + returns a CQuery or CQFilter object for condition $cquery on $attr_or_alias |
|
142
|
|
|
|
|
|
|
sub attrQuery { |
|
143
|
0
|
|
|
0
|
0
|
|
my ($that,$attr,$cquery) = @_; |
|
144
|
0
|
0
|
0
|
|
|
|
$attr = $that->attrName( $attr // ($cquery ? $cquery->getIndexName : undef) // '' ); |
|
|
|
|
0
|
|
|
|
|
|
145
|
0
|
0
|
|
|
|
|
if ($attr =~ /^doc\./) { |
|
146
|
|
|
|
|
|
|
##-- document attribute ("doc.ATTR" convention) |
|
147
|
0
|
|
|
|
|
|
return $that->query2filter($attr,$cquery); |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
##-- token condition (use literal $cquery) |
|
150
|
0
|
|
|
|
|
|
return $cquery; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData() |
|
154
|
|
|
|
|
|
|
## \@attrdata = $coldb->attrData(\@attrs=$coldb->attrs) |
|
155
|
|
|
|
|
|
|
## + get attribute data for \@attrs |
|
156
|
|
|
|
|
|
|
## + return @attrdata = ({a=>$attr, i=>$i, enum=>$aenum, pack_t=>$pack_xa, a2t=>$a2t, ...}) |
|
157
|
|
|
|
|
|
|
sub attrData { |
|
158
|
0
|
|
|
0
|
0
|
|
my ($coldb,$attrs) = @_; |
|
159
|
0
|
|
0
|
|
|
|
$attrs //= $coldb->attrs; |
|
160
|
0
|
|
|
|
|
|
my ($attr); |
|
161
|
|
|
|
|
|
|
return [map { |
|
162
|
0
|
|
|
|
|
|
$attr = $coldb->attrName($attrs->[$_]); |
|
|
0
|
|
|
|
|
|
|
|
163
|
0
|
|
|
|
|
|
{i=>$_, a=>$attr, enum=>$coldb->{"${attr}enum"}, pack_t=>$coldb->{"pack_t$attr"}, a2t=>$coldb->{"${attr}2t"}} |
|
164
|
|
|
|
|
|
|
} (0..$#$attrs)]; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
## $bool = $coldb->hasAttr($attr) |
|
168
|
|
|
|
|
|
|
sub hasAttr { |
|
169
|
0
|
0
|
|
0
|
0
|
|
return 0 if (!defined($_[1])); |
|
170
|
0
|
|
0
|
|
|
|
return $_[1] ne 'x' && defined($_[0]{$_[0]->attrName($_[1]).'enum'}); |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
175
|
|
|
|
|
|
|
## create: from corpus |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
## $bool = $coldb->create($corpus,%opts) |
|
178
|
|
|
|
|
|
|
## + %opts: |
|
179
|
|
|
|
|
|
|
## $key => $val, ##-- clobbers $coldb->{$key} |
|
180
|
|
|
|
|
|
|
sub create { |
|
181
|
0
|
|
|
0
|
1
|
|
my ($coldb,$corpus,%opts) = @_; |
|
182
|
0
|
0
|
|
|
|
|
$coldb = $coldb->new() if (!ref($coldb)); |
|
183
|
0
|
|
|
|
|
|
@$coldb{keys %opts} = values %opts; |
|
184
|
0
|
|
|
|
|
|
my $flags = O_RDWR|O_CREAT|O_TRUNC; |
|
185
|
0
|
|
|
|
|
|
my $debug = $coldb->{debug}; |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
##-- initialize: output directory |
|
188
|
|
|
|
|
|
|
my $dbdir = $coldb->{dbdir} |
|
189
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create() called but 'dbdir' key not set!"); |
|
190
|
0
|
|
|
|
|
|
$dbdir =~ s{/$}{}; |
|
191
|
0
|
|
|
|
|
|
$coldb->vlog('info', "create($dbdir) v$coldb->{version}"); |
|
192
|
0
|
0
|
0
|
|
|
|
!-d $dbdir |
|
193
|
|
|
|
|
|
|
or remove_tree($dbdir) |
|
194
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not remove stale $dbdir: $!"); |
|
195
|
0
|
0
|
|
|
|
|
make_path($dbdir) |
|
196
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not create DB directory $dbdir: $!"); |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
##-- initialize: tdf |
|
199
|
0
|
|
0
|
|
|
|
$coldb->{index_tdf} //= 1; |
|
200
|
0
|
0
|
|
|
|
|
if ($coldb->{index_tdf}) { |
|
201
|
0
|
0
|
|
|
|
|
if (!require "DiaColloDB/Relation/TDF.pm") { |
|
202
|
0
|
0
|
|
|
|
|
$coldb->logwarn("create(): require failed for DiaColloDB/Relation/TDF.pm ; (term x document) matrix modelling disabled", ($@ ? "\n: $@" : '')); |
|
203
|
0
|
|
|
|
|
|
$coldb->{index_tdf} = 0; |
|
204
|
|
|
|
|
|
|
} else { |
|
205
|
0
|
|
|
|
|
|
$coldb->info("(term x document) matrix modelling via DiaColloDB::Relation::TDF enabled."); |
|
206
|
|
|
|
|
|
|
} |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
##-- initialize: attributes |
|
210
|
0
|
|
|
|
|
|
my $attrs = $coldb->{attrs} = [map {$coldb->attrName($_)} @{$coldb->attrs(undef,['l'])}]; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
##-- pack-formats |
|
213
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
|
214
|
0
|
|
|
|
|
|
my $pack_date = $coldb->{pack_date}; |
|
215
|
0
|
|
|
|
|
|
my $pack_f = $coldb->{pack_f}; |
|
216
|
0
|
|
|
|
|
|
my $pack_off = $coldb->{pack_off}; |
|
217
|
0
|
|
|
|
|
|
my $pack_len = $coldb->{pack_len}; |
|
218
|
0
|
|
|
|
|
|
my $pack_t = $coldb->{pack_t} = $pack_id."[".scalar(@$attrs)."]"; |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
##-- initialize: common flags |
|
221
|
0
|
|
|
|
|
|
my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len}); |
|
222
|
0
|
|
|
|
|
|
my %mmopts = (flags=>$flags, pack_i=>$coldb->{pack_id}); |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
##-- initialize: attribute enums |
|
225
|
0
|
|
|
|
|
|
my $aconf = []; ##-- [{a=>$attr, i=>$i, enum=>$aenum, pack_t=>$pack_ta, s2i=>\%s2i, ns=>$nstrings, ?i2j=>$pftmp, ...}, ] |
|
226
|
0
|
|
|
|
|
|
my $axpos = 0; |
|
227
|
0
|
|
|
|
|
|
my ($attr,$ac); |
|
228
|
0
|
|
|
|
|
|
foreach (0..$#$attrs) { |
|
229
|
0
|
|
|
|
|
|
push(@$aconf,$ac={i=>$_, a=>($attr=$attrs->[$_])}); |
|
230
|
0
|
|
|
|
|
|
$ac->{enum} = $coldb->{"${attr}enum"} = $coldb->mmclass($ECLASS)->new(%efopts); |
|
231
|
0
|
|
|
|
|
|
$ac->{pack_t} = $coldb->{"pack_t$attr"} = '@'.$axpos.$pack_id; |
|
232
|
0
|
|
|
|
|
|
$ac->{s2i} = $ac->{enum}{s2i}; |
|
233
|
0
|
0
|
|
|
|
|
$ac->{ma} = $1 if ($attr =~ /^(?:meta|doc)\.(.*)$/); |
|
234
|
0
|
|
|
|
|
|
$axpos += packsize($pack_id); |
|
235
|
|
|
|
|
|
|
} |
|
236
|
0
|
|
|
|
|
|
my @aconfm = grep { defined($_->{ma})} @$aconf; ##-- meta-attributes |
|
|
0
|
|
|
|
|
|
|
|
237
|
0
|
|
|
|
|
|
my @aconfw = grep {!defined($_->{ma})} @$aconf; ##-- token-attributes |
|
|
0
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
##-- initialize: tuple enum (+dates) |
|
240
|
0
|
|
|
|
|
|
my $tenum = $coldb->{tenum} = $coldb->mmclass($XECLASS)->new(%efopts, pack_s=>$pack_t); |
|
241
|
0
|
|
|
|
|
|
my $ts2i = $tenum->{s2i}; |
|
242
|
0
|
|
|
|
|
|
my $nt = 0; |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
##-- initialize: corpus token-list (temporary) |
|
245
|
|
|
|
|
|
|
## + 1 token/line, blank lines ~ EOS, token lines ~ "$a0i $a1i ... $aNi $date" |
|
246
|
0
|
|
|
|
|
|
my $atokfile = "$dbdir/atokens.dat"; |
|
247
|
0
|
0
|
|
|
|
|
CORE::open(my $atokfh, ">:raw", $atokfile) |
|
248
|
|
|
|
|
|
|
or $coldb->logconfess("$0: open failed for $atokfile: $!"); |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
##-- initialize: tdf: doc-data array (temporary) |
|
251
|
0
|
|
|
|
|
|
my ($docmeta,$docoff); |
|
252
|
0
|
|
|
|
|
|
my $ndocs = 0; ##-- current size of @$docmeta, @$docoff |
|
253
|
0
|
|
|
|
|
|
my $index_tdf = $coldb->{index_tdf}; |
|
254
|
0
|
0
|
|
|
|
|
if ($index_tdf) { |
|
255
|
0
|
0
|
|
|
|
|
$docmeta = $coldb->{docmeta} = tmparray("$dbdir/docmeta", UNLINK=>!$coldb->{keeptmp}, pack_o=>'J', pack_l=>'J') |
|
256
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not tie temporary doc-data array to $dbdir/docmeta.*: $!"); |
|
257
|
|
|
|
|
|
|
$docoff = $coldb->{docoff} = tmparrayp("$dbdir/docoff", 'J', UNLINK=>!$coldb->{keeptmp}) |
|
258
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create(): could not tie temporary doc-offset array to $dbdir/docoff.*: $!"); |
|
259
|
|
|
|
|
|
|
} |
|
260
|
0
|
|
0
|
|
|
|
my $dbreak = ($coldb->{dbreak} // '#file'); |
|
261
|
0
|
0
|
|
|
|
|
$dbreak = "#$dbreak" if ($dbreak !~ /^#/); |
|
262
|
0
|
|
|
|
|
|
$coldb->{dbreak} = $dbreak; |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
##-- initialize: pre-compile corpus |
|
265
|
0
|
0
|
|
|
|
|
if (!UNIVERSAL::isa($corpus,'DiaColloDB::Corpus::Compiled')) { |
|
266
|
0
|
|
|
|
|
|
$coldb->vlog('info', "create(): pre-compiling & filtering corpus to $dbdir/corpus.d/"); |
|
267
|
|
|
|
|
|
|
$corpus = $corpus->compile("$dbdir/corpus.d", |
|
268
|
|
|
|
|
|
|
njobs=>$NJOBS, |
|
269
|
|
|
|
|
|
|
filters=>$coldb->corpusFilters, |
|
270
|
|
|
|
|
|
|
logFileN=>max2(1,$corpus->size/10), |
|
271
|
|
|
|
|
|
|
temp=>!$coldb->{keeptmp} |
|
272
|
|
|
|
|
|
|
) |
|
273
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("failed to pre-compile corpus to $dbdir/corpus.d/"); |
|
274
|
|
|
|
|
|
|
} else { |
|
275
|
0
|
|
|
|
|
|
$coldb->vlog('info', "create(): using pre-compiled corpus ".$corpus->dbdir.'/'); |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
##-- always use pre-compiled corpus filters -- but warn about overrides |
|
278
|
0
|
|
|
|
|
|
my ($cfilters,$dbfilters) = ($corpus->filters,$coldb->corpusFilters); |
|
279
|
0
|
|
|
|
|
|
foreach my $key (@DiaColloDB::Corpus::Filters::NAMES,@DiaColloDB::Corpus::Filters::FILES) { |
|
280
|
0
|
0
|
0
|
|
|
|
if (($dbfilters->{$key}//'') ne ($cfilters->{$key}//'')) { |
|
|
|
|
0
|
|
|
|
|
|
281
|
0
|
|
0
|
|
|
|
$coldb->warn("create(): WARNING: pre-compiled corpus filter $key=".($cfilters->{$key}//'(null)')." overrides user request=".($dbfilters->{$key}//'(null)')); |
|
|
|
|
0
|
|
|
|
|
|
282
|
0
|
|
|
|
|
|
$coldb->{$key} = $cfilters->{$key}; |
|
283
|
|
|
|
|
|
|
} |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
##-- initialize: logging |
|
288
|
0
|
|
|
|
|
|
my $nfiles = $corpus->size(); |
|
289
|
0
|
|
0
|
|
|
|
my $logFileN = $coldb->{logCorpusFileN} // max2(1,int($nfiles/20)); |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
##-- initialize: enums, date-range |
|
292
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate},"create(): processing $nfiles corpus file(s)"); |
|
293
|
0
|
|
|
|
|
|
my ($xdmin,$xdmax) = ('inf','-inf'); |
|
294
|
0
|
|
|
|
|
|
my ($doc, $date,$tok,@ais,$aistr,$t,$ti, $nsigs, $filei, $last_was_eos); |
|
295
|
0
|
|
|
|
|
|
my $docoff_cur = -1; |
|
296
|
0
|
|
|
|
|
|
my $toki = 0; |
|
297
|
0
|
|
|
|
|
|
for ($corpus->ibegin(); $corpus->iok; $corpus->inext) { |
|
298
|
0
|
|
|
|
|
|
$doc = $corpus->idocument(); |
|
299
|
|
|
|
|
|
|
$coldb->vlog($coldb->{logCorpusFile}, |
|
300
|
0
|
0
|
0
|
|
|
|
sprintf("create(): processing files [%3.0f%%]: %s", 100*($filei-1)/$nfiles, ($doc->{label} || $corpus->ifile))) |
|
|
|
|
0
|
|
|
|
|
|
301
|
|
|
|
|
|
|
if ($logFileN && ($filei++ % $logFileN)==0); |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
##-- initalize tdf data (#/sigs) |
|
304
|
0
|
|
|
|
|
|
$nsigs = 0; |
|
305
|
0
|
|
|
|
|
|
$docoff_cur=$toki; |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
##-- get date-range |
|
308
|
0
|
|
|
|
|
|
$date = $doc->{date}; |
|
309
|
0
|
0
|
|
|
|
|
$xdmin = $date if ($date < $xdmin); |
|
310
|
0
|
0
|
|
|
|
|
$xdmax = $date if ($date > $xdmax); |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
##-- get meta-attributes |
|
313
|
0
|
|
|
|
|
|
@ais = qw(); |
|
314
|
0
|
|
0
|
|
|
|
$ais[$_->{i}] = ($_->{s2i}{$doc->{meta}{$_->{ma}}} //= ++$_->{ns}) foreach (@aconfm); |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
##-- iterate over tokens, populating initial attribute-enums and writing $atokfile |
|
317
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
|
318
|
0
|
|
|
|
|
|
foreach $tok (@{$doc->{tokens}}) { |
|
|
0
|
|
|
|
|
|
|
|
319
|
0
|
0
|
0
|
|
|
|
if (ref($tok)) { |
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
320
|
|
|
|
|
|
|
##-- normal token: get attribute value-ids and build tuple |
|
321
|
0
|
|
0
|
|
|
|
$ais[$_->{i}] = ($_->{s2i}{$tok->{$_->{a}//''}} //= ++$_->{ns}) foreach (@aconfw); |
|
|
|
|
0
|
|
|
|
|
|
322
|
0
|
|
|
|
|
|
$aistr = join(' ',@ais); |
|
323
|
|
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
$atokfh->print("$aistr $date\n"); |
|
325
|
0
|
|
|
|
|
|
$last_was_eos = 0; |
|
326
|
0
|
|
|
|
|
|
++$toki; |
|
327
|
|
|
|
|
|
|
} |
|
328
|
|
|
|
|
|
|
elsif (!defined($tok) && !$last_was_eos) { |
|
329
|
|
|
|
|
|
|
##-- eos |
|
330
|
0
|
|
|
|
|
|
$atokfh->print("\n"); |
|
331
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
elsif (defined($tok) && $tok eq $dbreak && $docoff && $docoff_cur < $toki) { |
|
334
|
|
|
|
|
|
|
##-- break:tdf |
|
335
|
0
|
|
|
|
|
|
++$nsigs; |
|
336
|
0
|
|
|
|
|
|
push(@$docoff, $docoff_cur); |
|
337
|
0
|
|
|
|
|
|
$docoff_cur = $toki; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
##-- store final doc-break (for tdf) |
|
342
|
0
|
0
|
0
|
|
|
|
if ($docoff && $docoff_cur < $toki) { |
|
343
|
0
|
|
|
|
|
|
++$nsigs; |
|
344
|
0
|
|
|
|
|
|
push(@$docoff, $docoff_cur); |
|
345
|
0
|
|
|
|
|
|
$docoff_cur = $toki; |
|
346
|
|
|
|
|
|
|
} |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
##-- store doc-data (for tdf) |
|
349
|
0
|
0
|
|
|
|
|
if ($docmeta) { |
|
350
|
|
|
|
|
|
|
push(@$docmeta, { |
|
351
|
|
|
|
|
|
|
id => $ndocs++, |
|
352
|
|
|
|
|
|
|
nsigs => $nsigs, |
|
353
|
|
|
|
|
|
|
file => $corpus->ifile, |
|
354
|
0
|
|
|
|
|
|
(map {($_=>$doc->{$_})} qw(meta date label)), |
|
|
0
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
}) |
|
356
|
|
|
|
|
|
|
} |
|
357
|
|
|
|
|
|
|
} |
|
358
|
|
|
|
|
|
|
##-- store final pseudo-doc offset (total #/tokens) |
|
359
|
0
|
0
|
|
|
|
|
push(@$docoff, $toki) if ($docoff); |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
##-- store date-range |
|
362
|
0
|
|
|
|
|
|
@$coldb{qw(xdmin xdmax)} = ($xdmin,$xdmax); |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
##-- close temporary attribute-token file(s) |
|
365
|
0
|
0
|
|
|
|
|
CORE::close($atokfh) |
|
366
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary token storage file '$atokfile': $!"); |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
##-- close/free temporary corpus |
|
369
|
0
|
0
|
|
|
|
|
undef $corpus if ($corpus->{temp}); |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
##-- filter: by attribute frequency |
|
372
|
0
|
|
|
|
|
|
my $ibad = unpack($pack_id,pack($pack_id,-1)); |
|
373
|
0
|
|
|
|
|
|
foreach $ac (@$aconf) { |
|
374
|
0
|
|
0
|
|
|
|
my $afmin = $coldb->{"fmin_".$ac->{a}} // ''; |
|
375
|
0
|
0
|
0
|
|
|
|
$afmin = $coldb->{tfmin} // 0 if (($afmin//'') eq ''); |
|
|
|
|
0
|
|
|
|
|
|
376
|
0
|
0
|
|
|
|
|
next if ($afmin <= 0); |
|
377
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): building attribute frequency filter (fmin_$ac->{a}=$afmin)"); |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
##-- filter: by attribute frequency: setup re-numbering map $ac->{i2j} |
|
380
|
0
|
|
|
|
|
|
my $i2j = $ac->{i2j} = tmparrayp("$dbdir/i2j_$ac->{a}.tmp", 'J', UNLINK=>!$coldb->{keeptmp}); |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
##-- filter: by attribute frequency: populate $ac->{i2j} and update $ac->{s2i} |
|
383
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
|
384
|
0
|
|
|
|
|
|
my $ai1 = $ac->{i}+1; |
|
385
|
0
|
0
|
|
|
|
|
my $cmdfh = opencmd("sort -nk$ai1 $atokfile ".sortJobs()." | cut -d\" \" -f $ai1 | uniq -c |") |
|
386
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to open pipe from sort for attribute frequency filter (fmin_$ac->{a}=$afmin)"); |
|
387
|
0
|
|
|
|
|
|
my ($f,$i); |
|
388
|
0
|
|
|
|
|
|
my $nj = 0; |
|
389
|
0
|
|
|
|
|
|
while (defined($_=<$cmdfh>)) { |
|
390
|
0
|
|
|
|
|
|
chomp; |
|
391
|
0
|
|
|
|
|
|
($f,$i) = split(' ',$_,2); |
|
392
|
0
|
0
|
|
|
|
|
$i2j->[$i] = ($f >= $afmin ? ++$nj : $ibad) if ($i) |
|
|
|
0
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
} |
|
394
|
0
|
|
|
|
|
|
$cmdfh->close(); |
|
395
|
0
|
|
|
|
|
|
env_pop(); |
|
396
|
|
|
|
|
|
|
|
|
397
|
0
|
|
|
|
|
|
my $nabad = $ac->{ns} - $nj; |
|
398
|
0
|
0
|
|
|
|
|
my $pabad = $ac->{ns} ? sprintf("%.2f%%", 100*$nabad/$ac->{ns}) : 'nan%'; |
|
399
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): filter (fmin_$ac->{a}=$afmin) pruning $nabad of $ac->{ns} attribute value type(s) ($pabad)"); |
|
400
|
|
|
|
|
|
|
|
|
401
|
0
|
|
|
|
|
|
tied(@$i2j)->flush; |
|
402
|
0
|
|
|
|
|
|
my $s2i = $ac->{s2i}; |
|
403
|
0
|
|
|
|
|
|
my ($s,$j,@badkeys); |
|
404
|
0
|
|
|
|
|
|
while (($s,$i)=each %$s2i) { |
|
405
|
0
|
0
|
|
|
|
|
if (($j=$i2j->[$i])==$ibad) { |
|
406
|
0
|
|
|
|
|
|
delete $s2i->{$s}; |
|
407
|
|
|
|
|
|
|
} else { |
|
408
|
0
|
|
|
|
|
|
$s2i->{$s} = $j; |
|
409
|
|
|
|
|
|
|
} |
|
410
|
|
|
|
|
|
|
} |
|
411
|
0
|
|
|
|
|
|
$ac->{ns} = $nj; |
|
412
|
0
|
|
|
|
|
|
tied(@$i2j)->flush; |
|
413
|
|
|
|
|
|
|
} |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
##-- filter: terms: populate $ts2t (map IDs) |
|
416
|
|
|
|
|
|
|
## + $ts2t = { join(' ',@ais) => pack($pack_t,i2j(@ais)), ...} |
|
417
|
|
|
|
|
|
|
## + includes attribute-id re-mappings |
|
418
|
|
|
|
|
|
|
## + only populated if we have any frequency filters active |
|
419
|
0
|
|
|
|
|
|
my $ts2t = undef; |
|
420
|
0
|
|
0
|
|
|
|
my $tfmin = $coldb->{tfmin}//0; |
|
421
|
0
|
0
|
0
|
|
|
|
if ($tfmin > 0 || grep {defined($_->{i2j})} @$aconf) { |
|
|
0
|
|
|
|
|
|
|
|
422
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): populating global term enum (tfmin=$tfmin)"); |
|
423
|
0
|
0
|
|
|
|
|
my @ai2j = map {defined($_->{i2j}) ? $_->{i2j} : undef} @$aconf; |
|
|
0
|
|
|
|
|
|
|
|
424
|
0
|
|
|
|
|
|
my @ai2ji = grep {defined($ai2j[$_])} (0..$#ai2j); |
|
|
0
|
|
|
|
|
|
|
|
425
|
0
|
|
|
|
|
|
my $na = scalar(@$attrs); |
|
426
|
0
|
|
|
|
|
|
my ($nw0,$nw) = (0,0); |
|
427
|
0
|
|
|
|
|
|
my ($f); |
|
428
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
|
429
|
|
|
|
|
|
|
my $cmdfh = |
|
430
|
0
|
0
|
|
|
|
|
opencmd("sort ".join(' ', map {"-nk$_"} (1..$na))." ".sortJobs()." $atokfile | cut -d\" \" -f -$na | uniq -c |") |
|
|
0
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to open pipe from sort for global term filter"); |
|
432
|
|
|
|
|
|
|
FILTER_WTUPLES: |
|
433
|
0
|
|
|
|
|
|
while (defined($_=<$cmdfh>)) { |
|
434
|
0
|
|
|
|
|
|
chomp; |
|
435
|
0
|
|
|
|
|
|
++$nw0; |
|
436
|
0
|
|
|
|
|
|
($f,$aistr) = split(' ',$_,2); |
|
437
|
0
|
0
|
0
|
|
|
|
next if (!$aistr || $f < $tfmin); |
|
438
|
0
|
|
|
|
|
|
@ais = split(' ',$aistr,$na); |
|
439
|
0
|
|
|
|
|
|
foreach (@ai2ji) { |
|
440
|
|
|
|
|
|
|
##-- apply attribute-wise re-mappings |
|
441
|
0
|
|
0
|
|
|
|
$ais[$_] = $ai2j[$_][$ais[$_]//0]; |
|
442
|
0
|
0
|
|
|
|
|
next FILTER_WTUPLES if ($ais[$_] == $ibad); |
|
443
|
|
|
|
|
|
|
} |
|
444
|
0
|
|
|
|
|
|
$ts2t->{$aistr} = pack($pack_t,@ais); |
|
445
|
0
|
|
|
|
|
|
++$nw; |
|
446
|
|
|
|
|
|
|
} |
|
447
|
0
|
|
|
|
|
|
$cmdfh->close(); |
|
448
|
0
|
|
|
|
|
|
env_pop(); |
|
449
|
|
|
|
|
|
|
|
|
450
|
0
|
|
|
|
|
|
my $nwbad = $nw0 - $nw; |
|
451
|
0
|
0
|
|
|
|
|
my $pwbad = $nw0 ? sprintf("%.2f%%", 100*$nwbad/$nw0) : 'nan%'; |
|
452
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): will prune $nwbad of $nw0 term tuple type(s) ($pwbad)"); |
|
453
|
|
|
|
|
|
|
} |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
##-- compile: apply filters & assign term-ids |
|
456
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): filtering corpus tokens & assigning term-IDs"); |
|
457
|
0
|
|
|
|
|
|
my $tokfile = "$dbdir/tokens.dat"; ##-- v0.10.x: new format: "TID DATE\n" | "\n" |
|
458
|
0
|
0
|
|
|
|
|
CORE::open(my $tokfh, ">:raw", $tokfile) |
|
459
|
|
|
|
|
|
|
or $coldb->logconfess("$0: open failed for $tokfile: $!"); |
|
460
|
0
|
|
|
|
|
|
my $vtokfile = "$dbdir/vtokens.bin"; |
|
461
|
0
|
0
|
|
|
|
|
CORE::open(my $vtokfh, ">:raw", $vtokfile) ##-- format: pack($pack_t,@ais) |
|
462
|
|
|
|
|
|
|
or $coldb->logconfess("$0: open failed for $vtokfile: $!"); |
|
463
|
0
|
0
|
|
|
|
|
CORE::open($atokfh, "<:raw", $atokfile) |
|
464
|
|
|
|
|
|
|
or $coldb->logconfess("$0: re-open failed for $atokfile: $!"); |
|
465
|
0
|
|
|
|
|
|
$nt = 0; |
|
466
|
0
|
|
|
|
|
|
my $ntok_in = $toki; |
|
467
|
0
|
|
|
|
|
|
my ($toki_in,$toki_out) = (0,0); |
|
468
|
0
|
|
|
|
|
|
my $doci_cur = 0; |
|
469
|
0
|
0
|
|
|
|
|
tied(@$docoff)->flush() if ($docoff); |
|
470
|
0
|
0
|
|
|
|
|
my $docoff_in = $docoff ? $docoff->[$doci_cur] : -1; |
|
471
|
0
|
|
|
|
|
|
while (defined($_=<$atokfh>)) { |
|
472
|
0
|
|
|
|
|
|
chomp; |
|
473
|
0
|
0
|
|
|
|
|
if ($_) { |
|
474
|
0
|
0
|
|
|
|
|
if ($toki_in == $docoff_in) { |
|
475
|
|
|
|
|
|
|
##-- update break-indices for tdf |
|
476
|
|
|
|
|
|
|
|
|
477
|
0
|
0
|
|
|
|
|
if ($debug) { |
|
478
|
|
|
|
|
|
|
##-- BUGHUNT/Birmingham: weird errors around here: Tue, 05 Jul 2016 09:27:11 +0200 |
|
479
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): \$doci_cur not defined at \$atokfh line ", $atokfh->input_line_number) |
|
480
|
|
|
|
|
|
|
if (!defined($doci_cur)); |
|
481
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): \$toki_out not defined at \$atokfh line ", $atokfh->input_line_number) |
|
482
|
|
|
|
|
|
|
if (!defined($toki_out)); |
|
483
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): \$docoff->[\$doci_cur=$doci_cur] not defined at \$atokfh line ", $atokfh->input_line_number) |
|
484
|
|
|
|
|
|
|
if (!defined($docoff->[$doci_cur])); |
|
485
|
0
|
0
|
|
|
|
|
$coldb->logconfess("create(): next \$docoff_in=\$docoff->[++(\$doci_cur=$doci_cur)] not defined at \$atokfh line ", $atokfh->input_line_number) |
|
486
|
|
|
|
|
|
|
if (!defined($docoff->[$doci_cur+1])); |
|
487
|
|
|
|
|
|
|
##--/BUGHUNT |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
|
|
490
|
0
|
|
|
|
|
|
$docoff->[$doci_cur] = $toki_out; |
|
491
|
0
|
|
|
|
|
|
$docoff_in = $docoff->[++$doci_cur]; |
|
492
|
|
|
|
|
|
|
} |
|
493
|
0
|
|
|
|
|
|
++$toki_in; |
|
494
|
0
|
0
|
|
|
|
|
$date = $1 if (s/ ([0-9]+)$//); |
|
495
|
0
|
0
|
|
|
|
|
if (defined($ts2t)) { |
|
496
|
0
|
0
|
|
|
|
|
next if (!defined($t=$ts2t->{$_})); |
|
497
|
|
|
|
|
|
|
} else { |
|
498
|
0
|
|
|
|
|
|
$t = pack($pack_t, split(' ',$_)); |
|
499
|
|
|
|
|
|
|
} |
|
500
|
0
|
0
|
|
|
|
|
$ti = $ts2i->{$t} = ++$nt if (!defined($ti=$ts2i->{$t})); |
|
501
|
0
|
|
|
|
|
|
$tokfh->print($ti, "\t", $date, "\n"); |
|
502
|
0
|
|
|
|
|
|
$vtokfh->print($t); |
|
503
|
0
|
|
|
|
|
|
++$toki_out; |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
else { |
|
506
|
0
|
|
|
|
|
|
$tokfh->print("\n"); |
|
507
|
|
|
|
|
|
|
} |
|
508
|
|
|
|
|
|
|
} |
|
509
|
|
|
|
|
|
|
##-- update any trailing tdf break indices |
|
510
|
0
|
0
|
|
|
|
|
if ($docoff) { |
|
511
|
0
|
|
|
|
|
|
$ndocs = $#$docoff; |
|
512
|
0
|
|
|
|
|
|
for (; $doci_cur <= $ndocs; ++$doci_cur) { |
|
513
|
0
|
|
|
|
|
|
$docoff->[$doci_cur] = $toki_out; |
|
514
|
|
|
|
|
|
|
} |
|
515
|
0
|
|
|
|
|
|
tied(@$docoff)->flush(); |
|
516
|
|
|
|
|
|
|
} |
|
517
|
|
|
|
|
|
|
|
|
518
|
0
|
0
|
|
|
|
|
CORE::close($atokfh) |
|
519
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary attribute-token-file $atokfile: $!"); |
|
520
|
0
|
0
|
|
|
|
|
CORE::close($tokfh) |
|
521
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary token-file $tokfile: $!"); |
|
522
|
0
|
0
|
|
|
|
|
CORE::close($vtokfh) |
|
523
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to close temporary tdf-token-file $vtokfile: $!"); |
|
524
|
0
|
|
|
|
|
|
my $ntok_out = $toki_out; |
|
525
|
0
|
0
|
|
|
|
|
my $ptokbad = $ntok_in ? sprintf("%.2f%%",100*($ntok_in-$ntok_out)/$ntok_in) : 'nan%'; |
|
526
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): assigned $nt term tuple-IDs to $ntok_out of $ntok_in tokens (pruned $ptokbad)"); |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
##-- cleanup: drop $aconf->[$ai]{i2j} now that we've used it |
|
529
|
0
|
|
|
|
|
|
delete($_->{i2j}) foreach (@$aconf); |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
##-- compile: tenum |
|
532
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): creating tuple-enum $dbdir/tenum.*"); |
|
533
|
0
|
|
|
|
|
|
$tenum->fromHash($ts2i); |
|
534
|
0
|
0
|
|
|
|
|
$tenum->save("$dbdir/tenum") |
|
535
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to save $dbdir/tenum.*: $!"); |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
##-- compile: by attribute |
|
538
|
0
|
|
|
|
|
|
foreach $ac (@$aconf) { |
|
539
|
|
|
|
|
|
|
##-- compile: by attribte: enum |
|
540
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate},"create(): creating enum $dbdir/$ac->{a}_enum.*"); |
|
541
|
0
|
|
|
|
|
|
$ac->{enum}->fromHash($ac->{s2i}); |
|
542
|
0
|
0
|
|
|
|
|
$ac->{enum}->save("$dbdir/$ac->{a}_enum") |
|
543
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to save $dbdir/$ac->{a}_enum: $!"); |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
##-- compile: by attribute: expansion multimaps (+dates) |
|
546
|
0
|
|
|
|
|
|
$coldb->create_multimap("$dbdir/$ac->{a}_2t",$ts2i,$ac->{pack_t},"attribute expansion multimap"); |
|
547
|
|
|
|
|
|
|
} |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
##-- compute unigrams |
|
550
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_xf}//1) { |
|
551
|
0
|
|
|
|
|
|
$coldb->info("creating unigram index $dbdir/xf.*"); |
|
552
|
|
|
|
|
|
|
my $xfdb = $coldb->{xf} = DiaColloDB::Relation::Unigrams->new(base=>"$dbdir/xf", flags=>$flags, mmap=>$coldb->{mmap}, |
|
553
|
0
|
0
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date) |
|
554
|
|
|
|
|
|
|
or $coldb->logconfess("create(): could not create $dbdir/xf.*: $!"); |
|
555
|
0
|
0
|
|
|
|
|
$xfdb->create($coldb, $tokfile) |
|
556
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to create unigram index: $!"); |
|
557
|
|
|
|
|
|
|
} else { |
|
558
|
0
|
|
|
|
|
|
$coldb->info("NOT creating unigram index $dbdir/xf.*; set index_xf=1 to enable"); |
|
559
|
|
|
|
|
|
|
} |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
##-- compute collocation frequencies |
|
562
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_cof}//1) { |
|
563
|
0
|
|
|
|
|
|
$coldb->info("creating co-frequency index $dbdir/cof.* [dmax=$coldb->{dmax}, fmin=$coldb->{cfmin}]"); |
|
564
|
|
|
|
|
|
|
my $cof = $coldb->{cof} = DiaColloDB::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags, mmap=>$coldb->{mmap}, |
|
565
|
|
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date, |
|
566
|
|
|
|
|
|
|
dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin}, |
|
567
|
|
|
|
|
|
|
keeptmp=>$coldb->{keeptmp}, |
|
568
|
|
|
|
|
|
|
) |
|
569
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create(): failed to create co-frequency index $dbdir/cof.*: $!"); |
|
570
|
0
|
0
|
|
|
|
|
$cof->create($coldb, $tokfile) |
|
571
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to create co-frequency index: $!"); |
|
572
|
|
|
|
|
|
|
} else { |
|
573
|
0
|
|
|
|
|
|
$coldb->info("NOT creating co-frequency index $dbdir/cof.*; set index_cof=1 to enable"); |
|
574
|
|
|
|
|
|
|
} |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
##-- create tdf-model (if requested & available) |
|
577
|
0
|
0
|
|
|
|
|
if ($coldb->{index_tdf}) { |
|
578
|
0
|
|
|
|
|
|
$coldb->info("creating (term x document) index $dbdir/tdf* [dbreak=$dbreak]"); |
|
579
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= {}; |
|
580
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts}{$_} //= $TDF_OPTS{$_} foreach (keys %TDF_OPTS); ##-- tdf: default options |
|
581
|
0
|
|
|
|
|
|
$coldb->{tdf} = DiaColloDB::Relation::TDF->create($coldb, undef, base=>"$dbdir/tdf", dbreak=>$dbreak); |
|
582
|
|
|
|
|
|
|
} else { |
|
583
|
0
|
|
|
|
|
|
$coldb->info("NOT creating (term x document) index, 'tdf' profiling relation disabled"); |
|
584
|
|
|
|
|
|
|
} |
|
585
|
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
##-- create ddc client relation (no-op if ddcServer option is not set) |
|
587
|
0
|
0
|
|
|
|
|
if ($coldb->{ddcServer}) { |
|
588
|
0
|
|
|
|
|
|
$coldb->info("creating ddc client configuration $dbdir/ddc.hdr [ddcServer=$coldb->{ddcServer}]"); |
|
589
|
0
|
|
|
|
|
|
$coldb->{ddc} = DiaColloDB::Relation::DDC->create($coldb); |
|
590
|
|
|
|
|
|
|
} else { |
|
591
|
0
|
|
|
|
|
|
$coldb->info("ddcServer option unset, NOT creating ddc client configuration"); |
|
592
|
|
|
|
|
|
|
} |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
##-- save header |
|
595
|
0
|
0
|
|
|
|
|
$coldb->saveHeader() |
|
596
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to save header: $!"); |
|
597
|
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
##-- all done |
|
599
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "create(): DB $dbdir created."); |
|
600
|
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
##-- cleanup |
|
602
|
0
|
0
|
0
|
|
|
|
!$docmeta |
|
|
|
|
0
|
|
|
|
|
|
603
|
|
|
|
|
|
|
or !tied(@$docmeta) |
|
604
|
|
|
|
|
|
|
or untie(@$docmeta) |
|
605
|
|
|
|
|
|
|
or $coldb->logwarn("create(): could untie temporary doc-data array $dbdir/docmeta.*: $!"); |
|
606
|
0
|
|
|
|
|
|
delete $coldb->{docmeta}; |
|
607
|
|
|
|
|
|
|
|
|
608
|
0
|
0
|
0
|
|
|
|
!$docoff |
|
|
|
|
0
|
|
|
|
|
|
609
|
|
|
|
|
|
|
or !tied(@$docoff) |
|
610
|
|
|
|
|
|
|
or untie(@$docoff) |
|
611
|
|
|
|
|
|
|
or $coldb->logwarn("create(): could untie temporary doc-offset array $dbdir/docoff.*: $!"); |
|
612
|
0
|
|
|
|
|
|
delete $coldb->{docoff}; |
|
613
|
|
|
|
|
|
|
|
|
614
|
0
|
0
|
|
|
|
|
if (!$coldb->{keeptmp}) { |
|
615
|
0
|
|
|
|
|
|
foreach ($vtokfile,$tokfile,$atokfile) { |
|
616
|
0
|
0
|
|
|
|
|
CORE::unlink($_) |
|
617
|
|
|
|
|
|
|
or $coldb->logwarne("creat(): could not remove temporary file '$_': $!"); |
|
618
|
|
|
|
|
|
|
} |
|
619
|
|
|
|
|
|
|
} |
|
620
|
|
|
|
|
|
|
|
|
621
|
0
|
|
|
|
|
|
return $coldb; |
|
622
|
|
|
|
|
|
|
} |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
625
|
|
|
|
|
|
|
## create: union (aka merge) |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
## $coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts) |
|
628
|
|
|
|
|
|
|
## + populates $coldb as union over @coldbs_or_dbdirs |
|
629
|
|
|
|
|
|
|
## + clobbers argument dbs {_union_${a}i2u}, {_union_xi2u}, {_union_argi} |
|
630
|
1
|
|
|
1
|
|
6930
|
BEGIN { *merge = \&union; } |
|
631
|
|
|
|
|
|
|
sub union { |
|
632
|
0
|
|
|
0
|
0
|
|
my ($coldb,$args,%opts) = @_; |
|
633
|
0
|
0
|
|
|
|
|
$coldb = $coldb->new() if (!ref($coldb)); |
|
634
|
0
|
|
|
|
|
|
@$coldb{keys %opts} = values %opts; |
|
635
|
0
|
0
|
|
|
|
|
my @dbargs = map {ref($_) ? $_ : $coldb->new(dbdir=>$_)} @$args; |
|
|
0
|
|
|
|
|
|
|
|
636
|
0
|
|
|
|
|
|
my $flags = O_RDWR|O_CREAT|O_TRUNC; |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
##-- sanity check(s): version |
|
639
|
0
|
|
|
|
|
|
my $min_db_version = '0.10.000'; |
|
640
|
0
|
|
|
|
|
|
foreach (@dbargs) { |
|
641
|
0
|
|
0
|
|
|
|
my $dbversion = $_->{version} // '0'; |
|
642
|
0
|
0
|
|
|
|
|
$coldb->logconfess("union(): can't handle v$dbversion index in '$_->{dbdir}'; try running \`dcdb-upgrade.perl $_->{dbdir}'") |
|
643
|
|
|
|
|
|
|
if (version->parse($dbversion) < $min_db_version); |
|
644
|
|
|
|
|
|
|
} |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
##-- initialize: output directory |
|
647
|
|
|
|
|
|
|
my $dbdir = $coldb->{dbdir} |
|
648
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("union() called but 'dbdir' key not set!"); |
|
649
|
0
|
|
|
|
|
|
$dbdir =~ s{/$}{}; |
|
650
|
0
|
|
0
|
|
|
|
$coldb->vlog('info', "union($dbdir) v$coldb->{version}: ", join(' ', map {$_->{dbdir}//''} @dbargs)); |
|
|
0
|
|
|
|
|
|
|
|
651
|
0
|
0
|
0
|
|
|
|
!-d $dbdir |
|
652
|
|
|
|
|
|
|
or remove_tree($dbdir) |
|
653
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not remove stale $dbdir: $!"); |
|
654
|
0
|
0
|
|
|
|
|
make_path($dbdir) |
|
655
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not create DB directory $dbdir: $!"); |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
##-- attributes |
|
658
|
0
|
|
|
|
|
|
my $attrs = [map {$coldb->attrName($_)} @{$coldb->attrs(undef,[])}]; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
659
|
0
|
|
|
|
|
|
my ($db,$dba); |
|
660
|
0
|
0
|
|
|
|
|
if (!@$attrs) { |
|
661
|
|
|
|
|
|
|
##-- use intersection of @dbargs attrs |
|
662
|
0
|
|
|
|
|
|
my @dbakeys = map {$db=$_; scalar {map {($_=>undef)} @{$db->attrs}}} @dbargs; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
663
|
0
|
|
|
|
|
|
my %akeys = qw(); |
|
664
|
0
|
|
|
|
|
|
foreach $dba (map {@{$_->attrs}} @dbargs) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
665
|
0
|
0
|
0
|
|
|
|
next if (exists($akeys{$dba}) || grep {!exists($_->{$dba})} @dbakeys); |
|
|
0
|
|
|
|
|
|
|
|
666
|
0
|
|
|
|
|
|
$akeys{$dba}=undef; |
|
667
|
0
|
|
|
|
|
|
push(@$attrs, $dba); |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
} |
|
670
|
0
|
|
|
|
|
|
$coldb->{attrs} = $attrs; |
|
671
|
0
|
0
|
|
|
|
|
$coldb->logconfess("union(): no attributes defined and intersection over db attributes is empty!") if (!@$attrs); |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
##-- pack-formats |
|
674
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
|
675
|
0
|
|
|
|
|
|
my $pack_date = $coldb->{pack_date}; |
|
676
|
0
|
|
|
|
|
|
my $pack_f = $coldb->{pack_f}; |
|
677
|
0
|
|
|
|
|
|
my $pack_off = $coldb->{pack_off}; |
|
678
|
0
|
|
|
|
|
|
my $pack_len = $coldb->{pack_len}; |
|
679
|
0
|
|
|
|
|
|
my $pack_t = $coldb->{pack_t} = $pack_id."[".scalar(@$attrs)."]"; ##-- pack("${pack_id}*${pack_date}", @ais) |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
##-- tuple packing |
|
682
|
0
|
|
|
|
|
|
$coldb->{"pack_t$attrs->[$_]"} = '@'.($_*packsize($pack_id)).$pack_id foreach (0..$#$attrs); |
|
683
|
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
##-- common variables: enums |
|
685
|
0
|
|
|
|
|
|
my %efopts = (flags=>$flags, pack_i=>$coldb->{pack_id}, pack_o=>$coldb->{pack_off}, pack_l=>$coldb->{pack_len}); |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
##-- union: attribute enums; also sets $db->{"_union_${a}i2u"} for each attribute $attr |
|
688
|
|
|
|
|
|
|
## + $db->{"${a}i2u"} is a PackedFile temporary in $dbdir/"${a}_i2u.tmp${argi}" |
|
689
|
0
|
|
|
|
|
|
my ($ac,$attr,$aenum,$as2i,$argi); |
|
690
|
0
|
|
|
|
|
|
my $adata = $coldb->attrData($attrs); |
|
691
|
0
|
|
|
|
|
|
foreach $ac (@$adata) { |
|
692
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating attribute enum $dbdir/$ac->{a}_enum.*"); |
|
693
|
0
|
|
|
|
|
|
$attr = $ac->{a}; |
|
694
|
0
|
|
|
|
|
|
$aenum = $coldb->{"${attr}enum"} = $ac->{enum} = $coldb->mmclass($ECLASS)->new(%efopts); |
|
695
|
0
|
|
|
|
|
|
$as2i = $aenum->{s2i}; |
|
696
|
0
|
|
|
|
|
|
foreach $argi (0..$#dbargs) { |
|
697
|
|
|
|
|
|
|
##-- enum union: guts |
|
698
|
0
|
|
|
|
|
|
$db = $dbargs[$argi]; |
|
699
|
0
|
|
|
|
|
|
my $dbenum = $db->{"${attr}enum"}; |
|
700
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): processing $dbenum->{base}.*"); |
|
701
|
0
|
|
|
|
|
|
$aenum->addEnum($dbenum); |
|
702
|
0
|
|
|
|
|
|
$db->{"_union_argi"} = $argi; |
|
703
|
|
|
|
|
|
|
$db->{"_union_${attr}i2u"} = (DiaColloDB::PackedFile |
|
704
|
|
|
|
|
|
|
->new(file=>"$dbdir/${attr}_i2u.tmp${argi}", flags=>'rw', packas=>$coldb->{pack_id}) |
|
705
|
0
|
0
|
|
|
|
|
->fromArray( [@$as2i{$dbenum ? @{$dbenum->toArray} : ''}] )) |
|
|
0
|
0
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
or $coldb->logconfess("union(): failed to create temporary $dbdir/${attr}_i2u.tmp${argi}"); |
|
707
|
0
|
|
|
|
|
|
$db->{"_union_${attr}i2u"}->flush(); |
|
708
|
|
|
|
|
|
|
} |
|
709
|
0
|
0
|
|
|
|
|
$aenum->save("$dbdir/${attr}_enum") |
|
710
|
|
|
|
|
|
|
or $coldb->logconfess("union(): failed to save attribute enum $dbdir/${attr}_enum: $!"); |
|
711
|
|
|
|
|
|
|
} |
|
712
|
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
##-- union: date-range |
|
714
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): computing date-range"); |
|
715
|
0
|
|
|
|
|
|
@$coldb{qw(xdmin xdmax)} = (undef,undef); |
|
716
|
0
|
|
|
|
|
|
foreach $db (@dbargs) { |
|
717
|
0
|
0
|
0
|
|
|
|
$coldb->{xdmin} = $db->{xdmin} if (!defined($coldb->{xdmin}) || $db->{xdmin} < $coldb->{xdmin}); |
|
718
|
0
|
0
|
0
|
|
|
|
$coldb->{xdmax} = $db->{xdmax} if (!defined($coldb->{xdmax}) || $db->{xdmax} > $coldb->{xdmax}); |
|
719
|
|
|
|
|
|
|
} |
|
720
|
0
|
|
0
|
|
|
|
$coldb->{xdmin} //= 0; |
|
721
|
0
|
|
0
|
|
|
|
$coldb->{xdmax} //= 0; |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
##-- union: tenum |
|
724
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating tuple-enum $dbdir/tenum.*"); |
|
725
|
0
|
|
|
|
|
|
my $tenum = $coldb->{tenum} = $coldb->mmclass($XECLASS)->new(%efopts, pack_s=>$pack_t); |
|
726
|
0
|
|
|
|
|
|
my $ts2i = $tenum->{s2i}; |
|
727
|
0
|
|
|
|
|
|
my $nt = 0; |
|
728
|
0
|
|
|
|
|
|
foreach $db (@dbargs) { |
|
729
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): processing $db->{tenum}{base}.*"); |
|
730
|
0
|
|
|
|
|
|
my $db_pack_t = $db->{pack_t}; |
|
731
|
0
|
|
|
|
|
|
my $dbattrs = $db->{attrs}; |
|
732
|
0
|
|
|
|
|
|
my %a2dbti = map { ($dbattrs->[$_]=>$_) } (0..$#$dbattrs); |
|
|
0
|
|
|
|
|
|
|
|
733
|
0
|
|
|
|
|
|
my %a2i2u = map { ($_=>$db->{"_union_${_}i2u"}) } @$attrs; |
|
|
0
|
|
|
|
|
|
|
|
734
|
0
|
|
|
|
|
|
$argi = $db->{_union_argi}; |
|
735
|
0
|
|
|
|
|
|
my $ti2u = $db->{_union_ti2u} = DiaColloDB::PackedFile->new(file=>"$dbdir/t_i2u.tmp${argi}", flags=>'rw', packas=>$coldb->{pack_id}); |
|
736
|
0
|
|
|
|
|
|
my $dbti = 0; |
|
737
|
0
|
|
|
|
|
|
my (@dbt,@ut,$uts,$uti); |
|
738
|
0
|
|
|
|
|
|
foreach (@{$db->{tenum}->toArray}) { |
|
|
0
|
|
|
|
|
|
|
|
739
|
0
|
|
|
|
|
|
@dbt = unpack($db_pack_t,$_); |
|
740
|
|
|
|
|
|
|
$uts = pack($pack_t, |
|
741
|
|
|
|
|
|
|
(map { |
|
742
|
0
|
|
0
|
|
|
|
(exists($a2dbti{$_}) |
|
743
|
|
|
|
|
|
|
? $a2i2u{$_}->fetch($dbt[$a2dbti{$_}]//0)//0 |
|
744
|
0
|
0
|
0
|
|
|
|
: $a2i2u{$_}->fetch(0)//0) |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
745
|
|
|
|
|
|
|
} @$attrs), |
|
746
|
|
|
|
|
|
|
$dbt[$#dbt]//0); |
|
747
|
0
|
0
|
|
|
|
|
$uti = $ts2i->{$uts} = $nt++ if (!defined($uti=$ts2i->{$uts})); |
|
748
|
0
|
|
|
|
|
|
$ti2u->store($dbti++, $uti); |
|
749
|
|
|
|
|
|
|
} |
|
750
|
|
|
|
|
|
|
$ti2u->flush() |
|
751
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("could not flush temporary $dbdir/t_i2u.tmp${argi}"); |
|
752
|
|
|
|
|
|
|
} |
|
753
|
0
|
|
|
|
|
|
$tenum->fromHash($ts2i); |
|
754
|
0
|
0
|
|
|
|
|
$tenum->save("$dbdir/tenum") |
|
755
|
|
|
|
|
|
|
or $coldb->logconfess("union(): failed to save $dbdir/tenum.*: $!"); |
|
756
|
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
##-- union: expansion maps |
|
758
|
0
|
|
|
|
|
|
foreach (@$adata) { |
|
759
|
0
|
|
|
|
|
|
$coldb->create_multimap("$dbdir/$_->{a}_2t",$ts2i,$_->{pack_t},"attribute expansion multimap"); |
|
760
|
|
|
|
|
|
|
} |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
##-- intermediate cleanup: ts2i |
|
763
|
0
|
|
|
|
|
|
undef $ts2i; |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
##-- unigrams: populate |
|
766
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_xf}//1) { |
|
767
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating tuple unigram index $dbdir/xf.*"); |
|
768
|
|
|
|
|
|
|
$coldb->{xf} = DiaColloDB::Relation::Unigrams->new(base=>"$dbdir/xf", flags=>$flags, mmap=>$coldb->{mmap}, |
|
769
|
|
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date, |
|
770
|
|
|
|
|
|
|
keeptmp => $coldb->{keeptmp}, |
|
771
|
|
|
|
|
|
|
) |
|
772
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("union(): could not create $dbdir/xf.*: $!"); |
|
773
|
0
|
0
|
|
|
|
|
$coldb->{xf}->union($coldb, [map {[@$_{qw(xf _union_ti2u)}]} @dbargs]) |
|
|
0
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not populate unigram index $dbdir/xf.*: $!"); |
|
775
|
|
|
|
|
|
|
} else { |
|
776
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): NOT creating unigram index $dbdir/xf.*; set index_xf=1 to enable"); |
|
777
|
|
|
|
|
|
|
} |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
##-- co-frequencies: populate |
|
780
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_cof}//1) { |
|
781
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating co-frequency index $dbdir/cof.* [fmin=$coldb->{cfmin}]"); |
|
782
|
|
|
|
|
|
|
$coldb->{cof} = DiaColloDB::Relation::Cofreqs->new(base=>"$dbdir/cof", flags=>$flags, mmap=>$coldb->{mmap}, |
|
783
|
|
|
|
|
|
|
pack_i=>$pack_id, pack_f=>$pack_f, pack_d=>$pack_date, |
|
784
|
|
|
|
|
|
|
dmax=>$coldb->{dmax}, fmin=>$coldb->{cfmin}, |
|
785
|
|
|
|
|
|
|
keeptmp=>$coldb->{keeptmp}, |
|
786
|
|
|
|
|
|
|
) |
|
787
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("create(): failed to open co-frequency index $dbdir/cof.*: $!"); |
|
788
|
0
|
0
|
|
|
|
|
$coldb->{cof}->union($coldb, [map {[@$_{qw(cof _union_ti2u)}]} @dbargs]) |
|
|
0
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
or $coldb->logconfess("union(): could not populate co-frequency index $dbdir/cof.*: $!"); |
|
790
|
|
|
|
|
|
|
} else { |
|
791
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): NOT creating co-frequency index $dbdir/cof.*; set index_cof=1 to enable"); |
|
792
|
|
|
|
|
|
|
} |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
##-- tdf: populate |
|
795
|
0
|
|
|
|
|
|
my $db_tdf = !grep {!$_->{index_tdf}} @dbargs; |
|
|
0
|
|
|
|
|
|
|
|
796
|
0
|
|
0
|
|
|
|
$coldb->{index_tdf} //= $db_tdf; |
|
797
|
0
|
0
|
0
|
|
|
|
if ($coldb->{index_tdf} && $db_tdf) { |
|
798
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): creating (term x document) index $dbdir/tdf.*"); |
|
799
|
|
|
|
|
|
|
## |
|
800
|
0
|
|
|
|
|
|
my $tdfopts0 = $dbargs[0]{tdfopts}; |
|
801
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= {}; |
|
802
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts} //= $tdfopts0->{$_} foreach (keys %$tdfopts0); ##-- tdf: inherit options |
|
803
|
0
|
|
0
|
|
|
|
$coldb->{tdfopts}{$_} //= $TDF_OPTS{$_} foreach (keys %TDF_OPTS); ##-- tdf: default options |
|
804
|
|
|
|
|
|
|
## |
|
805
|
0
|
|
0
|
|
|
|
my $dbreak = ($coldb->{dbreak} // $dbargs[0]{dbreak} // '#file'); |
|
|
|
|
0
|
|
|
|
|
|
806
|
0
|
0
|
|
|
|
|
$dbreak = "#$dbreak" if ($dbreak !~ /^#/); |
|
807
|
0
|
|
|
|
|
|
$coldb->{dbreak} = $dbreak; |
|
808
|
|
|
|
|
|
|
## |
|
809
|
|
|
|
|
|
|
$coldb->{tdf} = DiaColloDB::Relation::TDF->union($coldb, \@dbargs, |
|
810
|
|
|
|
|
|
|
base => "$dbdir/tdf", |
|
811
|
|
|
|
|
|
|
flags => $flags, |
|
812
|
|
|
|
|
|
|
keeptmp => $coldb->{keeptmp}, |
|
813
|
0
|
0
|
|
|
|
|
%{$coldb->{tdfopts}}, |
|
|
0
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
) |
|
815
|
|
|
|
|
|
|
or $coldb->logconfess("create(): failed to populate (term x document) index $dbdir/tdf.*: $!"); |
|
816
|
|
|
|
|
|
|
} else { |
|
817
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): NOT creating (term x document) index $dbdir/tdf.*; set index_tdf=1 on all argument DBs to enable"); |
|
818
|
|
|
|
|
|
|
} |
|
819
|
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
##-- cleanup |
|
821
|
0
|
0
|
|
|
|
|
if (!$coldb->{keeptmp}) { |
|
822
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): cleaning up temporary files"); |
|
823
|
0
|
|
|
|
|
|
foreach $db (@dbargs) { |
|
824
|
0
|
|
|
|
|
|
foreach my $pfkey ('_union_ti2u', map {"_union_${_}i2u"} @$attrs) { |
|
|
0
|
|
|
|
|
|
|
|
825
|
0
|
0
|
|
|
|
|
$db->{$pfkey}->unlink() if ($db->{$pfkey}->can('unlink')); |
|
826
|
0
|
|
|
|
|
|
delete $db->{$pfkey}; |
|
827
|
|
|
|
|
|
|
} |
|
828
|
0
|
|
|
|
|
|
delete $db->{_union_argi}; |
|
829
|
|
|
|
|
|
|
} |
|
830
|
|
|
|
|
|
|
} |
|
831
|
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
##-- save header |
|
833
|
|
|
|
|
|
|
$coldb->saveHeader() |
|
834
|
0
|
0
|
|
|
|
|
or $coldb->logconfess("union(): failed to save header: $!"); |
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
##-- all done |
|
837
|
0
|
|
|
|
|
|
$coldb->vlog($coldb->{logCreate}, "union(): union DB $dbdir created."); |
|
838
|
|
|
|
|
|
|
|
|
839
|
0
|
|
|
|
|
|
return $coldb; |
|
840
|
|
|
|
|
|
|
} |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
1; ##-- be happy |