| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Corpus::Compiled.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, source corpus (pre-compiled) |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Corpus::Compiled; |
|
7
|
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
9
|
use DiaColloDB::threads; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
14
|
|
|
9
|
1
|
|
|
1
|
|
546
|
use DiaColloDB::threads::shared; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
4
|
|
|
10
|
|
|
|
|
|
|
our ($HAVE_THREADS); |
|
11
|
|
|
|
|
|
|
BEGIN { |
|
12
|
1
|
50
|
|
1
|
|
155
|
$HAVE_THREADS = $DiaColloDB::threads::shared::MODULE ? 1 : 0; |
|
13
|
|
|
|
|
|
|
} |
|
14
|
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Corpus; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
21
|
|
|
16
|
1
|
|
|
1
|
|
471
|
use DiaColloDB::Corpus::Filters; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
25
|
|
|
17
|
1
|
|
|
1
|
|
106
|
use DiaColloDB::Logger; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
27
|
|
|
18
|
1
|
|
|
1
|
|
72
|
use DiaColloDB::Utils qw(:fcntl :jobs); |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
69
|
|
|
19
|
1
|
|
|
1
|
|
354
|
use File::Basename qw(basename dirname); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
28
|
|
|
20
|
1
|
|
|
1
|
|
128
|
use File::Path qw(make_path remove_tree); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
24
|
|
|
21
|
1
|
|
|
1
|
|
101
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
472
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
##============================================================================== |
|
24
|
|
|
|
|
|
|
## Globals & Constants |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Persistent DiaColloDB::Corpus); |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
##============================================================================== |
|
29
|
|
|
|
|
|
|
## Constructors etc. |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
## $corpus = CLASS_OR_OBJECT->new(%args) |
|
32
|
|
|
|
|
|
|
## + %args, object structure: |
|
33
|
|
|
|
|
|
|
## ( |
|
34
|
|
|
|
|
|
|
## ##-- NEW in DiaColloDB::Corpus::Compiled |
|
35
|
|
|
|
|
|
|
## dbdir => $dbdir, ##-- data directory for compiled corpus |
|
36
|
|
|
|
|
|
|
## flags => $flags, ##-- open mode flags (fcntl flags or perl-style; default='r') |
|
37
|
|
|
|
|
|
|
## filters => \%filters, ##-- corpus filters ( DiaColloDB::Corpus::Filters object or HASH-ref ) |
|
38
|
|
|
|
|
|
|
## njobs => $njobs, ##-- number of parallel worker jobs for create(); default=-1 (= nCores) |
|
39
|
|
|
|
|
|
|
## temp => $bool, ##-- implicitly unlink() on exit? |
|
40
|
|
|
|
|
|
|
## logThreads => $level ##-- log-level for thread stuff (default='off') |
|
41
|
|
|
|
|
|
|
## ## |
|
42
|
|
|
|
|
|
|
## ##-- INHERITED from DiaColloDB::Corpus |
|
43
|
|
|
|
|
|
|
## #files => \@files, ##-- source files (OVERRIDE: unused) |
|
44
|
|
|
|
|
|
|
## #dclass => $dclass, ##-- DiaColloDB::Document subclass for loading (OVERRIDE force 'DiaColloDB::Document::JSON') |
|
45
|
|
|
|
|
|
|
## dopts => \%opts, ##-- options for $dclass->fromFile() (override default={}) |
|
46
|
|
|
|
|
|
|
## cur => $i, ##-- index of current file |
|
47
|
|
|
|
|
|
|
## logOpen => $level, ##-- log-level for open(); default='info' |
|
48
|
|
|
|
|
|
|
## ) |
|
49
|
|
|
|
|
|
|
sub new { |
|
50
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
51
|
0
|
|
|
|
|
|
my $corpus = $that->SUPER::new( |
|
52
|
|
|
|
|
|
|
##-- new |
|
53
|
|
|
|
|
|
|
dbdir => undef, |
|
54
|
|
|
|
|
|
|
flags => 'r', |
|
55
|
|
|
|
|
|
|
#filters => DiaColloDB::Corpus::Filters->new(), |
|
56
|
|
|
|
|
|
|
#temp => 0, |
|
57
|
|
|
|
|
|
|
#opened => 0, |
|
58
|
|
|
|
|
|
|
njobs => -1, |
|
59
|
|
|
|
|
|
|
logThreads => 'off', |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
@_, ##-- user arguments |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
##-- strong overrides |
|
64
|
|
|
|
|
|
|
dclass => 'DiaColloDB::Document::JSON', |
|
65
|
|
|
|
|
|
|
); |
|
66
|
0
|
0
|
|
|
|
|
$corpus->{filters} = DiaColloDB::Corpus::Filters->new() if (!exists($corpus->{filters})); |
|
67
|
0
|
0
|
|
|
|
|
return $corpus->open() if (defined($corpus->{dbdir})); |
|
68
|
0
|
|
|
|
|
|
return $corpus; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub DESTROY { |
|
72
|
0
|
|
|
0
|
|
|
my $obj = $_[0]; |
|
73
|
0
|
0
|
|
|
|
|
$obj->unlink() if ($obj->{temp}); |
|
74
|
0
|
0
|
|
|
|
|
$obj->close() if ($obj->opened); |
|
75
|
|
|
|
|
|
|
} |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
##============================================================================== |
|
78
|
|
|
|
|
|
|
## Persistent API |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
## @keys = $obj->headerKeys() |
|
81
|
|
|
|
|
|
|
## + keys to save as header; default implementation returns all keys of all non-references |
|
82
|
|
|
|
|
|
|
sub headerKeys { |
|
83
|
0
|
|
|
0
|
1
|
|
return (grep {$_ !~ m{^log|^(?:cur|dbdir|njobs|opened|flags|files|list|glob|compiled|append|temp)$}} keys %{$_[0]}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
## @files = $obj->diskFiles() |
|
87
|
|
|
|
|
|
|
## + returns disk storage files, used by du() and timestamp() |
|
88
|
|
|
|
|
|
|
## + default implementation returns $obj->{file} or glob("$obj->{base}*") |
|
89
|
|
|
|
|
|
|
sub diskFiles { |
|
90
|
0
|
|
|
0
|
1
|
|
my $obj = shift; |
|
91
|
0
|
0
|
|
|
|
|
return ($obj->{dbdir}) if ($obj->{dbdir}); |
|
92
|
0
|
|
|
|
|
|
return qw(); |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
## $bool = $obj->unlink(%opts) |
|
96
|
|
|
|
|
|
|
## + override %opts: |
|
97
|
|
|
|
|
|
|
## close => $bool, ##-- implicitly call $obj->close() ? (default=1) |
|
98
|
|
|
|
|
|
|
## + unlinks disk files |
|
99
|
|
|
|
|
|
|
## + implcitly calls $obj->close() if available |
|
100
|
|
|
|
|
|
|
sub unlink { |
|
101
|
0
|
|
|
0
|
1
|
|
my ($obj,%opts) = @_; |
|
102
|
0
|
|
|
|
|
|
my $dbdir = $obj->datadir; |
|
103
|
|
|
|
|
|
|
#$obj->vlog($obj->{logOpen}, "unlink(", $obj->dbdir, ")") if ($obj->opened); |
|
104
|
0
|
0
|
0
|
|
|
|
$obj->close() if (!exists($opts{close}) || $opts{close}); |
|
105
|
0
|
0
|
|
|
|
|
return (-e $dbdir ? File::Path::remove_tree($dbdir) : 1); |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
109
|
|
|
|
|
|
|
## Compiled API: disk files etc. |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
## $dirname = $corpus->datadir() |
|
112
|
|
|
|
|
|
|
## $dirname = $corpus->datadir($dir) |
|
113
|
1
|
|
|
1
|
|
2700
|
BEGIN { *dbdir = \&datadir; } |
|
114
|
|
|
|
|
|
|
sub datadir { |
|
115
|
0
|
|
0
|
0
|
1
|
|
my $dir = $_[1] // $_[0]{dbdir}; |
|
116
|
0
|
0
|
|
|
|
|
$dir =~ s{/$}{} if ($dir); |
|
117
|
0
|
|
|
|
|
|
return $dir; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $bool = $corpus->truncate() |
|
121
|
|
|
|
|
|
|
## + removes all disk data (including header) and resets $corpus->{size}=0 |
|
122
|
|
|
|
|
|
|
sub truncate { |
|
123
|
0
|
|
|
0
|
1
|
|
my $corpus = shift; |
|
124
|
0
|
0
|
|
|
|
|
return undef if (!$corpus->unlink(close=>0)); |
|
125
|
0
|
|
|
|
|
|
$corpus->{size} = 0; |
|
126
|
0
|
|
|
|
|
|
return $corpus; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
## $filters = $ccorpus->filters() |
|
130
|
|
|
|
|
|
|
## + return corpus filters as a DiaColloDB::Corpus::Filters object |
|
131
|
|
|
|
|
|
|
sub filters { |
|
132
|
0
|
0
|
|
0
|
1
|
|
return $_[0]{filters} if (UNIVERSAL::isa($_[0]{filters},'DiaColloDB::Corpus::Filters')); |
|
133
|
0
|
0
|
|
|
|
|
return DiaColloDB::Corpus::Filters->null() if (!defined($_[0]{filters})); |
|
134
|
0
|
|
|
|
|
|
return DiaColloDB::Corpus::Filters->new( %{$_[0]{filters}} ); |
|
|
0
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
##============================================================================== |
|
138
|
|
|
|
|
|
|
## Corpus API: open/close |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
## $bool = $corpus->open([$dbdir], %opts); ##-- compat |
|
141
|
|
|
|
|
|
|
## $bool = $corpus->open($dbdir, %opts); ##-- new |
|
142
|
|
|
|
|
|
|
## + opens corpus "$base.*" |
|
143
|
|
|
|
|
|
|
## + \@ARGV should be a single-element $dbdir, or (dbdir=>$dbdir) must exist or be specified in %opts |
|
144
|
|
|
|
|
|
|
## + DiaColloDB::Corpus %opts: |
|
145
|
|
|
|
|
|
|
## compiled => $bool, ##-- implicit |
|
146
|
|
|
|
|
|
|
## glob => $bool, ##-- (ignored) whether to glob arguments |
|
147
|
|
|
|
|
|
|
## list => $bool, ##-- (ignored) whether arguments are file-lists |
|
148
|
|
|
|
|
|
|
sub open { |
|
149
|
0
|
|
|
0
|
1
|
|
my ($corpus,$argv,%opts) = @_; |
|
150
|
0
|
|
|
|
|
|
delete @opts{qw(compiled glob list)}; |
|
151
|
0
|
0
|
|
|
|
|
$corpus = $corpus->new() if (!ref($corpus)); |
|
152
|
0
|
0
|
|
|
|
|
$corpus->close() if ($corpus->opened); |
|
153
|
0
|
|
|
|
|
|
@$corpus{keys %opts} = values(%opts); |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
##-- sanity check(s): dbdir |
|
156
|
0
|
|
|
|
|
|
my $dbdir = $corpus->dbdir; |
|
157
|
0
|
0
|
|
|
|
|
if (UNIVERSAL::isa($argv,'ARRAY')) { |
|
|
|
0
|
|
|
|
|
|
|
158
|
0
|
0
|
|
|
|
|
if (@$argv==1) { |
|
159
|
0
|
|
|
|
|
|
$dbdir = $argv->[0]; ##-- single-element list |
|
160
|
|
|
|
|
|
|
} else { |
|
161
|
0
|
|
|
|
|
|
$corpus->logconfess("open(): can't handle multi-element array"); |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
} elsif (defined($argv)) { |
|
164
|
0
|
|
|
|
|
|
$dbdir = $argv; ##-- simple scalar |
|
165
|
|
|
|
|
|
|
} |
|
166
|
0
|
0
|
|
|
|
|
$corpus->{dbdir} = $corpus->dbdir($dbdir) |
|
167
|
|
|
|
|
|
|
or $corpus->logconfess("open(): no {dbdir} specified!"); |
|
168
|
|
|
|
|
|
|
|
|
169
|
0
|
0
|
|
|
|
|
my $flags = $corpus->{flags} = (fcflags($corpus->{flags}) | ($corpus->{append} ? fcflags('>>') : 0)); |
|
170
|
0
|
|
|
|
|
|
$corpus->vlog($corpus->{logOpen}, "open(", fcperl($flags), "$dbdir)"); |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
##-- flag-dependent dispatch |
|
173
|
0
|
0
|
0
|
|
|
|
if (fcwrite($flags) && fctrunc($flags)) { |
|
174
|
|
|
|
|
|
|
##-- truncate: remove any stale corpus |
|
175
|
0
|
0
|
|
|
|
|
$corpus->truncate() |
|
176
|
|
|
|
|
|
|
or $corpus->logconfess("open(): failed to truncate stale corpus $corpus->{dbdir}/: $!"); |
|
177
|
|
|
|
|
|
|
} |
|
178
|
0
|
0
|
0
|
|
|
|
if (fcwrite($flags) && fccreat($flags)) { |
|
179
|
|
|
|
|
|
|
##-- create: data-directory |
|
180
|
0
|
|
|
|
|
|
my $datadir = $corpus->datadir; |
|
181
|
0
|
0
|
0
|
|
|
|
-d $datadir |
|
182
|
|
|
|
|
|
|
or make_path($datadir) |
|
183
|
|
|
|
|
|
|
or $corpus->logconfess("open(): could not create data directory '$datadir': $!"); |
|
184
|
|
|
|
|
|
|
} |
|
185
|
0
|
0
|
0
|
|
|
|
if (fcread($flags) && !fctrunc($flags)) { |
|
186
|
|
|
|
|
|
|
##-- read-only, no create |
|
187
|
0
|
0
|
|
|
|
|
$corpus->loadHeaderFile |
|
188
|
|
|
|
|
|
|
or $corpus->logconfess("open(): failed to load header-file ", $corpus->headerFile); |
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
##-- force options: dclass, files, opened |
|
192
|
0
|
|
|
|
|
|
$corpus->{opened} = 1; |
|
193
|
0
|
|
|
|
|
|
$corpus->{dclass} = 'DiaColloDB::Document::JSON'; |
|
194
|
0
|
|
|
|
|
|
delete $corpus->{files}; |
|
195
|
|
|
|
|
|
|
|
|
196
|
0
|
|
|
|
|
|
return $corpus; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
## $bool = $corpus->close() |
|
200
|
|
|
|
|
|
|
sub close { |
|
201
|
0
|
|
|
0
|
1
|
|
my $corpus = shift; |
|
202
|
0
|
0
|
|
|
|
|
$corpus->vlog($corpus->{logOpen}, "close(", $corpus->dbdir, ")") if ($corpus->opened); |
|
203
|
0
|
0
|
0
|
|
|
|
my $rc = ($corpus->opened && fcwrite($corpus->{flags}) ? $corpus->flush : 1); |
|
204
|
0
|
|
0
|
|
|
|
$rc &&= $corpus->SUPER::close(); |
|
205
|
0
|
0
|
|
|
|
|
if ($rc) { |
|
206
|
0
|
|
|
|
|
|
$corpus->{opened} = 0; |
|
207
|
0
|
|
|
|
|
|
$corpus->{size} = 0; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
0
|
|
|
|
|
|
return $rc; |
|
210
|
|
|
|
|
|
|
} |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
213
|
|
|
|
|
|
|
## Compiled API: open/close |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
## $bool = $corpus->opened() |
|
216
|
|
|
|
|
|
|
## + Returns true iff $corpus is currently opened. |
|
217
|
|
|
|
|
|
|
sub opened { |
|
218
|
0
|
|
|
0
|
1
|
|
my $corpus = shift; |
|
219
|
0
|
|
0
|
|
|
|
return $corpus->{dbdir} && $corpus->{opened}; |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
## $bool = $corpus->flush() |
|
223
|
|
|
|
|
|
|
## + flush pending data (header) to disk |
|
224
|
|
|
|
|
|
|
sub flush { |
|
225
|
0
|
|
|
0
|
1
|
|
my $corpus = shift; |
|
226
|
0
|
0
|
0
|
|
|
|
return undef if (!$corpus->opened || !fcwrite($corpus->{flags})); |
|
227
|
0
|
0
|
|
|
|
|
$corpus->saveHeader() |
|
228
|
|
|
|
|
|
|
or $corpus->logconfess("flush(): failed to store header file ", $corpus->headerFile, ": $!"); |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
## $corpus = $corpus->reopen(%opts) |
|
232
|
|
|
|
|
|
|
## + close and re-open corpus (e.g. with different flags) |
|
233
|
|
|
|
|
|
|
sub reopen { |
|
234
|
0
|
|
|
0
|
1
|
|
my $corpus = shift; |
|
235
|
0
|
|
|
|
|
|
my $dbdir = $corpus->{dbdir}; |
|
236
|
0
|
0
|
|
|
|
|
return $corpus if (!$corpus->opened); |
|
237
|
0
|
|
0
|
|
|
|
return $corpus->close() && $corpus->open([$dbdir], @_); |
|
238
|
|
|
|
|
|
|
} |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
##============================================================================== |
|
241
|
|
|
|
|
|
|
## Corpus API: iteration |
|
242
|
|
|
|
|
|
|
## + mostly inherited from DiaColloDB::Corpus |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
## $nfiles = $corpus->size() |
|
245
|
|
|
|
|
|
|
sub size { |
|
246
|
0
|
|
0
|
0
|
1
|
|
return $_[0]{size} // 0; |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
## $bool = $corpus->iok() |
|
250
|
|
|
|
|
|
|
## + true if iterator is valid |
|
251
|
|
|
|
|
|
|
sub iok { |
|
252
|
0
|
|
0
|
0
|
1
|
|
return $_[0]{cur} < ($_[0]{size}//0); |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
## $label = $corpus->ifile() |
|
256
|
|
|
|
|
|
|
## $label = $corpus->ifile($pos) |
|
257
|
|
|
|
|
|
|
## + current iterator label |
|
258
|
|
|
|
|
|
|
sub ifile { |
|
259
|
0
|
|
0
|
0
|
1
|
|
my $pos = $_[1] // $_[0]{cur}; |
|
260
|
0
|
0
|
|
|
|
|
return undef if ($pos >= $_[0]{size}); |
|
261
|
0
|
|
|
|
|
|
return "$_[0]{dbdir}/$pos.json"; |
|
262
|
|
|
|
|
|
|
} |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
## $doc_or_undef = $corpus->idocument() |
|
265
|
|
|
|
|
|
|
## $doc_or_undef = $corpus->idocument($pos) |
|
266
|
|
|
|
|
|
|
## + gets current document |
|
267
|
|
|
|
|
|
|
sub idocument { |
|
268
|
0
|
|
|
0
|
1
|
|
my ($corpus,$pos) = @_; |
|
269
|
0
|
|
0
|
|
|
|
$pos //= $corpus->{cur}; |
|
270
|
0
|
0
|
|
|
|
|
return undef if ($pos >= $corpus->size); |
|
271
|
0
|
|
0
|
|
|
|
return $corpus->{dclass}->fromFile($corpus->ifile($pos), %{$corpus->{dopts}//{}}); |
|
|
0
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
##============================================================================== |
|
276
|
|
|
|
|
|
|
## Corpus::Compiled API: corpus compilation |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
## $ccorpus = CLASS_OR_OBJECT->create($src_corpus, %opts) |
|
279
|
|
|
|
|
|
|
## + compile or append a single $src_corpus to $opts{dbdir}, returns $ccorpus |
|
280
|
|
|
|
|
|
|
## + honors $opts{flags} for append and truncate |
|
281
|
|
|
|
|
|
|
sub create { |
|
282
|
0
|
|
|
0
|
1
|
|
my ($that,$icorpus,%opts) = @_; |
|
283
|
0
|
0
|
|
|
|
|
my $ocorpus = ref($that) ? $that : $that->new(); |
|
284
|
0
|
|
|
|
|
|
my $logas = 'create()'; |
|
285
|
0
|
|
|
|
|
|
$ocorpus->vlog('info',$logas); |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
##-- save options |
|
288
|
|
|
|
|
|
|
my $odir = $ocorpus->dbdir($opts{dbdir}) |
|
289
|
0
|
0
|
|
|
|
|
or $ocorpus->logconfess("$logas: no output corpus {dbdir} specified"); |
|
290
|
|
|
|
|
|
|
|
|
291
|
0
|
|
0
|
|
|
|
my $flags = (fcflags($ocorpus->{flags}) | fcflags($opts{flags})) || fcflags('w'); |
|
292
|
0
|
|
|
|
|
|
delete $opts{dbdir}; |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
##-- (re-)open output corpus |
|
295
|
0
|
0
|
0
|
|
|
|
if (!$ocorpus->opened || ($ocorpus->{dbdir} ne $odir)) { |
|
296
|
0
|
0
|
|
|
|
|
$ocorpus->open([$odir], %opts, flags=>$flags) |
|
297
|
|
|
|
|
|
|
or $ocorpus->logconfess("$logas: failed to (re-)open output corpus '$odir' in mode '", fcperl($flags)); |
|
298
|
|
|
|
|
|
|
} |
|
299
|
0
|
|
|
|
|
|
@$ocorpus{keys %opts} = values %opts; |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
##-- check whether we're doing any filtering at all |
|
302
|
0
|
|
|
|
|
|
my $filters = $ocorpus->filters(); |
|
303
|
0
|
|
|
|
|
|
my $dofilter = !$filters->isnull(); |
|
304
|
0
|
0
|
|
|
|
|
if ($dofilter) { |
|
305
|
0
|
|
|
|
|
|
$ocorpus->vlog('info', "$logas: corpus content filters enabled"); |
|
306
|
0
|
|
|
|
|
|
foreach (grep {defined($filters->{$_})} sort keys %$filters) { |
|
|
0
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
|
$ocorpus->vlog('info', " + filter $_ => $filters->{$_}"); |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
} else { |
|
310
|
0
|
|
|
|
|
|
$ocorpus->vlog('info', "$logas: corpus content filters disabled"); |
|
311
|
|
|
|
|
|
|
} |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
##-- common data |
|
314
|
0
|
|
|
|
|
|
my $nfiles = $icorpus->size(); |
|
315
|
0
|
|
0
|
|
|
|
my $logFileN = $ocorpus->{logFileN} || int($nfiles / 20) || 1; |
|
316
|
0
|
|
|
|
|
|
my @outkeys = keys %{DiaColloDB::Document->new}; |
|
|
0
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
|
318
|
0
|
|
|
|
|
|
my $osize = $ocorpus->size(); |
|
319
|
0
|
|
|
|
|
|
my $outdir = $ocorpus->datadir(); |
|
320
|
|
|
|
|
|
|
|
|
321
|
0
|
|
|
|
|
|
my $filei_shared = 0; |
|
322
|
0
|
|
|
|
|
|
share( $filei_shared ); |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
##-------------------------------------------- |
|
325
|
|
|
|
|
|
|
my $cb_worker = sub { |
|
326
|
0
|
|
0
|
0
|
|
|
my $thrid = shift || DiaColloDB::threads->tid(); |
|
327
|
0
|
|
|
|
|
|
$logas .= "#$thrid"; |
|
328
|
0
|
|
|
|
|
|
(*STDERR)->autoflush(1); |
|
329
|
0
|
|
|
|
|
|
$ocorpus->vlog($ocorpus->{logThreads}, "$logas: starting worker thread #$thrid"); |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
##-- initialize: disable auto-deletion |
|
332
|
0
|
|
|
|
|
|
$ocorpus->{temp} = 0; |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
##-- initialize filters (formerly in DiaColloDB.pm) |
|
335
|
0
|
0
|
|
|
|
|
my $cfilters = $dofilter ? $filters->compile() : {} |
|
|
|
0
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
or $ocorpus->logconfess("$logas: failed to compile corpus content filters: $!"); |
|
337
|
|
|
|
|
|
|
## |
|
338
|
|
|
|
|
|
|
##-- initialize: filters: variables |
|
339
|
0
|
|
|
|
|
|
my ($pgood, $pbad, $wgood, $wbad, $lgood, $lbad ) = @$cfilters{map {("${_}good","${_}bad")} qw(p w l)}; |
|
|
0
|
|
|
|
|
|
|
|
340
|
0
|
|
|
|
|
|
my ($pgoodh,$pbadh,$wgoodh,$wbadh,$lgoodh,$lbadh) = @$cfilters{map {("${_}goodfile","${_}badfile")} qw(p w l)}; |
|
|
0
|
|
|
|
|
|
|
|
341
|
0
|
|
|
|
|
|
my ($tok,$w,$p,$l); |
|
342
|
|
|
|
|
|
|
|
|
343
|
0
|
|
|
|
|
|
my ($filei); |
|
344
|
0
|
|
|
|
|
|
while (1) { |
|
345
|
|
|
|
|
|
|
{ |
|
346
|
0
|
|
|
|
|
|
lock($filei_shared); |
|
|
0
|
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
|
$filei = $filei_shared; |
|
348
|
0
|
|
|
|
|
|
++$filei_shared; |
|
349
|
|
|
|
|
|
|
} |
|
350
|
0
|
0
|
|
|
|
|
last if ($filei >= $nfiles); |
|
351
|
|
|
|
|
|
|
|
|
352
|
0
|
|
|
|
|
|
my $idoc = $icorpus->idocument($filei); |
|
353
|
0
|
|
|
|
|
|
my $infile = $icorpus->ifile($filei); |
|
354
|
0
|
|
|
|
|
|
my $outfile = "$outdir/".($filei+$osize).".json"; |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
#$ocorpus->vlog('info', sprintf("processing files [%3.0f%%]: %s -> %s", 100*($filei-1)/$nfiles, $infile, $outfile)) |
|
357
|
0
|
0
|
0
|
|
|
|
$ocorpus->vlog('info', sprintf("%s: processing files [%3.0f%%]: %s", $logas, 100*($filei-1)/$nfiles, $infile)) |
|
358
|
|
|
|
|
|
|
if ($logFileN && ($filei % $logFileN)==0); |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
##-- apply filters |
|
361
|
0
|
0
|
|
|
|
|
if ($dofilter) { |
|
362
|
0
|
|
|
|
|
|
my $ftokens = []; |
|
363
|
0
|
|
|
|
|
|
foreach $tok (@{$idoc->{tokens}}) { |
|
|
0
|
|
|
|
|
|
|
|
364
|
0
|
0
|
|
|
|
|
if (ref($tok)) { |
|
365
|
|
|
|
|
|
|
##-- normal token: apply filters |
|
366
|
0
|
|
|
|
|
|
($w,$p,$l) = @$tok{qw(w p l)}; |
|
367
|
|
|
|
|
|
|
next if ((defined($pgood) && $p !~ $pgood) || ($pgoodh && !exists($pgoodh->{$p})) |
|
368
|
|
|
|
|
|
|
|| (defined($pbad) && $p =~ $pbad) || ($pbadh && exists($pbadh->{$p})) |
|
369
|
|
|
|
|
|
|
|| (defined($wgood) && $w !~ $wgood) || ($wgoodh && !exists($wgoodh->{$w})) |
|
370
|
|
|
|
|
|
|
|| (defined($wbad) && $w =~ $wbad) || ($wbadh && exists($wbadh->{$w})) |
|
371
|
|
|
|
|
|
|
|| (defined($lgood) && $l !~ $lgood) || ($lgoodh && !exists($lgoodh->{$l})) |
|
372
|
0
|
0
|
0
|
|
|
|
|| (defined($lbad) && $l =~ $lbad) || ($lbadh && exists($lbadh->{$l})) |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
373
|
|
|
|
|
|
|
); |
|
374
|
|
|
|
|
|
|
} |
|
375
|
0
|
0
|
0
|
|
|
|
push(@$ftokens,$tok) if (defined($tok) || (@$ftokens && defined($ftokens->[$#$ftokens]))); |
|
|
|
|
0
|
|
|
|
|
|
376
|
|
|
|
|
|
|
} |
|
377
|
0
|
|
|
|
|
|
$idoc->{tokens} = $ftokens; |
|
378
|
|
|
|
|
|
|
} |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
##-- create output document |
|
381
|
0
|
|
|
|
|
|
my $odoc = {}; |
|
382
|
0
|
|
|
|
|
|
@$odoc{@outkeys} = @$idoc{@outkeys}; |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
##-- dump output document (json) |
|
385
|
0
|
0
|
|
|
|
|
DiaColloDB::Utils::saveJsonFile($odoc,$outfile, pretty=>0,canonical=>0) |
|
386
|
|
|
|
|
|
|
or $ocorpus->logconfess("$logas: failed to save JSON data for '$infile' to '$outfile': $!"); |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
0
|
|
|
|
|
|
$ocorpus->vlog($ocorpus->{logThreads}, "$logas: worker thread #$thrid exiting normally"); |
|
390
|
0
|
|
|
|
|
|
$ocorpus->{logOpen} = 'off'; ##-- suppress 'close()' messages from worker threads |
|
391
|
0
|
|
|
|
|
|
}; |
|
392
|
|
|
|
|
|
|
##--/cb_worker |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
##-- spawn workers |
|
395
|
0
|
|
|
|
|
|
my $njobs = nJobs($ocorpus->{njobs}); |
|
396
|
0
|
0
|
0
|
|
|
|
if ($njobs==0 || !$HAVE_THREADS) { |
|
397
|
0
|
|
|
|
|
|
$ocorpus->info("$logas: running in serial mode"); |
|
398
|
0
|
|
|
|
|
|
$cb_worker->(0); |
|
399
|
|
|
|
|
|
|
} else { |
|
400
|
0
|
|
|
|
|
|
$ocorpus->info("$logas: running in parallel mode with $njobs job(s)"); |
|
401
|
0
|
|
|
|
|
|
my @workers = (map {threads->new($cb_worker,$_)} (1..$njobs)); |
|
|
0
|
|
|
|
|
|
|
|
402
|
0
|
|
|
|
|
|
foreach my $thrid (1..$njobs) { |
|
403
|
0
|
|
|
|
|
|
my $worker = $workers[$thrid-1]; |
|
404
|
0
|
|
|
|
|
|
$worker->join(); |
|
405
|
0
|
0
|
|
|
|
|
if (defined(my $err=$worker->error)) { |
|
406
|
0
|
|
|
|
|
|
$ocorpus->logconfess("$logas: error for worker thread #$thrid: $err"); |
|
407
|
|
|
|
|
|
|
} |
|
408
|
|
|
|
|
|
|
} |
|
409
|
|
|
|
|
|
|
} |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
##-- adopt list of compiled files |
|
412
|
0
|
|
|
|
|
|
$ocorpus->{size} += $nfiles; |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
##-- save header (happens implicitly on DESTROY() via close()) |
|
415
|
|
|
|
|
|
|
#$ocorpus->saveHeader() |
|
416
|
|
|
|
|
|
|
# or $ocorpus->logconfess("$logas: failed to save header file ", $ocorpus->headerFile, ": $!"); |
|
417
|
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
|
return $ocorpus; |
|
419
|
|
|
|
|
|
|
} |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
##============================================================================== |
|
423
|
|
|
|
|
|
|
## Corpus::Compiled API: union |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
## $ccorpus = $ccorpus->union(\@sources, %opts) |
|
426
|
|
|
|
|
|
|
## + merge source corpora \@sources to $opts{dbdir}, destructive |
|
427
|
|
|
|
|
|
|
## + each $src in \@sources is either a Corpus::Compiled object or a simple scalar (dbdir of a Corpus::Compiled object) |
|
428
|
|
|
|
|
|
|
## + honors $opts{flags} for append and truncate |
|
429
|
|
|
|
|
|
|
## + no filters are applied |
|
430
|
|
|
|
|
|
|
sub union { |
|
431
|
0
|
|
|
0
|
1
|
|
my ($that,$sources,%opts) = @_; |
|
432
|
0
|
0
|
|
|
|
|
my $ocorpus = ref($that) ? $that : $that->new(); |
|
433
|
0
|
|
|
|
|
|
my $logas = 'union()'; |
|
434
|
0
|
|
|
|
|
|
$ocorpus->vlog('info',$logas); |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
##-- save options before open() |
|
437
|
|
|
|
|
|
|
my $odir = $ocorpus->dbdir($opts{dbdir}) |
|
438
|
0
|
0
|
|
|
|
|
or $ocorpus->logconfess("$logas: no output corpus {dbdir} specified"); |
|
439
|
0
|
|
0
|
|
|
|
my $flags = (fcflags($ocorpus->{flags}) | fcflags($opts{flags})) || fcflags('w'); |
|
440
|
0
|
|
|
|
|
|
delete $opts{dbdir}; |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
##-- (re-)open output corpus |
|
443
|
0
|
0
|
0
|
|
|
|
if (!$ocorpus->opened || ($ocorpus->{dbdir} ne $odir)) { |
|
444
|
0
|
0
|
|
|
|
|
$ocorpus->open([$odir], %opts, flags=>$flags) |
|
445
|
|
|
|
|
|
|
or $ocorpus->logconfess("$logas: failed to (re-)open output corpus '$odir' in mode '", fcperl($flags)); |
|
446
|
|
|
|
|
|
|
} |
|
447
|
0
|
|
|
|
|
|
@$ocorpus{keys %opts} = values %opts; |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
##-- union: guts |
|
450
|
0
|
0
|
|
|
|
|
foreach my $src (UNIVERSAL::isa($sources,'ARRAY') ? @$sources : $sources) { |
|
451
|
0
|
0
|
|
|
|
|
my $idir = ref($src) ? $src->{dbdir} : $src; |
|
452
|
0
|
|
|
|
|
|
$ocorpus->vlog('info',"$logas: processing $idir"); |
|
453
|
|
|
|
|
|
|
|
|
454
|
0
|
0
|
|
|
|
|
my $icorpus = ref($src) ? $src : DiaColloDB::Corpus::Compiled->new(dbdir=>$src,logOpen=>undef) |
|
|
|
0
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
or $ocorpus->logconfess("union(): failed to open input corpus '$src': $!"); |
|
456
|
|
|
|
|
|
|
|
|
457
|
0
|
|
|
|
|
|
my $nifiles = $icorpus->{size}; |
|
458
|
0
|
|
|
|
|
|
my $osize = $ocorpus->size; |
|
459
|
|
|
|
|
|
|
|
|
460
|
0
|
|
|
|
|
|
my ($filei,$infile,$outfile); |
|
461
|
0
|
|
|
|
|
|
for ($filei=0; $filei < $nifiles; ++$filei) { |
|
462
|
0
|
|
|
|
|
|
$infile = $icorpus->ifile($filei); |
|
463
|
0
|
|
|
|
|
|
$outfile = "$odir/".($filei+$osize).".json"; |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
##-- link |
|
466
|
0
|
0
|
0
|
|
|
|
link($infile,$outfile) |
|
467
|
|
|
|
|
|
|
or symlink($infile,$outfile) |
|
468
|
|
|
|
|
|
|
or $ocorpus->logconfess("union(): failed to create output link $outfile -> $infile: $!"); |
|
469
|
|
|
|
|
|
|
} |
|
470
|
0
|
|
|
|
|
|
$ocorpus->{size} += $nifiles; |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
##-- all done |
|
474
|
|
|
|
|
|
|
#$ocorpus->vlog('info', "merged ", scalar(@$sources), " input corpora to $odir"); |
|
475
|
0
|
|
|
|
|
|
return $ocorpus; |
|
476
|
|
|
|
|
|
|
} |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
##============================================================================== |
|
480
|
|
|
|
|
|
|
## Footer |
|
481
|
|
|
|
|
|
|
1; |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
__END__ |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
|