line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Relation::Unigrams.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, profiling relation: unigram database (using DiaColloDB::PackedFile) |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Relation::Unigrams; |
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Relation; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
30
|
|
8
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::PackedFile; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
27
|
|
9
|
1
|
|
|
1
|
|
5
|
use DiaColloDB::Utils qw(:fcntl :sort :env :run :pack :file :jobs); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
10
|
1
|
|
|
1
|
|
387
|
use Fcntl qw(:DEFAULT :seek); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
32
|
|
11
|
1
|
|
|
1
|
|
488
|
use File::Basename qw(dirname); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
12
|
1
|
|
|
1
|
|
112
|
use version; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8
|
|
13
|
1
|
|
|
1
|
|
85
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
3325
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
##============================================================================== |
16
|
|
|
|
|
|
|
## Globals & Constants |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Relation); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
## $PFCLASS : object class for nested PackedFile objects |
21
|
|
|
|
|
|
|
our $PFCLASS = 'DiaColloDB::PackedFile::MMap'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
##============================================================================== |
24
|
|
|
|
|
|
|
## Constructors etc. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
## $ug = CLASS_OR_OBJECT->new(%args) |
27
|
|
|
|
|
|
|
## + %args, object structure: |
28
|
|
|
|
|
|
|
## ( |
29
|
|
|
|
|
|
|
## ##-- user options |
30
|
|
|
|
|
|
|
## base => $basename, ##-- file basename (default=undef:none); use files "${base}.dba1", "${base}.dba2", "${base}.hdr" |
31
|
|
|
|
|
|
|
## flags => $flags, ##-- fcntl flags or open-mode (default='r') |
32
|
|
|
|
|
|
|
## perms => $perms, ##-- creation permissions (default=(0666 &~umask)) |
33
|
|
|
|
|
|
|
## pack_i => $pack_i, ##-- pack-template for IDs (default='N') |
34
|
|
|
|
|
|
|
## pack_f => $pack_f, ##-- pack-template for frequencies (default='N') |
35
|
|
|
|
|
|
|
## pack_d => $pack_d, ##-- pack-tempalte for dates (default='n') |
36
|
|
|
|
|
|
|
## keeptmp => $bool, ##-- keep temporary files? (default=false) |
37
|
|
|
|
|
|
|
## mmap => $bool, ##-- use mmap access? (default=true) |
38
|
|
|
|
|
|
|
## logCompat => $level, ##-- log-level for compatibility warnings (default='warn') |
39
|
|
|
|
|
|
|
## ## |
40
|
|
|
|
|
|
|
## ##-- size info (after open() or load()) |
41
|
|
|
|
|
|
|
## size1 => $size1, ##-- == $r1->size() |
42
|
|
|
|
|
|
|
## size2 => $size2, ##-- == $r2->size() |
43
|
|
|
|
|
|
|
## sizeN => $sizeN, ##-- == $rN->size() |
44
|
|
|
|
|
|
|
## ## |
45
|
|
|
|
|
|
|
## ##-- low-level data |
46
|
|
|
|
|
|
|
## r1 => $r1, ##-- pf: [$end2] @ $i1 : constant (logical index) |
47
|
|
|
|
|
|
|
## r2 => $r2, ##-- pf: [$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) : sorted by $d1 for each $i1 |
48
|
|
|
|
|
|
|
## rN => $rN, ##-- pf: [$fN] @ $date - $ymin : totals by date |
49
|
|
|
|
|
|
|
## ymin => $dmin, ##-- constant == $coldb->{xdmin} |
50
|
|
|
|
|
|
|
## N => $N, ##-- sum($f12) [always used for version <= 0.11; used here only for slice==0] |
51
|
|
|
|
|
|
|
## version => $version, ##-- file version, for compatibility checks |
52
|
|
|
|
|
|
|
## ) |
53
|
|
|
|
|
|
|
sub new { |
54
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
55
|
0
|
|
0
|
|
|
|
my $ug = bless({ |
56
|
|
|
|
|
|
|
base =>undef, |
57
|
|
|
|
|
|
|
flags =>'r', |
58
|
|
|
|
|
|
|
perms =>(0666 & ~umask), |
59
|
|
|
|
|
|
|
pack_i=>'N', |
60
|
|
|
|
|
|
|
pack_f=>'N', |
61
|
|
|
|
|
|
|
pack_d=>'n', |
62
|
|
|
|
|
|
|
N => 0, |
63
|
|
|
|
|
|
|
version => $DiaColloDB::VERSION, |
64
|
|
|
|
|
|
|
logCompat => 'warn', |
65
|
|
|
|
|
|
|
#keeptmp => 0, |
66
|
|
|
|
|
|
|
#mmap => 1, |
67
|
|
|
|
|
|
|
@_ |
68
|
|
|
|
|
|
|
}, (ref($that)||$that)); |
69
|
0
|
|
0
|
|
|
|
$ug->{$_} //= $ug->mmclass($PFCLASS)->new() foreach (qw(r1 r2 rN)); |
70
|
0
|
|
|
|
|
|
$ug->{class} = ref($ug); |
71
|
0
|
0
|
|
|
|
|
return $ug->open() if (defined($ug->{base})); |
72
|
0
|
|
|
|
|
|
return $ug; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub DESTROY { |
76
|
0
|
0
|
|
0
|
|
|
$_[0]->close() if ($_[0]->opened); |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
##============================================================================== |
80
|
|
|
|
|
|
|
## Persistent API: disk usage |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
## @files = $obj->diskFiles() |
83
|
|
|
|
|
|
|
## + returns disk storage files, used by du() and timestamp() |
84
|
|
|
|
|
|
|
sub diskFiles { |
85
|
0
|
|
|
0
|
1
|
|
return map {"$_[0]{base}$_"} (qw(.hdr .dba1 .dba1.hdr .dba2 .dba2.hdr)); |
|
0
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
##============================================================================== |
89
|
|
|
|
|
|
|
## I/O |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
92
|
|
|
|
|
|
|
## I/O: open/close |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
## $ug_or_undef = $ug->open($base,$flags) |
95
|
|
|
|
|
|
|
## $ug_or_undef = $ug->open($base) |
96
|
|
|
|
|
|
|
## $ug_or_undef = $ug->open() |
97
|
|
|
|
|
|
|
sub open { |
98
|
0
|
|
|
0
|
1
|
|
my ($ug,$base,$flags) = @_; |
99
|
0
|
|
0
|
|
|
|
$base //= $ug->{base}; |
100
|
0
|
|
0
|
|
|
|
$flags //= $ug->{flags}; |
101
|
0
|
0
|
|
|
|
|
$ug->close() if ($ug->opened); |
102
|
0
|
|
|
|
|
|
$ug->{base} = $base; |
103
|
0
|
|
|
|
|
|
$ug->{flags} = $flags = fcflags($flags); |
104
|
0
|
|
|
|
|
|
my ($hdr); ##-- save header, for version-checking |
105
|
0
|
0
|
0
|
|
|
|
if (fcread($flags) && !fctrunc($flags)) { |
106
|
0
|
0
|
0
|
|
|
|
$hdr = ($ug->readHeader() || $ug->readHeader("$ug->{base}.dba.hdr")) |
107
|
|
|
|
|
|
|
or $ug->logconfess("failed to read header data from '$ug->{base}.hdr': $!"); |
108
|
0
|
0
|
|
|
|
|
$ug->loadHeaderData($hdr) |
109
|
|
|
|
|
|
|
or $ug->logconess("failed to load header data from '$ug->{base}.hdr': $!"); |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
##-- check compatibility |
113
|
0
|
|
|
|
|
|
my $min_version = qv(0.12.000); |
114
|
0
|
0
|
0
|
|
|
|
if ($hdr && (!defined($hdr->{version}) || version->parse($hdr->{version}) < $min_version)) { |
|
|
|
0
|
|
|
|
|
115
|
0
|
|
|
|
|
|
$ug->vlog($ug->{logCompat}, "using v0.11 compatibility mode for $ug->{base}.*; consider running \`dcdb-upgrade.perl ", dirname($ug->{base}), "\'"); |
116
|
0
|
|
|
|
|
|
DiaColloDB::Compat->usecompat('v0_11'); |
117
|
0
|
|
|
|
|
|
bless($ug, 'DiaColloDB::Compat::v0_11::Relation::Unigrams'); |
118
|
0
|
|
|
|
|
|
$ug->{version} = $hdr->{version}; |
119
|
0
|
|
|
|
|
|
return $ug->open($base,$flags); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
##-- open low-level data structures |
123
|
0
|
0
|
|
|
|
|
$ug->{r1}->open("$base.dba1", $flags, perms=>$ug->{perms}, packas=>"$ug->{pack_i}") |
124
|
|
|
|
|
|
|
or $ug->logconfess("open failed for $base.dba1: $!"); |
125
|
0
|
0
|
|
|
|
|
$ug->{r2}->open("$base.dba2", $flags, perms=>$ug->{perms}, packas=>"$ug->{pack_d}$ug->{pack_f}") |
126
|
|
|
|
|
|
|
or $ug->logconfess("open failed for $base.dba2: $!"); |
127
|
0
|
0
|
|
|
|
|
$ug->{rN}->open("$base.dbaN", $flags, perms=>$ug->{perms}, packas=>"$ug->{pack_f}") |
128
|
|
|
|
|
|
|
or $ug->logconfess("open failed for $base.dbaN: $!"); |
129
|
0
|
|
|
|
|
|
$ug->{size1} = $ug->{r1}->size; |
130
|
0
|
|
|
|
|
|
$ug->{size2} = $ug->{r2}->size; |
131
|
0
|
|
|
|
|
|
$ug->{sizeN} = $ug->{rN}->size; |
132
|
|
|
|
|
|
|
|
133
|
0
|
|
|
|
|
|
return $ug; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
## $ug_or_undef = $ug->close() |
137
|
|
|
|
|
|
|
sub close { |
138
|
0
|
|
|
0
|
1
|
|
my $ug = shift; |
139
|
0
|
0
|
0
|
|
|
|
if ($ug->opened && fcwrite($ug->{flags})) { |
140
|
0
|
0
|
|
|
|
|
$ug->saveHeader() or return undef; |
141
|
|
|
|
|
|
|
} |
142
|
0
|
0
|
|
|
|
|
$ug->{r1}->close() or return undef; |
143
|
0
|
0
|
|
|
|
|
$ug->{r2}->close() or return undef; |
144
|
0
|
0
|
|
|
|
|
$ug->{rN}->close() or return undef; |
145
|
0
|
|
|
|
|
|
undef $ug->{base}; |
146
|
0
|
|
|
|
|
|
return $ug; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
## $bool = $ug->opened() |
150
|
|
|
|
|
|
|
sub opened { |
151
|
0
|
|
|
0
|
1
|
|
my $ug = shift; |
152
|
|
|
|
|
|
|
return |
153
|
|
|
|
|
|
|
(defined($ug->{base}) |
154
|
|
|
|
|
|
|
&& defined($ug->{r1}) && $ug->{r1}->opened |
155
|
|
|
|
|
|
|
&& defined($ug->{r2}) && $ug->{r2}->opened |
156
|
|
|
|
|
|
|
&& defined($ug->{rN}) && $ug->{rN}->opened |
157
|
0
|
|
0
|
|
|
|
); |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
161
|
|
|
|
|
|
|
## I/O: header |
162
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
## @keys = $ug->headerKeys() |
165
|
|
|
|
|
|
|
## + keys to save as header |
166
|
|
|
|
|
|
|
sub headerKeys { |
167
|
0
|
|
0
|
0
|
1
|
|
return grep {!ref($_[0]{$_}) && $_ !~ m{^(?:base|flags|perms|log.*|packas|reclen)$}} keys %{$_[0]}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
## $bool = $ug->loadHeaderData($hdr) |
171
|
|
|
|
|
|
|
## + instantiates header data from $hdr |
172
|
|
|
|
|
|
|
## + overrides DiaColloDB::Persistent implementation |
173
|
|
|
|
|
|
|
sub loadHeaderData { |
174
|
0
|
|
|
0
|
1
|
|
my ($ug,$hdr) = @_; |
175
|
0
|
0
|
0
|
|
|
|
if (!defined($hdr) && !fccreat($ug->{flags})) { |
|
|
0
|
|
|
|
|
|
176
|
0
|
|
|
|
|
|
$ug->logconfess("loadHeaderData() failed to load header data from ", $ug->headerFile, ": $!"); |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
elsif (defined($hdr)) { |
179
|
0
|
|
|
|
|
|
return $ug->SUPER::loadHeaderData($hdr); |
180
|
|
|
|
|
|
|
} |
181
|
0
|
|
|
|
|
|
return $ug; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
## $bool = $enum->saveHeader() |
185
|
|
|
|
|
|
|
## + inherited from DiaColloDB::Persistent |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
188
|
|
|
|
|
|
|
## I/O: text |
189
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
## $bool = $obj->loadTextFile($filename_or_handle, %opts) |
192
|
|
|
|
|
|
|
## + wraps loadTextFh() |
193
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
## $ug = $ug->loadTextFh($fh,%opts) |
196
|
|
|
|
|
|
|
## + loads from text file as saved by saveTextFh() |
197
|
|
|
|
|
|
|
## + input fh must be sorted by $i1,$d1 |
198
|
|
|
|
|
|
|
## + supports multiple lines for pairs ($i1,$d1) provided the above conditions hold |
199
|
|
|
|
|
|
|
## + supports loading of $ug->{N} from single-component lines |
200
|
|
|
|
|
|
|
## + %opts: clobber %$ug |
201
|
|
|
|
|
|
|
sub loadTextFh { |
202
|
0
|
|
|
0
|
1
|
|
my ($ug,$infh,%opts) = @_; |
203
|
0
|
0
|
|
|
|
|
if (!ref($ug)) { |
204
|
0
|
|
|
|
|
|
$ug = $ug->new(%opts); |
205
|
|
|
|
|
|
|
} else { |
206
|
0
|
|
|
|
|
|
@$ug{keys %opts} = values %opts; |
207
|
|
|
|
|
|
|
} |
208
|
0
|
0
|
|
|
|
|
$ug->logconfess("loadTextFh(): cannot load unopened database!") if (!$ug->opened); |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
##-- common variables |
211
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
212
|
|
|
|
|
|
|
## $r2 : [$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
213
|
0
|
|
|
|
|
|
my ($r1,$r2,$rN) = @$ug{qw(r1 r2 rN)}; |
214
|
0
|
|
|
|
|
|
my ($pack_r1,$pack_r2) = map {$_->{packas}} ($r1,$r2); |
|
0
|
|
|
|
|
|
|
215
|
0
|
|
|
|
|
|
$r1->truncate(); |
216
|
0
|
|
|
|
|
|
$r2->truncate(); |
217
|
0
|
|
|
|
|
|
$rN->truncate(); |
218
|
0
|
|
|
|
|
|
my ($fh1,$fh2) = ($r1->{fh},$r2->{fh}); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
##-- iteration variables |
221
|
0
|
|
|
|
|
|
my ($pos1,$pos2) = (0,0); |
222
|
0
|
|
|
|
|
|
my ($i1_cur,$f1) = (-1,undef,0); |
223
|
0
|
|
|
|
|
|
my ($i1,$d1); |
224
|
0
|
|
|
|
|
|
my $N = 0; ##-- total marginal frequency as extracted from %fd |
225
|
0
|
|
|
|
|
|
my $N1 = 0; ##-- total N as extracted from single-element records |
226
|
0
|
|
|
|
|
|
my %fd = qw(); ##-- ($d=>$f1d, ...) for $i1_cur |
227
|
0
|
|
|
|
|
|
my %fN = qw(); ##-- ($d=>$fd, ...) global |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
##-- guts for inserting records from $i1_cur,%fd,$pos1,$pos2 : call on changed ($i1_cur) |
230
|
|
|
|
|
|
|
my $insert = sub { |
231
|
0
|
0
|
|
0
|
|
|
if ($i1_cur >= 0) { |
232
|
0
|
0
|
|
|
|
|
if ($i1_cur != $pos1) { |
233
|
|
|
|
|
|
|
##-- we've skipped one or more $i1 because it had no data-lines |
234
|
0
|
|
|
|
|
|
$fh1->print( pack($pack_r1,$pos2) x ($i1_cur-$pos1) ); |
235
|
0
|
|
|
|
|
|
$pos1 = $i1_cur; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
##-- dump r2-record(s) for ($i1_cur) |
239
|
0
|
|
|
|
|
|
foreach (sort {$a<=>$b} keys %fd) { |
|
0
|
|
|
|
|
|
|
240
|
0
|
|
|
|
|
|
$fh2->print(pack($pack_r2, $_,$fd{$_})); |
241
|
0
|
|
|
|
|
|
++$pos2; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
##-- dump r1-record for $i1_cur |
245
|
0
|
|
|
|
|
|
$fh1->print(pack($pack_r1, $pos2)); |
246
|
0
|
|
|
|
|
|
$pos1 = $i1_cur+1; |
247
|
|
|
|
|
|
|
} |
248
|
0
|
|
|
|
|
|
$i1_cur = $i1; |
249
|
0
|
|
|
|
|
|
%fd = qw(); |
250
|
0
|
|
|
|
|
|
}; |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
##-- ye olde loope |
253
|
0
|
|
|
|
|
|
binmode($infh,':raw'); |
254
|
0
|
|
|
|
|
|
while (defined($_=<$infh>)) { |
255
|
0
|
|
|
|
|
|
chomp; |
256
|
0
|
|
|
|
|
|
($f1,$i1,$d1) = split(' ',$_,3); |
257
|
0
|
0
|
|
|
|
|
if (!defined($i1)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
258
|
0
|
|
|
|
|
|
$N1 += $f1; ##-- load N values |
259
|
0
|
|
|
|
|
|
next; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
elsif ($i1 eq '') { |
262
|
0
|
|
|
|
|
|
next; ##-- ignore EOS counts from create() |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
elsif (!defined($d1)) { |
265
|
0
|
|
|
|
|
|
$ug->logconfess("loadTextFh(): failed to parse input line ", $infh->input_line_number); |
266
|
|
|
|
|
|
|
} |
267
|
0
|
0
|
|
|
|
|
$insert->() ##-- insert record(s) for ($i1_cur) |
268
|
|
|
|
|
|
|
if ($i1 != $i1_cur); |
269
|
0
|
|
|
|
|
|
$fd{$d1} += $f1; ##-- buffer frequencies for ($i1_cur,$d1_cur) |
270
|
0
|
|
|
|
|
|
$fN{$d1} += $f1; ##-- track N by date |
271
|
0
|
|
|
|
|
|
$N += $f1; ##-- track marginal N |
272
|
|
|
|
|
|
|
} |
273
|
0
|
|
|
|
|
|
$i1 = -1; |
274
|
0
|
|
|
|
|
|
$insert->(); ##-- write record(s) for final ($i1_cur) |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
##-- create $rN by date |
277
|
0
|
|
|
|
|
|
my @dates = sort {$a<=>$b} keys %fN; |
|
0
|
|
|
|
|
|
|
278
|
0
|
|
|
|
|
|
my $ymin = $ug->{ymin} = $dates[0]; |
279
|
0
|
|
0
|
|
|
|
$rN->{fh}->print(pack("($rN->{packas})*", map {$fN{$_}//0} ($ymin..$dates[$#dates]))); |
|
0
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
##-- adopt final $N and sizes |
282
|
0
|
0
|
|
|
|
|
$ug->{N} = $N1>$N ? $N1 : $N; |
283
|
0
|
|
|
|
|
|
foreach (qw(1 2 N)) { |
284
|
0
|
|
|
|
|
|
my $r = $ug->{"r$_"}; |
285
|
0
|
|
|
|
|
|
$r->flush(); |
286
|
0
|
|
|
|
|
|
$ug->{"size$_"} = $r->size; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
0
|
|
|
|
|
|
return $ug; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
## $bool = $obj->saveTextFile($filename_or_handle, %opts) |
293
|
|
|
|
|
|
|
## + wraps saveTextFh() |
294
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
## $bool = $ug->saveTextFh($fh,%opts) |
297
|
|
|
|
|
|
|
## + save from text file with lines of the form: |
298
|
|
|
|
|
|
|
## N ##-- 1 field : N |
299
|
|
|
|
|
|
|
## FREQ ID1 DATE ##-- 3 fields: unigram frequency for (ID1,DATE) |
300
|
|
|
|
|
|
|
## + %opts: |
301
|
|
|
|
|
|
|
## i2s => \&CODE, ##-- code-ref for formatting indices; called as $s=CODE($i) |
302
|
|
|
|
|
|
|
sub saveTextFh { |
303
|
0
|
|
|
0
|
1
|
|
my ($ug,$outfh,%opts) = @_; |
304
|
0
|
0
|
|
|
|
|
$ug->logconfess("saveTextFile(): cannot save unopened DB") if (!$ug->opened); |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
##-- common variables |
307
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
308
|
|
|
|
|
|
|
## $r2 : [$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
309
|
0
|
|
|
|
|
|
my ($r1,$r2) = @$ug{qw(r1 r2)}; |
310
|
0
|
|
|
|
|
|
my ($pack1,$pack2) = map {$_->{packas}} ($r1,$r2); |
|
0
|
|
|
|
|
|
|
311
|
0
|
|
|
|
|
|
my $i2s = $opts{i2s}; |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
##-- iteration variables |
314
|
0
|
|
|
|
|
|
my ($buf1,$i1,$s1,$end2); |
315
|
0
|
|
|
|
|
|
my ($buf2,$off2,$d1,$f1); |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
##-- ye olde loope |
318
|
0
|
|
|
|
|
|
binmode($outfh,':raw'); |
319
|
0
|
|
|
|
|
|
$outfh->print($ug->{N}, "\n"); |
320
|
0
|
|
|
|
|
|
for ($r1->seek($i1=0), $r2->seek($off2=0); !$r1->eof(); ++$i1) { |
321
|
0
|
0
|
|
|
|
|
$r1->read(\$buf1) or $ug->logconfess("saveTextFile(): failed to read record $i1 from $r1->{file}: $!"); |
322
|
0
|
|
|
|
|
|
$end2 = unpack($pack1,$buf1); |
323
|
0
|
0
|
|
|
|
|
$s1 = $i2s ? $i2s->($i1) : $i1; |
324
|
|
|
|
|
|
|
|
325
|
0
|
|
0
|
|
|
|
for ( ; $off2 < $end2 && !$r2->eof(); ++$off2) { |
326
|
0
|
0
|
|
|
|
|
$r2->read(\$buf2) or $ug->logconfess("saveTextFile(): failed to read record $off2 from $r2->{file}: $!"); |
327
|
0
|
|
|
|
|
|
($d1,$f1) = unpack($pack2,$buf2); |
328
|
|
|
|
|
|
|
|
329
|
0
|
|
|
|
|
|
$outfh->print(join("\t", $f1, $s1, $d1), "\n"); |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
|
333
|
0
|
|
|
|
|
|
return $ug; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
##============================================================================== |
338
|
|
|
|
|
|
|
## Relation API: create |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
## $ug = $CLASS_OR_OBJECT->create($coldb,$tokdat_file,%opts) |
341
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
342
|
|
|
|
|
|
|
## a tt-style text file containing with lines of the form: |
343
|
|
|
|
|
|
|
## TID DATE ##-- single token |
344
|
|
|
|
|
|
|
## "\n" ##-- blank line --> EOS |
345
|
|
|
|
|
|
|
## + %opts: clobber %$ug |
346
|
|
|
|
|
|
|
sub create { |
347
|
0
|
|
|
0
|
1
|
|
my ($ug,$coldb,$datfile,%opts) = @_; |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
##-- create/clobber |
350
|
0
|
0
|
|
|
|
|
$ug = $ug->new() if (!ref($ug)); |
351
|
0
|
|
|
|
|
|
@$ug{keys %opts} = values %opts; |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
##-- ensure openend |
354
|
0
|
0
|
0
|
|
|
|
$ug->opened |
355
|
|
|
|
|
|
|
or $ug->open() |
356
|
|
|
|
|
|
|
or $ug->logconfess("create(): failed to open unigrams database: $!"); |
357
|
|
|
|
|
|
|
|
358
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
359
|
0
|
0
|
|
|
|
|
my $cmdfh = opencmd("sort -nk1 -nk2 ".sortJobs()." $datfile | uniq -c |") |
360
|
|
|
|
|
|
|
or $ug->logconfess("create(): failed to open pipe from sort: $!"); |
361
|
0
|
0
|
|
|
|
|
$ug->loadTextFh($cmdfh) |
362
|
|
|
|
|
|
|
or $ug->logconfess("create(): failed to load unigram data: $!"); |
363
|
0
|
0
|
|
|
|
|
$cmdfh->close() |
364
|
|
|
|
|
|
|
or $ug->logconfess("create(): failed to close pipe from sort: $!"); |
365
|
0
|
|
|
|
|
|
env_pop(); |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
##-- save header |
368
|
0
|
0
|
|
|
|
|
$ug->saveHeader() |
369
|
|
|
|
|
|
|
or $ug->logconfess("create(): failed to save header: $!"); |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
##-- done |
372
|
0
|
|
|
|
|
|
return $ug; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
##============================================================================== |
376
|
|
|
|
|
|
|
## Relation API: union |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
## $ug = CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
379
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
380
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$ug,\@ti2u],...) |
381
|
|
|
|
|
|
|
## of unigram-objects $ug and tuple-id maps \@ti2u for $ug |
382
|
|
|
|
|
|
|
## - \@ti2u may also be a mapping object supporting a toArray() method |
383
|
|
|
|
|
|
|
## + %opts: clobber %$ug |
384
|
|
|
|
|
|
|
## + implicitly flushes the new index |
385
|
|
|
|
|
|
|
sub union { |
386
|
0
|
|
|
0
|
1
|
|
my ($ug,$coldb,$pairs,%opts) = @_; |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
##-- create/clobber |
389
|
0
|
0
|
|
|
|
|
$ug = $ug->new() if (!ref($ug)); |
390
|
0
|
|
|
|
|
|
@$ug{keys %opts} = values %opts; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
##-- tempfile (input for sort) |
393
|
0
|
|
|
|
|
|
my $tmpfile = "$ug->{base}.udat"; |
394
|
0
|
0
|
|
|
|
|
my $tmpfh = IO::File->new(">$tmpfile") |
395
|
|
|
|
|
|
|
or $ug->logconfess("union(): open failed for tempfile $tmpfile: $!"); |
396
|
0
|
|
|
|
|
|
binmode($tmpfh,':raw'); |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
##-- stage1: dump argument relations to text tempfile |
399
|
0
|
|
|
|
|
|
$ug->vlog('trace', "union(): stage1: collect items"); |
400
|
0
|
|
|
|
|
|
my ($pair,$pxf,$pi2u,$pi2s); |
401
|
0
|
|
|
|
|
|
my $pairi =0; |
402
|
0
|
|
|
|
|
|
foreach $pair (@$pairs) { |
403
|
0
|
|
|
|
|
|
($pxf,$pi2u) = @$pair; |
404
|
0
|
0
|
|
|
|
|
$pi2u = $pi2u->toArray() if (UNIVERSAL::can($pi2u,'toArray')); |
405
|
0
|
|
|
0
|
|
|
$pxf->saveTextFh($tmpfh, i2s=>sub { $pi2u->[$_[0]] }) |
406
|
0
|
0
|
|
|
|
|
or $ug->logconfess("union(): failed to extract data for argument $pairi"); |
407
|
0
|
|
|
|
|
|
++$pairi; |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
$tmpfh->close() |
410
|
0
|
0
|
|
|
|
|
or $ug->logconfess("union(): failed to close tempfile $tmpfile: $!"); |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
##-- stage2: sort & load tempfile |
413
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
414
|
0
|
|
|
|
|
|
$ug->vlog('trace', "union(): stage2: load unigram frequencies"); |
415
|
0
|
0
|
|
|
|
|
my $sortfh = opencmd("sort -n -k2 -k3 ".sortJobs()." $tmpfile |") |
416
|
|
|
|
|
|
|
or $ug->logconfess("union(): open failed for pipe from sort: $!"); |
417
|
0
|
|
|
|
|
|
binmode($sortfh,':raw'); |
418
|
0
|
0
|
|
|
|
|
$ug->loadTextFh($sortfh) |
419
|
|
|
|
|
|
|
or $ug->logconfess("union(): failed to load unigram frequencies from $tmpfile: $!"); |
420
|
0
|
0
|
|
|
|
|
$sortfh->close() |
421
|
|
|
|
|
|
|
or $ug->logconfess("union(): failed to close pipe from sort: $!"); |
422
|
0
|
|
|
|
|
|
env_pop(); |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
##-- stage3: header |
425
|
0
|
0
|
|
|
|
|
$ug->saveHeader() |
426
|
|
|
|
|
|
|
or $ug->logconfess("union(): failed to save header: $!"); |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
##-- cleanup: unlink temp file(s) |
429
|
0
|
0
|
|
|
|
|
CORE::unlink($tmpfile) if (!$ug->{keeptmp}); |
430
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
return $ug; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
##============================================================================== |
435
|
|
|
|
|
|
|
## Relation API: dbinfo |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
438
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
439
|
|
|
|
|
|
|
sub dbinfo { |
440
|
0
|
|
|
0
|
0
|
|
my $ug = shift; |
441
|
0
|
|
|
|
|
|
my $info = $ug->SUPER::dbinfo(); |
442
|
0
|
|
|
|
|
|
@$info{qw(size1 size2 sizeN N)} = @$ug{qw(size1 size2 sizeN N)}; |
443
|
0
|
|
|
|
|
|
return $info; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
##============================================================================== |
448
|
|
|
|
|
|
|
## Utils: lookup |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
## $N = $cof->sliceN($sliceBy, $dateLo) |
451
|
|
|
|
|
|
|
## + get total slice co-occurrence count, used by subprofile1() |
452
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Relation |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
##============================================================================== |
455
|
|
|
|
|
|
|
## Relation API: default |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
458
|
|
|
|
|
|
|
## Relation API: default: sliceN |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
## $N = $rel->sliceN($sliceBy, $dateLo) |
461
|
|
|
|
|
|
|
## + get total slice-wise co-occurrence count for a slice of size $sliceBy starting at $dateLo |
462
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Relation |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
465
|
|
|
|
|
|
|
## Relation API: default: profile |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile1(\@tids,\%opts) |
468
|
|
|
|
|
|
|
## + get slice-wise joint co-frequency profile(s) for @tids (db must be opened; f1 and f12 only) |
469
|
|
|
|
|
|
|
## + %opts: as for profile(), also: |
470
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
471
|
|
|
|
|
|
|
## dreq => \%dreq, ##-- parsed date request |
472
|
|
|
|
|
|
|
sub subprofile1 { |
473
|
0
|
|
|
0
|
1
|
|
my ($ug,$tids,$opts) = @_; |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
##-- common variables |
476
|
0
|
0
|
|
|
|
|
$tids = [$tids] if (!UNIVERSAL::isa($tids,'ARRAY')); |
477
|
0
|
|
|
|
|
|
my $coldb = $opts->{coldb}; |
478
|
0
|
|
|
|
|
|
my $slice = $opts->{slice}; |
479
|
0
|
|
|
|
|
|
my $dreq = $opts->{dreq}; |
480
|
0
|
|
|
|
|
|
my $dfilter = $dreq->{dfilter}; |
481
|
0
|
|
|
|
|
|
my $groupby = $opts->{groupby}{ti2g}; |
482
|
0
|
|
|
|
|
|
my $extend = $opts->{extend}; |
483
|
0
|
|
|
|
|
|
my $onepass = $opts->{onepass}; |
484
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
##-- vars: relation-wise |
487
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
488
|
|
|
|
|
|
|
## $r2 : [$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
489
|
0
|
|
|
|
|
|
my ($r1,$r2) = @$ug{qw(r1 r2)}; |
490
|
0
|
|
|
|
|
|
my ($pack1,$pack2) = map {$_->{packas}} ($r1,$r2); |
|
0
|
|
|
|
|
|
|
491
|
0
|
|
|
|
|
|
my $pack2d = $ug->{pack_d}; |
492
|
0
|
|
|
|
|
|
my $pack2f = '@'.packsize("$ug->{pack_i}").$ug->{pack_f}; |
493
|
0
|
|
0
|
|
|
|
my $size1 = $ug->{size1} // ($ug->{size1}=$r1->size); |
494
|
0
|
|
0
|
|
|
|
my $size2 = $ug->{size2} // ($ug->{size2}=$r2->size); |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
##-- setup %slice2prf |
497
|
|
|
|
|
|
|
my %slice2prf = map { |
498
|
0
|
|
|
|
|
|
($_ => DiaColloDB::Profile->new(f1=>0, N=>$ug->sliceN($slice,$_))) |
499
|
0
|
0
|
|
|
|
|
} ($slice ? (map {$_*$slice} (($dreq->{slo}/$slice)..($dreq->{shi}/$slice))) : 0); |
|
0
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
##-- ye olde loope |
503
|
0
|
|
|
|
|
|
my ($i1,$beg2,$end2, $pos2,$d1,$ds,$dprf,$f1, $key2,$buf); |
504
|
0
|
|
|
|
|
|
foreach $i1 (@$tids) { |
505
|
0
|
0
|
|
|
|
|
next if ($i1 >= $size1); |
506
|
0
|
0
|
|
|
|
|
$beg2 = ($i1==0 ? 0 : unpack($pack1,$r1->fetchraw($i1-1,\$buf))); |
507
|
0
|
|
|
|
|
|
$end2 = unpack($pack1, $r1->fetchraw($i1,\$buf)); |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
##-- check groupby "having" filter |
510
|
0
|
0
|
|
|
|
|
$key2 = $groupby ? $groupby->($i1) : pack($pack_id,$i1); |
511
|
|
|
|
|
|
|
|
512
|
0
|
0
|
|
|
|
|
next if ($beg2 >= $size2); |
513
|
0
|
|
|
|
|
|
for ($pos2=$beg2; $pos2 < $end2; ++$pos2) { |
514
|
0
|
|
|
|
|
|
($d1,$f1) = unpack($pack2, $r2->fetchraw($pos2,\$buf)); |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
##-- check date-filter & get slice-local profile $dprf |
517
|
0
|
0
|
0
|
|
|
|
next if ($dfilter && !$dfilter->($d1)); |
518
|
0
|
0
|
|
|
|
|
$ds = $slice ? int($d1/$slice)*$slice : 0; |
519
|
0
|
|
|
|
|
|
$dprf = $slice2prf{$ds}; |
520
|
0
|
|
|
|
|
|
$dprf->{f1} += $f1; |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
next if (!defined($key2) ##-- item2 selection via groupby CODE-ref |
523
|
0
|
0
|
0
|
|
|
|
|| ($extend && !exists($extend->{$ds}{$key2})) ##-- ... or via 'extend' parameter |
|
|
|
0
|
|
|
|
|
524
|
|
|
|
|
|
|
); |
525
|
0
|
|
|
|
|
|
$dprf->{f12}{$key2} += $f1; |
526
|
0
|
|
|
|
|
|
$dprf->{f2}{$key2} += $f1; |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
|
530
|
0
|
|
|
|
|
|
return \%slice2prf; |
531
|
|
|
|
|
|
|
} |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
534
|
|
|
|
|
|
|
## Relation API: default: subprofile2 |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf, \%opts) |
537
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
538
|
|
|
|
|
|
|
## + %opts: as for subprofile1() |
539
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Relation : no-op |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
542
|
|
|
|
|
|
|
## Relation API: default: qinfo |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
545
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
546
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
547
|
|
|
|
|
|
|
## ( |
548
|
|
|
|
|
|
|
## qreqs => \@qreqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
549
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
550
|
|
|
|
|
|
|
## ) |
551
|
|
|
|
|
|
|
sub qinfo { |
552
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
553
|
0
|
|
|
|
|
|
my ($q1strs,$q2strs,$qxstrs,$fstrs) = $rel->qinfoData($coldb,%opts); |
554
|
|
|
|
|
|
|
|
555
|
0
|
|
|
|
|
|
my @qstrs = (@$q1strs, @$q2strs, @$qxstrs); |
556
|
0
|
0
|
|
|
|
|
@qstrs = ('*') if (!@qstrs); |
557
|
0
|
0
|
|
|
|
|
my $qstr = ('('.join(' WITH ', @qstrs).') =1' |
558
|
|
|
|
|
|
|
.' #SEPARATE' |
559
|
|
|
|
|
|
|
.(@$fstrs ? (' '.join(' ',@$fstrs)) : ''), |
560
|
|
|
|
|
|
|
); |
561
|
|
|
|
|
|
|
return { |
562
|
0
|
|
|
|
|
|
fcoef => 1, |
563
|
|
|
|
|
|
|
qtemplate => $qstr, |
564
|
|
|
|
|
|
|
qcanon => $rel->qcanon($coldb,%opts), |
565
|
|
|
|
|
|
|
}; |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
##============================================================================== |
569
|
|
|
|
|
|
|
## Pacakge Alias(es) |
570
|
|
|
|
|
|
|
package DiaColloDB::Unigrams; |
571
|
1
|
|
|
1
|
|
10
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
55
|
|
572
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Relation::Unigrams); |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
##============================================================================== |
575
|
|
|
|
|
|
|
## Footer |
576
|
|
|
|
|
|
|
1; |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
__END__ |