line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Compat::v0_09::Relation::Cofreqs.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, profiling relation: co-frequency database (v0.9x format) |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Relation::Cofreqs; |
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Compat::v0_09::Relation; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
8
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::PackedFile; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
9
|
1
|
|
|
1
|
|
5
|
use DiaColloDB::PackedFile::MMap; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
10
|
1
|
|
|
1
|
|
4
|
use DiaColloDB::Utils qw(:fcntl :env :run :json :pack); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
39
|
|
11
|
1
|
|
|
1
|
|
387
|
use Fcntl qw(:DEFAULT :seek); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
12
|
1
|
|
|
1
|
|
452
|
use strict; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
1508
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
##============================================================================== |
15
|
|
|
|
|
|
|
## Globals & Constants |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Compat::v0_09::Relation); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
## $PFCLASS : object class for nested PackedFile objects |
20
|
|
|
|
|
|
|
our $PFCLASS = 'DiaColloDB::PackedFile::MMap'; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
##============================================================================== |
23
|
|
|
|
|
|
|
## Constructors etc. |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
## $cof = CLASS_OR_OBJECT->new(%args) |
26
|
|
|
|
|
|
|
## + %args, object structure: |
27
|
|
|
|
|
|
|
## ( |
28
|
|
|
|
|
|
|
## ##-- user options |
29
|
|
|
|
|
|
|
## class => $class, ##-- optional, useful for debugging from header file |
30
|
|
|
|
|
|
|
## base => $basename, ##-- file basename (default=undef:none); use files "${base}.dba1", "${base}.dba2", "${base}.hdr" |
31
|
|
|
|
|
|
|
## flags => $flags, ##-- fcntl flags or open-mode (default='r') |
32
|
|
|
|
|
|
|
## perms => $perms, ##-- creation permissions (default=(0666 &~umask)) |
33
|
|
|
|
|
|
|
## dmax => $dmax, ##-- maximum distance for co-occurrences (default=5) |
34
|
|
|
|
|
|
|
## fmin => $fmin, ##-- minimum pair frequency (default=0) |
35
|
|
|
|
|
|
|
## pack_i => $pack_i, ##-- pack-template for IDs (default='N') |
36
|
|
|
|
|
|
|
## pack_f => $pack_f, ##-- pack-template for IDs (default='N') |
37
|
|
|
|
|
|
|
## keeptmp => $bool, ##-- keep temporary files? (default=false) |
38
|
|
|
|
|
|
|
## ## |
39
|
|
|
|
|
|
|
## ##-- size info (after open() or load()) |
40
|
|
|
|
|
|
|
## size1 => $size1, ##-- == $r1->size() |
41
|
|
|
|
|
|
|
## size2 => $size2, ##-- == $r2->size() |
42
|
|
|
|
|
|
|
## ## |
43
|
|
|
|
|
|
|
## ##-- low-level data |
44
|
|
|
|
|
|
|
## r1 => $r1, ##-- pf: [$end2,$f1] @ $i1 |
45
|
|
|
|
|
|
|
## r2 => $r2, ##-- pf: [$i2,$f12] @ end2($i1-1)..(end2($i1)-1) |
46
|
|
|
|
|
|
|
## N => $N, ##-- sum($f12) |
47
|
|
|
|
|
|
|
## ) |
48
|
|
|
|
|
|
|
sub new { |
49
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
50
|
0
|
|
0
|
|
|
|
my $cof = bless({ |
51
|
|
|
|
|
|
|
base =>undef, |
52
|
|
|
|
|
|
|
flags =>'r', |
53
|
|
|
|
|
|
|
perms =>(0666 & ~umask), |
54
|
|
|
|
|
|
|
dmax =>5, |
55
|
|
|
|
|
|
|
fmin =>0, |
56
|
|
|
|
|
|
|
pack_i=>'N', |
57
|
|
|
|
|
|
|
pack_f=>'N', |
58
|
|
|
|
|
|
|
r1 => $PFCLASS->new(), |
59
|
|
|
|
|
|
|
r2 => $PFCLASS->new(), |
60
|
|
|
|
|
|
|
N => 0, |
61
|
|
|
|
|
|
|
@_ |
62
|
|
|
|
|
|
|
}, (ref($that)||$that)); |
63
|
0
|
|
|
|
|
|
$cof->{class} = ref($cof); |
64
|
0
|
0
|
|
|
|
|
return $cof->open() if (defined($cof->{base})); |
65
|
0
|
|
|
|
|
|
return $cof; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub DESTROY { |
69
|
0
|
0
|
|
0
|
|
|
$_[0]->close() if ($_[0]->opened); |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##============================================================================== |
73
|
|
|
|
|
|
|
## I/O |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
76
|
|
|
|
|
|
|
## I/O: open/close |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
## $cof_or_undef = $cof->open($base,$flags) |
79
|
|
|
|
|
|
|
## $cof_or_undef = $cof->open($base) |
80
|
|
|
|
|
|
|
## $cof_or_undef = $cof->open() |
81
|
|
|
|
|
|
|
sub open { |
82
|
0
|
|
|
0
|
0
|
|
my ($cof,$base,$flags) = @_; |
83
|
0
|
|
0
|
|
|
|
$base //= $cof->{base}; |
84
|
0
|
|
0
|
|
|
|
$flags //= $cof->{flags}; |
85
|
0
|
0
|
|
|
|
|
$cof->close() if ($cof->opened); |
86
|
0
|
|
|
|
|
|
$cof->{base} = $base; |
87
|
0
|
|
|
|
|
|
$cof->{flags} = $flags = fcflags($flags); |
88
|
0
|
0
|
0
|
|
|
|
if (fcread($flags) && !fctrunc($flags)) { |
89
|
0
|
0
|
|
|
|
|
$cof->loadHeader() |
90
|
|
|
|
|
|
|
or $cof->logconess("failed to load header from '$cof->{base}.hdr': $!"); |
91
|
|
|
|
|
|
|
} |
92
|
0
|
0
|
|
|
|
|
$cof->{r1}->open("$base.dba1", $flags, perms=>$cof->{perms}, packas=>"$cof->{pack_i}$cof->{pack_f}") |
93
|
|
|
|
|
|
|
or $cof->logconfess("open failed for $base.dba1: $!"); |
94
|
0
|
0
|
|
|
|
|
$cof->{r2}->open("$base.dba2", $flags, perms=>$cof->{perms}, packas=>"$cof->{pack_i}$cof->{pack_f}") |
95
|
|
|
|
|
|
|
or $cof->logconfess("open failed for $base.dba2: $!"); |
96
|
0
|
|
|
|
|
|
$cof->{size1} = $cof->{r1}->size; |
97
|
0
|
|
|
|
|
|
$cof->{size2} = $cof->{r2}->size; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
#$cof->debug("open(): opened level-1 relation $cof->{r1}{file}.* as ", ref($cof->{r1})); |
100
|
|
|
|
|
|
|
#$cof->debug("open(): opened level-2 relation $cof->{r2}{file}.* as ", ref($cof->{r2})); |
101
|
0
|
|
|
|
|
|
return $cof; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
## $cof_or_undef = $cof->close() |
105
|
|
|
|
|
|
|
sub close { |
106
|
0
|
|
|
0
|
0
|
|
my $cof = shift; |
107
|
0
|
0
|
0
|
|
|
|
if ($cof->opened && fcwrite($cof->{flags})) { |
108
|
0
|
0
|
|
|
|
|
$cof->saveHeader() or return undef; |
109
|
|
|
|
|
|
|
} |
110
|
0
|
0
|
|
|
|
|
$cof->{r1}->close() or return undef; |
111
|
0
|
0
|
|
|
|
|
$cof->{r2}->close() or return undef; |
112
|
0
|
|
|
|
|
|
undef $cof->{base}; |
113
|
0
|
|
|
|
|
|
return $cof; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
## $bool = $cof->opened() |
117
|
|
|
|
|
|
|
sub opened { |
118
|
0
|
|
|
0
|
0
|
|
my $cof = shift; |
119
|
|
|
|
|
|
|
return |
120
|
|
|
|
|
|
|
(defined($cof->{base}) |
121
|
|
|
|
|
|
|
&& defined($cof->{r1}) && $cof->{r1}->opened |
122
|
0
|
|
0
|
|
|
|
&& defined($cof->{r2}) && $cof->{r2}->opened); |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
126
|
|
|
|
|
|
|
## I/O: header |
127
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
## @keys = $cof->headerKeys() |
130
|
|
|
|
|
|
|
## + keys to save as header |
131
|
|
|
|
|
|
|
sub headerKeys { |
132
|
0
|
|
0
|
0
|
1
|
|
return grep {!ref($_[0]{$_}) && $_ !~ m{^(?:base|flags|perms)$}} keys %{$_[0]}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
## $bool = $cof->loadHeaderData($hdr) |
136
|
|
|
|
|
|
|
## + instantiates header data from $hdr |
137
|
|
|
|
|
|
|
## + overrides DiaColloDB::Persistent implementation |
138
|
|
|
|
|
|
|
sub loadHeaderData { |
139
|
0
|
|
|
0
|
1
|
|
my ($cof,$hdr) = @_; |
140
|
0
|
0
|
0
|
|
|
|
if (!defined($hdr) && !fccreat($cof->{flags})) { |
|
|
0
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
$cof->logconfess("loadHeaderData() failed to load header data from ", $cof->headerFile, ": $!"); |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
elsif (defined($hdr)) { |
144
|
0
|
|
|
|
|
|
return $cof->SUPER::loadHeaderData($hdr); |
145
|
|
|
|
|
|
|
} |
146
|
0
|
|
|
|
|
|
return $cof; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
## $bool = $enum->saveHeader() |
150
|
|
|
|
|
|
|
## + inherited from DiaColloDB::Persistent |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
153
|
|
|
|
|
|
|
## I/O: text |
154
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
## $bool = $obj->loadTextFile($filename_or_handle, %opts) |
157
|
|
|
|
|
|
|
## + wraps loadTextFh() |
158
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
## $cof = $cof->loadTextFh($fh,%opts) |
161
|
|
|
|
|
|
|
## + loads from text file as saved by saveTextFh() |
162
|
|
|
|
|
|
|
## + supports semi-sorted input: input fh must be sorted by $i1, |
163
|
|
|
|
|
|
|
## and all $i2 for each $i1 must be adjacent (i.e. no intervening $j1 != $i1) |
164
|
|
|
|
|
|
|
## + supports multiple lines for pairs ($i1,$i2) provided the above conditions hold |
165
|
|
|
|
|
|
|
## + supports loading of $cof->{N} from single-value lines |
166
|
|
|
|
|
|
|
## + %opts: clobber %$cof |
167
|
|
|
|
|
|
|
sub loadTextFh { |
168
|
0
|
|
|
0
|
1
|
|
my ($cof,$infh,%opts) = @_; |
169
|
0
|
0
|
|
|
|
|
if (!ref($cof)) { |
170
|
0
|
|
|
|
|
|
$cof = $cof->new(%opts); |
171
|
|
|
|
|
|
|
} else { |
172
|
0
|
|
|
|
|
|
@$cof{keys %opts} = values %opts; |
173
|
|
|
|
|
|
|
} |
174
|
0
|
0
|
|
|
|
|
$cof->logconfess("loadTextFh(): cannot load unopened database!") if (!$cof->opened); |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
##-- common variables |
177
|
0
|
|
|
|
|
|
my $pack_f = $cof->{pack_f}; |
178
|
0
|
|
|
|
|
|
my $pack_i = $cof->{pack_i}; |
179
|
0
|
|
|
|
|
|
my $pack_r1 = "${pack_i}${pack_f}"; ##-- $r1 : [$end2,$f1] @ $i1 |
180
|
0
|
|
|
|
|
|
my $pack_r2 = "${pack_i}${pack_f}"; ##-- $r2 : [$i2,$f12] @ end2($i1-1)..(end2($i1)-1) |
181
|
0
|
|
|
|
|
|
my $len_r2 = packsize($pack_r2); |
182
|
0
|
|
0
|
|
|
|
my $fmin = $cof->{fmin} // 0; |
183
|
0
|
|
|
|
|
|
my ($r1,$r2) = @$cof{qw(r1 r2)}; |
184
|
0
|
|
|
|
|
|
$r1->truncate(); |
185
|
0
|
|
|
|
|
|
$r2->truncate(); |
186
|
0
|
|
|
|
|
|
my ($fh1,$fh2) = ($r1->{fh},$r2->{fh}); |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
##-- iteration variables |
189
|
0
|
|
|
|
|
|
my ($pos1,$pos2) = (0,0); |
190
|
0
|
|
|
|
|
|
my ($i1_cur,$f1) = (-1,0); |
191
|
0
|
|
|
|
|
|
my ($f12,$i1,$i2,$f); |
192
|
0
|
|
|
|
|
|
my $N = 0; ##-- total marginal frequency as extracted from %f12 |
193
|
0
|
|
|
|
|
|
my $N1 = 0; ##-- total N as extracted from single-element records |
194
|
0
|
|
|
|
|
|
my %f12 = qw(); ##-- ($i2=>$f12, ...) for $i1_cur; un-collocated f1 counts appear as $i2='-1' |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
##-- guts for inserting records from $i1_cur,%f12,$pos1,$pos2 |
197
|
|
|
|
|
|
|
my $insert = sub { |
198
|
0
|
0
|
|
0
|
|
|
if ($i1_cur >= 0) { |
199
|
0
|
0
|
|
|
|
|
if ($i1_cur != $pos1) { |
200
|
|
|
|
|
|
|
##-- we've skipped one or more $i1 because it had no collocates (e.g. kern01 i1=287123="Untier/1906") |
201
|
0
|
|
|
|
|
|
$fh1->print( pack($pack_r1,$pos2,0) x ($i1_cur-$pos1) ); |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
##-- dump r2-records for $i1_cur |
204
|
0
|
|
|
|
|
|
$f1 = 0; |
205
|
0
|
|
|
|
|
|
foreach (sort {$a<=>$b} keys %f12) { |
|
0
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
|
$f = $f12{$_}; |
207
|
0
|
|
|
|
|
|
$f1 += $f; |
208
|
0
|
0
|
0
|
|
|
|
next if ($f < $fmin || $_ < 0); ##-- skip here so we can track "real" marginal frequencies |
209
|
0
|
|
|
|
|
|
$fh2->print(pack($pack_r2, $_,$f)); |
210
|
0
|
|
|
|
|
|
++$pos2; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
##-- dump r1-record for $i1_cur |
213
|
0
|
|
|
|
|
|
$fh1->print(pack($pack_r1, $pos2,$f1)); |
214
|
0
|
|
|
|
|
|
$pos1 = $i1_cur+1; |
215
|
0
|
|
|
|
|
|
$N += $f1; |
216
|
|
|
|
|
|
|
} |
217
|
0
|
|
|
|
|
|
$i1_cur = $i1; |
218
|
0
|
|
|
|
|
|
%f12 = qw(); |
219
|
0
|
|
|
|
|
|
}; |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
##-- ye olde loope |
222
|
0
|
|
|
|
|
|
binmode($infh,':raw'); |
223
|
0
|
|
|
|
|
|
while (defined($_=<$infh>)) { |
224
|
0
|
|
|
|
|
|
chomp; |
225
|
0
|
|
|
|
|
|
($f12,$i1,$i2) = split(' ',$_,3); |
226
|
0
|
0
|
|
|
|
|
if (!defined($i1)) { |
227
|
|
|
|
|
|
|
#$cof->debug("N1 += $f12"); |
228
|
0
|
|
|
|
|
|
$N1 += $f12; ##-- load N values |
229
|
0
|
|
|
|
|
|
next; |
230
|
|
|
|
|
|
|
} |
231
|
0
|
0
|
|
|
|
|
$insert->() if ($i1 != $i1_cur); ##-- insert record(s) for $i1_cur |
232
|
0
|
|
0
|
|
|
|
$f12{($i2//-1)} += $f12; ##-- buffer co-frequencies for $i1_cur; track un-collocated frequencies as $i2=-1 |
233
|
|
|
|
|
|
|
} |
234
|
0
|
|
|
|
|
|
$insert->(); ##-- write record(s) for final $i1_cur |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
##-- adopt final $N and sizes |
237
|
|
|
|
|
|
|
#$cof->debug("FINAL: N1=$N1, N=$N"); |
238
|
0
|
0
|
|
|
|
|
$cof->{N} = $N1>$N ? $N1 : $N; |
239
|
0
|
|
|
|
|
|
$_->remap() foreach (grep {$_->can('remap')} @$cof{qw(r1 r2)}); |
|
0
|
|
|
|
|
|
|
240
|
0
|
|
|
|
|
|
$cof->{size1} = $r1->size; |
241
|
0
|
|
|
|
|
|
$cof->{size2} = $r2->size; |
242
|
|
|
|
|
|
|
|
243
|
0
|
|
|
|
|
|
return $cof; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
## $cof = $cof->loadTextFile_create($fh,%opts) |
247
|
|
|
|
|
|
|
## + old version of loadTextFile() which doesn't support N, semi-sorted input, or multiple ($i1,$i2) entries |
248
|
|
|
|
|
|
|
## + not useable by union() method |
249
|
|
|
|
|
|
|
BEGIN { |
250
|
1
|
|
|
1
|
|
10
|
*loadTextFile_create = DiaColloDB::Compat->nocompat("loadTextFile_create"); |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
## $bool = $obj->saveTextFile($filename_or_handle, %opts) |
254
|
|
|
|
|
|
|
## + wraps saveTextFh() |
255
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
## $bool = $cof->saveTextFh($fh,%opts) |
258
|
|
|
|
|
|
|
## + save from text file with lines of the form: |
259
|
|
|
|
|
|
|
## N ##-- 1 field : N |
260
|
|
|
|
|
|
|
## FREQ ID1 ##-- 2 fields: un-collocated portion of $f1 |
261
|
|
|
|
|
|
|
## FREQ ID1 ID2 ##-- 3 fields: co-frequency pair (ID2 >= 0) |
262
|
|
|
|
|
|
|
## + %opts: |
263
|
|
|
|
|
|
|
## i2s => \&CODE, ##-- code-ref for formatting indices; called as $s=CODE($i) |
264
|
|
|
|
|
|
|
## i2s1 => \&CODE, ##-- code-ref for formatting item1 indices (overrides 'i2s') |
265
|
|
|
|
|
|
|
## i2s2 => \&CODE, ##-- code-ref for formatting item2 indices (overrides 'i2s') |
266
|
|
|
|
|
|
|
sub saveTextFh { |
267
|
0
|
|
|
0
|
1
|
|
my ($cof,$outfh,%opts) = @_; |
268
|
0
|
0
|
|
|
|
|
$cof->logconfess("saveTextFile(): cannot save unopened DB") if (!$cof->opened); |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
##-- common variables |
271
|
0
|
|
|
|
|
|
my ($r1,$r2) = @$cof{qw(r1 r2)}; |
272
|
0
|
|
|
|
|
|
my $pack_r1 = $r1->{packas}; |
273
|
0
|
|
|
|
|
|
my $pack_r2 = $r2->{packas}; |
274
|
0
|
|
|
|
|
|
my $i2s = $opts{i2s}; |
275
|
0
|
0
|
|
|
|
|
my $i2s1 = exists($opts{i2s1}) ? $opts{i2s1} : $i2s; |
276
|
0
|
0
|
|
|
|
|
my $i2s2 = exists($opts{i2s2}) ? $opts{i2s2} : $i2s; |
277
|
0
|
|
|
|
|
|
my ($fh1,$fh2) = ($r1->{fh},$r2->{fh}); |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
##-- iteration variables |
280
|
0
|
|
|
|
|
|
my ($buf1,$i1,$s1,$f1,$end2); |
281
|
0
|
|
|
|
|
|
my ($buf2,$off2,$i2,$s2,$f12,$f12sum); |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
##-- ye olde loope |
284
|
0
|
|
|
|
|
|
binmode($outfh,':raw'); |
285
|
0
|
|
|
|
|
|
$outfh->print($cof->{N}, "\n"); |
286
|
0
|
|
|
|
|
|
for ($r1->seek($i1=0), $r2->seek($off2=0); !$r1->eof(); ++$i1) { |
287
|
0
|
0
|
|
|
|
|
$r1->read(\$buf1) or $cof->logconfess("saveTextFile(): failed to read record $i1 from $r1->{file}: $!"); |
288
|
0
|
|
|
|
|
|
($end2,$f1) = unpack($pack_r1,$buf1); |
289
|
0
|
0
|
|
|
|
|
$s1 = $i2s1 ? $i2s1->($i1) : $i1; |
290
|
|
|
|
|
|
|
|
291
|
0
|
|
0
|
|
|
|
for ($f12sum=0; $off2 < $end2 && !$r2->eof(); ++$off2) { |
292
|
0
|
0
|
|
|
|
|
$r2->read(\$buf2) or $cof->logconfess("saveTextFile(): failed to read record $off2 from $r2->{file}: $!"); |
293
|
0
|
|
|
|
|
|
($i2,$f12) = unpack($pack_r2,$buf2); |
294
|
0
|
|
|
|
|
|
$f12sum += $f12; |
295
|
0
|
0
|
|
|
|
|
$s2 = $i2s2 ? $i2s2->($i2) : $i2; |
296
|
0
|
|
|
|
|
|
$outfh->print(join("\t", $f12, $s1,$s2), "\n"); |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
##-- track un-collocated portion of $f1, if any |
300
|
0
|
0
|
|
|
|
|
$outfh->print(join("\t", $f1-$f12sum, $s1), "\n") if ($f12sum != $f1); |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
0
|
|
|
|
|
|
return $cof; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
##============================================================================== |
307
|
|
|
|
|
|
|
## Relation API: create |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->create($coldb,$tokdat_file,%opts) |
310
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
311
|
|
|
|
|
|
|
## a tt-style text file containing 1 token-id perl line with optional blank lines |
312
|
|
|
|
|
|
|
## + %opts: clobber %$ug, also: |
313
|
|
|
|
|
|
|
## ( |
314
|
|
|
|
|
|
|
## size=>$size, ##-- set initial size (number of types) |
315
|
|
|
|
|
|
|
## ) |
316
|
|
|
|
|
|
|
## + DISABLED |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
##============================================================================== |
319
|
|
|
|
|
|
|
## Relation API: union |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
## $cof = CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
322
|
|
|
|
|
|
|
## + merge multiple unigram unigram indices from \@pairs into new object |
323
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$cof,\@xi2u],...) |
324
|
|
|
|
|
|
|
## of unigram-objects $cof and tuple-id maps \@xi2u for $cof |
325
|
|
|
|
|
|
|
## - \@xi2u may also be a mapping object supporting a toArray() method |
326
|
|
|
|
|
|
|
## + %opts: clobber %$cof |
327
|
|
|
|
|
|
|
## + implicitly flushes the new index |
328
|
|
|
|
|
|
|
## + DISABLED |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
##============================================================================== |
331
|
|
|
|
|
|
|
## Relation API: dbinfo |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
334
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
335
|
|
|
|
|
|
|
sub dbinfo { |
336
|
0
|
|
|
0
|
0
|
|
my $cof = shift; |
337
|
0
|
|
|
|
|
|
my $info = $cof->SUPER::dbinfo(); |
338
|
0
|
|
|
|
|
|
@$info{qw(fmin dmax size1 size2 N)} = @$cof{qw(fmin dmax size1 size2 N)}; |
339
|
0
|
|
|
|
|
|
return $info; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
##============================================================================== |
344
|
|
|
|
|
|
|
## Utilities: lookup |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
## $f = $cof->f1( @ids) |
347
|
|
|
|
|
|
|
## $f = $cof->f1(\@ids) |
348
|
|
|
|
|
|
|
## + get total marginal unigram frequency (db must be opened) |
349
|
|
|
|
|
|
|
sub f1 { |
350
|
0
|
|
|
0
|
0
|
|
my $cof = shift; |
351
|
0
|
0
|
|
|
|
|
my $ids = UNIVERSAL::isa($_[0],'ARRAY') ? @{$_[0]} : \@_; |
|
0
|
|
|
|
|
|
|
352
|
0
|
|
|
|
|
|
my $r1 = $cof->{r1}; |
353
|
0
|
|
|
|
|
|
my $f = 0; |
354
|
0
|
|
|
|
|
|
foreach (@$ids) { |
355
|
0
|
|
|
|
|
|
$f += $r1->fetch($_)->[1]; |
356
|
|
|
|
|
|
|
} |
357
|
0
|
|
|
|
|
|
return $f; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
## $f12 = $cof->f12($id1,$id2) |
361
|
|
|
|
|
|
|
## + return joint frequency for pair ($id1,$id2) |
362
|
|
|
|
|
|
|
## + UNUSED |
363
|
|
|
|
|
|
|
sub f12 { |
364
|
0
|
|
|
0
|
0
|
|
my ($cof,$i1,$i2) = @_; |
365
|
0
|
0
|
|
|
|
|
my $beg2 = ($i1==0 ? 0 : $cof->{r1}->fetch($i1-1)->[0]); |
366
|
0
|
|
|
|
|
|
my $end2 = $cof->{r1}->fetch($i1)->[0]; |
367
|
0
|
|
|
|
|
|
my $pos2 = $cof->{r2}->bsearch($i2, lo=>$beg2, hi=>$end2, packas=>$cof->{pack_i}); |
368
|
0
|
0
|
|
|
|
|
return defined($pos2) ? $cof->{r2}->fetch($pos2)->[1] : 0; |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
##============================================================================== |
372
|
|
|
|
|
|
|
## Relation API: default: profiling |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
## $prf = $cof->subprofile1(\@xids,%opts) |
375
|
|
|
|
|
|
|
## + get joint co-frequency profile for @xids (db must be opened; f1 and f12 only) |
376
|
|
|
|
|
|
|
## + %opts: |
377
|
|
|
|
|
|
|
## groupby => \&gbsub, ##-- key-extractor $key2_or_undef = $gbsub->($i2) |
378
|
|
|
|
|
|
|
## coldb => $coldb, ##-- for debugging |
379
|
|
|
|
|
|
|
## onepass => $bool, ##-- use fast but incorrect 1-pass method? |
380
|
|
|
|
|
|
|
sub subprofile1 { |
381
|
0
|
|
|
0
|
1
|
|
my ($cof,$ids,%opts) = @_; |
382
|
|
|
|
|
|
|
|
383
|
0
|
0
|
|
|
|
|
$ids = [$ids] if (!UNIVERSAL::isa($ids,'ARRAY')); |
384
|
0
|
|
|
|
|
|
my $r1 = $cof->{r1}; |
385
|
0
|
|
|
|
|
|
my $r2 = $cof->{r2}; |
386
|
0
|
|
|
|
|
|
my $pack1 = $r1->{packas}; |
387
|
0
|
|
|
|
|
|
my $pack2 = $r2->{packas}; |
388
|
0
|
|
|
|
|
|
my $pack1i = $cof->{pack_i}; |
389
|
0
|
|
|
|
|
|
my $pack1f = "@".packsize($cof->{pack_i}).$cof->{pack_f}; |
390
|
0
|
|
0
|
|
|
|
my $size1 = $cof->{size1} // ($cof->{size1}=$r1->size); |
391
|
0
|
|
0
|
|
|
|
my $size2 = $cof->{size2} // ($cof->{size2}=$r2->size); |
392
|
0
|
|
|
|
|
|
my $groupby = $opts{groupby}; |
393
|
0
|
|
|
|
|
|
my $pack_id = $opts{coldb}{pack_id}; |
394
|
0
|
|
|
|
|
|
my $onepass = $opts{onepass}; |
395
|
0
|
|
|
|
|
|
my $pf1 = 0; |
396
|
0
|
|
|
|
|
|
my $pf12 = {}; |
397
|
0
|
|
|
|
|
|
my $pf2 = {}; |
398
|
0
|
|
|
|
|
|
my ($i1,$i2,$key2, $beg2,$end2,$pos2, $f1,$f12, $buf, %i2); |
399
|
|
|
|
|
|
|
|
400
|
0
|
|
|
|
|
|
foreach $i1 (@$ids) { |
401
|
0
|
0
|
|
|
|
|
next if ($i1 >= $size1); |
402
|
0
|
0
|
|
|
|
|
$beg2 = ($i1==0 ? 0 : unpack($pack1i,$r1->fetchraw($i1-1,\$buf))); |
403
|
0
|
|
|
|
|
|
($end2,$f1) = unpack($pack1, $r1->fetchraw($i1,\$buf)); |
404
|
|
|
|
|
|
|
|
405
|
0
|
|
|
|
|
|
$pf1 += $f1; |
406
|
0
|
0
|
|
|
|
|
next if ($beg2 >= $size2); |
407
|
0
|
|
|
|
|
|
for ($r2->seek($beg2), $pos2=$beg2; $pos2 < $end2; ++$pos2) { |
408
|
0
|
0
|
|
|
|
|
$r2->getraw(\$buf) or last; |
409
|
0
|
|
|
|
|
|
($i2,$f12) = unpack($pack2, $buf); |
410
|
0
|
0
|
|
|
|
|
$key2 = $groupby ? $groupby->($i2) : pack($pack_id,$i2); |
411
|
0
|
0
|
|
|
|
|
next if (!defined($key2)); ##-- item2 selection via groupby CODE-ref |
412
|
0
|
|
|
|
|
|
$pf12->{$key2} += $f12; |
413
|
0
|
0
|
0
|
|
|
|
if ($onepass && !exists($i2{$i2})) { |
414
|
0
|
|
|
|
|
|
$pf2->{$key2} += unpack($pack1f, $r1->fetchraw($i2,\$buf)); ##-- avoid double-counting f2 for shared collocates |
415
|
0
|
|
|
|
|
|
$i2{$i2} = undef; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
return DiaColloDB::Profile->new( |
420
|
|
|
|
|
|
|
N=>$cof->{N}, |
421
|
0
|
|
|
|
|
|
f1=>$pf1, |
422
|
|
|
|
|
|
|
f2=>$pf2, |
423
|
|
|
|
|
|
|
f12=>$pf12, |
424
|
|
|
|
|
|
|
); |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf, %opts) |
428
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
429
|
|
|
|
|
|
|
## + %opts: |
430
|
|
|
|
|
|
|
## groupby => \%gbreq, ##-- parsed groupby object |
431
|
|
|
|
|
|
|
## a2data => \%a2data, ##-- maps indexed attributes to associated datastructures |
432
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
433
|
|
|
|
|
|
|
## ... ##-- other options as for profile(), esp. qw(slice) |
434
|
|
|
|
|
|
|
sub subprofile2 { |
435
|
0
|
|
|
0
|
1
|
|
my ($cof,$slice2prf,%opts) = @_; |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
##-- vars: common |
438
|
0
|
|
|
|
|
|
my $coldb = $opts{coldb}; |
439
|
0
|
|
|
|
|
|
my $groupby = $opts{groupby}; |
440
|
0
|
|
|
|
|
|
my $a2data = $opts{a2data}; |
441
|
0
|
|
|
|
|
|
my $slice = $opts{slice}; |
442
|
|
|
|
|
|
|
#my $slices = $opts{slices} || [sort {$a<=>$b} keys %$slice2prf]; |
443
|
0
|
|
|
|
|
|
my ($dfilter,$slo,$shi,$dlo,$dhi) = $coldb->parseDateRequest(@opts{qw(date slice fill)}); |
444
|
0
|
|
0
|
|
|
|
my $filter_by_date = $slice || defined($dlo) || defined($dhi); |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
##-- vars: relation-wise |
447
|
0
|
|
|
|
|
|
my $r1 = $cof->{r1}; |
448
|
0
|
|
|
|
|
|
my $pack_r1f = '@'.packsize($cof->{pack_i}).$cof->{pack_f}; |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
##-- get "most specific projected attribute" ("MSPA"): that projected attribute with largest enum |
451
|
|
|
|
|
|
|
#my $gb1 = scalar(@{$groupby->{attrs}})==1; ##-- are we grouping by a single attribute? -->optimize! |
452
|
0
|
|
|
|
|
|
my $mspai = (sort {$b->[1]<=>$a->[1]} map {[$_,$a2data->{$groupby->{attrs}[$_]}{enum}->size]} (0..$#{$groupby->{attrs}}))[0][0]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
453
|
0
|
|
|
|
|
|
my $mspa = $groupby->{attrs}[$mspai]; |
454
|
0
|
|
|
|
|
|
my $mspgpack = $groupby->{gpack}[$mspai]; |
455
|
0
|
|
|
|
|
|
my $mspxpack = $groupby->{xpack}[$mspai]; |
456
|
0
|
|
|
|
|
|
my $msp2x = $a2data->{$mspa}{a2x}; |
457
|
0
|
|
|
|
|
|
my %mspv = qw(); ##-- checked MSPA-values ($mspvi) |
458
|
0
|
|
|
|
|
|
my $xenum = $coldb->{xenum}; |
459
|
0
|
|
|
|
|
|
my $pack_xd = "@".(packsize($coldb->{pack_id}) * scalar(@{$coldb->{attrs}})).$coldb->{pack_date}; |
|
0
|
|
|
|
|
|
|
460
|
0
|
|
|
|
|
|
my $xs2g = $groupby->{xs2g}; |
461
|
0
|
|
|
|
|
|
my $debug_xp2g = join('',@{$groupby->{xpack}}); |
|
0
|
|
|
|
|
|
|
462
|
0
|
|
|
|
|
|
my $debug_gpack= "($coldb->{pack_id})*"; |
463
|
0
|
|
|
|
|
|
my ($prf,$pf12, $mspvi,$i2,$x2,$d2,$ds2,$prf2,$key2, $buf,$f2); |
464
|
0
|
0
|
|
|
|
|
$prf2 = (values %$slice2prf)[0] if (!$filter_by_date); |
465
|
0
|
|
|
|
|
|
foreach $prf (values %$slice2prf) { |
466
|
0
|
|
|
|
|
|
$pf12 = $prf->{f12}; |
467
|
0
|
|
|
|
|
|
foreach (keys %$pf12) { |
468
|
0
|
|
|
|
|
|
$mspvi = unpack($mspgpack,$_); |
469
|
0
|
0
|
|
|
|
|
next if (exists $mspv{$mspvi}); |
470
|
0
|
|
|
|
|
|
$mspv{$mspvi} = undef; |
471
|
0
|
|
|
|
|
|
foreach $i2 (@{$msp2x->fetch($mspvi)}) { |
|
0
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
##-- get item2 x-tuple |
473
|
0
|
|
|
|
|
|
$x2 = $xenum->i2s($i2); |
474
|
|
|
|
|
|
|
|
475
|
0
|
0
|
|
|
|
|
if ($filter_by_date) { |
476
|
|
|
|
|
|
|
##-- extract item2 date slice |
477
|
0
|
|
|
|
|
|
$d2 = unpack($pack_xd, $x2); |
478
|
0
|
0
|
|
|
|
|
$ds2 = $slice ? int($d2/$slice)*$slice : 0; |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
##-- ignore if item2 slice isn't in our target range |
481
|
0
|
0
|
0
|
|
|
|
next if (!defined($prf2=$slice2prf->{$ds2}) |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
482
|
|
|
|
|
|
|
|| (defined($dlo) && $d2 < $dlo) |
483
|
|
|
|
|
|
|
|| (defined($dhi) && $d2 > $dhi)); |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
##-- get groupby-key from x-tuple string & check for item2 membership in the appropriate slice-profile |
487
|
0
|
0
|
|
|
|
|
$key2 = $xs2g ? $xs2g->($x2) : pack($mspgpack, $i2); |
488
|
|
|
|
|
|
|
#$key2 = pack($debug_gpack, unpack($debug_xp2g, $x2)); ##-- ca. 6% faster than $xs2g, no having-checks |
489
|
|
|
|
|
|
|
#$key2 = pack($debug_gpack, $mspvi); ##-- ca. 12% faster than $xs2g, no having-checks, only valid for groupby single-attribute |
490
|
0
|
0
|
0
|
|
|
|
next if (!defined($key2) || !exists($prf2->{f12}{$key2})); ##-- having()-failure or no item2 in target slice |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
##-- add item2 frequency |
493
|
0
|
|
|
|
|
|
$f2 = unpack($pack_r1f, $r1->fetchraw($i2,\$buf)); |
494
|
0
|
|
|
|
|
|
$prf2->{f2}{$key2} += $f2; |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
|
return $slice2prf; |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
##============================================================================== |
504
|
|
|
|
|
|
|
## Relation API: default: query info |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
507
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
508
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
509
|
|
|
|
|
|
|
## ( |
510
|
|
|
|
|
|
|
## qreqs => \@qreqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
511
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
512
|
|
|
|
|
|
|
## ) |
513
|
|
|
|
|
|
|
sub qinfo { |
514
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
515
|
0
|
|
|
|
|
|
my ($q1strs,$q2strs,$qxstrs,$fstrs) = $rel->qinfoData($coldb,%opts); |
516
|
|
|
|
|
|
|
|
517
|
0
|
0
|
|
|
|
|
my $q1str = '('.(@$q1strs ? join(' WITH ', @$q1strs,@$qxstrs) : '*').') =1'; |
518
|
0
|
0
|
|
|
|
|
my $q2str = '('.(@$q2strs ? join(' WITH ', @$q2strs,@$qxstrs) : '*').') =2'; |
519
|
|
|
|
|
|
|
my $qstr = ( |
520
|
|
|
|
|
|
|
#"$q1str && $q2str" ##-- approximate with &&-query (especially buggy since #sep doesn't work right here; see mantis bug #654) |
521
|
0
|
0
|
|
|
|
|
"NEAR( $q1str, $q2str, ".(2*($rel->{dmax}-1)).")" |
522
|
|
|
|
|
|
|
.' #SEPARATE' |
523
|
|
|
|
|
|
|
.(@$fstrs ? (' '.join(' ',@$fstrs)) : ''), |
524
|
|
|
|
|
|
|
); |
525
|
|
|
|
|
|
|
return { |
526
|
|
|
|
|
|
|
fcoef => 2*$rel->{dmax}, |
527
|
0
|
|
|
|
|
|
qtemplate => $qstr, |
528
|
|
|
|
|
|
|
}; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
##============================================================================== |
532
|
|
|
|
|
|
|
## Pacakge Alias(es) |
533
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Cofreqs; |
534
|
1
|
|
|
1
|
|
10
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
48
|
|
535
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Compat::v0_09::Relation::Cofreqs); |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
##============================================================================== |
538
|
|
|
|
|
|
|
## Footer |
539
|
|
|
|
|
|
|
1; |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
__END__ |