| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Relation::Cofreqs.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, profiling relation: co-frequency database (using pair of DiaColloDB::PackedFile) |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Relation::Cofreqs; |
|
7
|
1
|
|
|
1
|
|
11
|
use DiaColloDB::Compat; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
40
|
|
|
8
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Relation; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
20
|
|
|
9
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::PackedFile; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
31
|
|
|
10
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::PackedFile::MMap; |
|
|
1
|
|
|
|
|
16
|
|
|
|
1
|
|
|
|
|
36
|
|
|
11
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Utils qw(:fcntl :env :run :json :pack :temp :sort :jobs); |
|
|
1
|
|
|
|
|
13
|
|
|
|
1
|
|
|
|
|
64
|
|
|
12
|
1
|
|
|
1
|
|
515
|
use Fcntl qw(:DEFAULT :seek); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
33
|
|
|
13
|
1
|
|
|
1
|
|
502
|
use File::Basename qw(dirname); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
14
|
1
|
|
|
1
|
|
131
|
use version; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
10
|
|
|
15
|
1
|
|
|
1
|
|
88
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
5652
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
##============================================================================== |
|
18
|
|
|
|
|
|
|
## Globals & Constants |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Relation); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
## $PFCLASS : object class for nested PackedFile objects |
|
23
|
|
|
|
|
|
|
our $PFCLASS = 'DiaColloDB::PackedFile::MMap'; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
## $WANT_XS : try to use DiaColloDB::XS::CofUtils methods (default:undef -> if available)? |
|
26
|
|
|
|
|
|
|
our $WANT_XS = undef; |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
##============================================================================== |
|
29
|
|
|
|
|
|
|
## Constructors etc. |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
## $cof = CLASS_OR_OBJECT->new(%args) |
|
32
|
|
|
|
|
|
|
## + %args, object structure: |
|
33
|
|
|
|
|
|
|
## ( |
|
34
|
|
|
|
|
|
|
## ##-- user options |
|
35
|
|
|
|
|
|
|
## class => $class, ##-- optional, useful for debugging from header file |
|
36
|
|
|
|
|
|
|
## base => $basename, ##-- file basename (default=undef:none); use files "${base}.dba1", "${base}.dba2", "${base}.dba3", "${base}.dbaN", "${base}.hdr" |
|
37
|
|
|
|
|
|
|
## flags => $flags, ##-- fcntl flags or open-mode (default='r') |
|
38
|
|
|
|
|
|
|
## perms => $perms, ##-- creation permissions (default=(0666 &~umask)) |
|
39
|
|
|
|
|
|
|
## dmax => $dmax, ##-- maximum distance for co-occurrences (default=5) |
|
40
|
|
|
|
|
|
|
## fmin => $fmin, ##-- minimum pair frequency (default=0) |
|
41
|
|
|
|
|
|
|
## pack_i => $pack_i, ##-- pack-template for IDs (default='N') |
|
42
|
|
|
|
|
|
|
## pack_f => $pack_f, ##-- pack-template for frequencies (default='N') |
|
43
|
|
|
|
|
|
|
## pack_d => $pack_d, ##-- pack-tempalte for dates (default='n') |
|
44
|
|
|
|
|
|
|
## keeptmp => $bool, ##-- keep temporary files? (default=false) |
|
45
|
|
|
|
|
|
|
## logCompat => $level, ##-- log-level for compatibility warnings (default='warn') |
|
46
|
|
|
|
|
|
|
## logXS => $level, ##-- log-level for XS usage (default='trace') |
|
47
|
|
|
|
|
|
|
## ## |
|
48
|
|
|
|
|
|
|
## ##-- size info (after open() or load()) |
|
49
|
|
|
|
|
|
|
## size1 => $size1, ##-- == $r1->size() |
|
50
|
|
|
|
|
|
|
## size2 => $size2, ##-- == $r2->size() |
|
51
|
|
|
|
|
|
|
## size3 => $size3, ##-- == $r3->size() |
|
52
|
|
|
|
|
|
|
## sizeN => $sizeN, ##-- == $rN->size() |
|
53
|
|
|
|
|
|
|
## ## |
|
54
|
|
|
|
|
|
|
## ##-- low-level data |
|
55
|
|
|
|
|
|
|
## r1 => $r1, ##-- pf: [$end2] @ $i1 : constant (logical index) |
|
56
|
|
|
|
|
|
|
## r2 => $r2, ##-- pf: [$end3,$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) : sorted by $d1 for each $i1 |
|
57
|
|
|
|
|
|
|
## r3 => $r3, ##-- pf: [$i2,$f12]* @ end3($d1-1)..(end3($d1+1)-1) : sorted by $i2 for each ($i1,$d1) |
|
58
|
|
|
|
|
|
|
## rN => $rN, ##-- pf: [$fN] @ $date - $ymin : totals by date |
|
59
|
|
|
|
|
|
|
## ymin => $dmin, ##-- constant == $coldb->{xdmin} |
|
60
|
|
|
|
|
|
|
## N => $N, ##-- sum($f12) [always used for version <= 0.11; used here only for slice==0] |
|
61
|
|
|
|
|
|
|
## version => $version, ##-- file version, for compatibility checks |
|
62
|
|
|
|
|
|
|
## ) |
|
63
|
|
|
|
|
|
|
sub new { |
|
64
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
65
|
0
|
|
0
|
|
|
|
my $cof = bless({ |
|
66
|
|
|
|
|
|
|
base =>undef, |
|
67
|
|
|
|
|
|
|
flags =>'r', |
|
68
|
|
|
|
|
|
|
perms =>(0666 & ~umask), |
|
69
|
|
|
|
|
|
|
dmax =>5, |
|
70
|
|
|
|
|
|
|
fmin =>0, |
|
71
|
|
|
|
|
|
|
pack_i=>'N', |
|
72
|
|
|
|
|
|
|
pack_f=>'N', |
|
73
|
|
|
|
|
|
|
pack_d=>'n', |
|
74
|
|
|
|
|
|
|
N => 0, |
|
75
|
|
|
|
|
|
|
version => $DiaColloDB::VERSION, |
|
76
|
|
|
|
|
|
|
logCompat => 'warn', |
|
77
|
|
|
|
|
|
|
logXS => 'trace', |
|
78
|
|
|
|
|
|
|
#njobs => 0, ##-- use $DiaColloDB::NJOBS |
|
79
|
|
|
|
|
|
|
@_ |
|
80
|
|
|
|
|
|
|
}, (ref($that)||$that)); |
|
81
|
0
|
|
0
|
|
|
|
$cof->{$_} //= $cof->mmclass($PFCLASS)->new() foreach (qw(r1 r2 r3 rN)); |
|
82
|
0
|
|
|
|
|
|
$cof->{class} = ref($cof); |
|
83
|
0
|
0
|
|
|
|
|
return $cof->open() if (defined($cof->{base})); |
|
84
|
0
|
|
|
|
|
|
return $cof; |
|
85
|
|
|
|
|
|
|
} |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
sub DESTROY { |
|
88
|
0
|
0
|
|
0
|
|
|
$_[0]->close() if ($_[0]->opened); |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
##============================================================================== |
|
92
|
|
|
|
|
|
|
## I/O |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
95
|
|
|
|
|
|
|
## I/O: open/close |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
## $cof_or_undef = $cof->open($base,$flags) |
|
98
|
|
|
|
|
|
|
## $cof_or_undef = $cof->open($base) |
|
99
|
|
|
|
|
|
|
## $cof_or_undef = $cof->open() |
|
100
|
|
|
|
|
|
|
sub open { |
|
101
|
0
|
|
|
0
|
1
|
|
my ($cof,$base,$flags) = @_; |
|
102
|
0
|
|
0
|
|
|
|
$base //= $cof->{base}; |
|
103
|
0
|
|
0
|
|
|
|
$flags //= $cof->{flags}; |
|
104
|
0
|
0
|
|
|
|
|
$cof->close() if ($cof->opened); |
|
105
|
0
|
|
|
|
|
|
$cof->{base} = $base; |
|
106
|
0
|
|
|
|
|
|
$cof->{flags} = $flags = fcflags($flags); |
|
107
|
0
|
|
|
|
|
|
my ($hdr); ##-- save header, for version-checking |
|
108
|
0
|
0
|
0
|
|
|
|
if (fcread($flags) && !fctrunc($flags)) { |
|
109
|
0
|
0
|
|
|
|
|
$hdr = $cof->readHeader() |
|
110
|
|
|
|
|
|
|
or $cof->logconess("failed to read header data from '$cof->{base}.hdr': $!"); |
|
111
|
0
|
0
|
|
|
|
|
$cof->loadHeaderData($hdr) |
|
112
|
|
|
|
|
|
|
or $cof->logconess("failed to load header data from '$cof->{base}.hdr': $!"); |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
##-- check compatibility |
|
116
|
0
|
|
|
|
|
|
my $min_version = qv(0.12.000); |
|
117
|
0
|
0
|
0
|
|
|
|
if ($hdr && (!defined($hdr->{version}) || version->parse($hdr->{version}) < $min_version)) { |
|
|
|
|
0
|
|
|
|
|
|
118
|
0
|
|
|
|
|
|
$cof->vlog($cof->{logCompat}, "using v0.11 compatibility mode for $cof->{base}.*; consider running \`dcdb-upgrade.perl ", dirname($cof->{base}), "\'"); |
|
119
|
0
|
|
|
|
|
|
DiaColloDB::Compat->usecompat('v0_11'); |
|
120
|
0
|
|
|
|
|
|
bless($cof, 'DiaColloDB::Compat::v0_11::Relation::Cofreqs'); |
|
121
|
0
|
|
|
|
|
|
$cof->{version} = $hdr->{version}; |
|
122
|
0
|
|
|
|
|
|
return $cof->open($base,$flags); |
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
##-- open low-level data structures |
|
126
|
0
|
0
|
|
|
|
|
$cof->{r1}->open("$base.dba1", $flags, perms=>$cof->{perms}, packas=>"$cof->{pack_i}") |
|
127
|
|
|
|
|
|
|
or $cof->logconfess("open failed for $base.dba1: $!"); |
|
128
|
0
|
0
|
|
|
|
|
$cof->{r2}->open("$base.dba2", $flags, perms=>$cof->{perms}, packas=>"$cof->{pack_i}$cof->{pack_d}$cof->{pack_f}") |
|
129
|
|
|
|
|
|
|
or $cof->logconfess("open failed for $base.dba2: $!"); |
|
130
|
0
|
0
|
|
|
|
|
$cof->{r3}->open("$base.dba3", $flags, perms=>$cof->{perms}, packas=>"$cof->{pack_i}$cof->{pack_f}") |
|
131
|
|
|
|
|
|
|
or $cof->logconfess("open failed for $base.dba3: $!"); |
|
132
|
0
|
0
|
|
|
|
|
$cof->{rN}->open("$base.dbaN", $flags, perms=>$cof->{perms}, packas=>"$cof->{pack_f}") |
|
133
|
|
|
|
|
|
|
or $cof->logconfess("open failed for $base.dbaN: $!"); |
|
134
|
0
|
|
|
|
|
|
$cof->{size1} = $cof->{r1}->size; |
|
135
|
0
|
|
|
|
|
|
$cof->{size2} = $cof->{r2}->size; |
|
136
|
0
|
|
|
|
|
|
$cof->{size3} = $cof->{r3}->size; |
|
137
|
0
|
|
|
|
|
|
$cof->{sizeN} = $cof->{rN}->size; |
|
138
|
|
|
|
|
|
|
|
|
139
|
0
|
|
|
|
|
|
return $cof; |
|
140
|
|
|
|
|
|
|
} |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
## $cof_or_undef = $cof->close() |
|
143
|
|
|
|
|
|
|
sub close { |
|
144
|
0
|
|
|
0
|
1
|
|
my $cof = shift; |
|
145
|
0
|
0
|
0
|
|
|
|
if ($cof->opened && fcwrite($cof->{flags})) { |
|
146
|
0
|
0
|
|
|
|
|
$cof->saveHeader() or return undef; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
0
|
0
|
|
|
|
|
$cof->{r1}->close() or return undef; |
|
149
|
0
|
0
|
|
|
|
|
$cof->{r2}->close() or return undef; |
|
150
|
0
|
0
|
|
|
|
|
$cof->{r3}->close() or return undef; |
|
151
|
0
|
0
|
|
|
|
|
$cof->{rN}->close() or return undef; |
|
152
|
0
|
|
|
|
|
|
undef $cof->{base}; |
|
153
|
0
|
|
|
|
|
|
return $cof; |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
## $bool = $cof->opened() |
|
157
|
|
|
|
|
|
|
sub opened { |
|
158
|
0
|
|
|
0
|
1
|
|
my $cof = shift; |
|
159
|
|
|
|
|
|
|
return |
|
160
|
|
|
|
|
|
|
(defined($cof->{base}) |
|
161
|
|
|
|
|
|
|
&& defined($cof->{r1}) && $cof->{r1}->opened |
|
162
|
|
|
|
|
|
|
&& defined($cof->{r2}) && $cof->{r2}->opened |
|
163
|
|
|
|
|
|
|
&& defined($cof->{r3}) && $cof->{r3}->opened |
|
164
|
|
|
|
|
|
|
&& defined($cof->{rN}) && $cof->{rN}->opened |
|
165
|
0
|
|
0
|
|
|
|
); |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
169
|
|
|
|
|
|
|
## I/O: header |
|
170
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
## @keys = $cof->headerKeys() |
|
173
|
|
|
|
|
|
|
## + keys to save as header |
|
174
|
|
|
|
|
|
|
sub headerKeys { |
|
175
|
0
|
|
0
|
0
|
1
|
|
return grep {!ref($_[0]{$_}) && $_ !~ m{^(?:base|flags|perms|njobs|log.*)$}} keys %{$_[0]}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
## $bool = $cof->loadHeaderData($hdr) |
|
179
|
|
|
|
|
|
|
## + instantiates header data from $hdr |
|
180
|
|
|
|
|
|
|
## + overrides DiaColloDB::Persistent implementation |
|
181
|
|
|
|
|
|
|
sub loadHeaderData { |
|
182
|
0
|
|
|
0
|
1
|
|
my ($cof,$hdr) = @_; |
|
183
|
0
|
0
|
0
|
|
|
|
if (!defined($hdr) && !fccreat($cof->{flags})) { |
|
|
|
0
|
|
|
|
|
|
|
184
|
0
|
|
|
|
|
|
$cof->logconfess("loadHeaderData() failed to load header data from ", $cof->headerFile, ": $!"); |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
elsif (defined($hdr)) { |
|
187
|
0
|
|
|
|
|
|
return $cof->SUPER::loadHeaderData($hdr); |
|
188
|
|
|
|
|
|
|
} |
|
189
|
0
|
|
|
|
|
|
return $cof; |
|
190
|
|
|
|
|
|
|
} |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
## $bool = $enum->saveHeader() |
|
193
|
|
|
|
|
|
|
## + inherited from DiaColloDB::Persistent |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
196
|
|
|
|
|
|
|
## I/O: text |
|
197
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::Persistent |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
## $bool = $obj->loadTextFile($filename_or_handle, %opts) |
|
200
|
|
|
|
|
|
|
## + wraps loadTextFh() |
|
201
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
## $cof = $cof->loadTextFh($fh,%opts) |
|
204
|
|
|
|
|
|
|
## + loads from text file as saved by saveTextFh(): |
|
205
|
|
|
|
|
|
|
## N ##-- 1 field : N |
|
206
|
|
|
|
|
|
|
## FREQ ID1 DATE ##-- 3 fields: un-collocated portion of $f1 |
|
207
|
|
|
|
|
|
|
## FREQ ID1 DATE ID2 ##-- 4 fields: co-frequency pair (ID2 >= 0) |
|
208
|
|
|
|
|
|
|
## FREQ ID1 DATE ID2 DATE2 ##-- 5 fields: redundant date (used by create(); DATE2 is ignored) |
|
209
|
|
|
|
|
|
|
## + supports semi-sorted input: input fh must be sorted by $i1,$d1 |
|
210
|
|
|
|
|
|
|
## and all $i2 for each $i1,$d1 must be adjacent (i.e. no intervening ($j1,$e1) with $j1 != $i1 or $e1 != $d1) |
|
211
|
|
|
|
|
|
|
## + supports multiple lines for pairs ($i1,$d1,$i2) provided the above conditions hold |
|
212
|
|
|
|
|
|
|
## + supports loading of $cof->{N} from single-value lines |
|
213
|
|
|
|
|
|
|
## + %opts: clobber %$cof |
|
214
|
|
|
|
|
|
|
sub loadTextFh { |
|
215
|
0
|
|
|
0
|
1
|
|
my ($cof,$infh,%opts) = @_; |
|
216
|
0
|
0
|
|
|
|
|
if (!ref($cof)) { |
|
217
|
0
|
|
|
|
|
|
$cof = $cof->new(%opts); |
|
218
|
|
|
|
|
|
|
} else { |
|
219
|
0
|
|
|
|
|
|
@$cof{keys %opts} = values %opts; |
|
220
|
|
|
|
|
|
|
} |
|
221
|
0
|
0
|
|
|
|
|
$cof->logconfess("loadTextFh(): cannot load unopened database!") if (!$cof->opened); |
|
222
|
|
|
|
|
|
|
|
|
223
|
0
|
0
|
|
|
|
|
return ($cof->wantXS |
|
224
|
|
|
|
|
|
|
? $cof->loadTextFhXS($infh,%opts) |
|
225
|
|
|
|
|
|
|
: $cof->loadTextFhPP($infh,%opts)); |
|
226
|
|
|
|
|
|
|
} |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
## $cof = $cof->loadTextFhPP($fh,%opts) |
|
230
|
|
|
|
|
|
|
## + pure-perl guts for loadTextFh() |
|
231
|
|
|
|
|
|
|
sub loadTextFhPP { |
|
232
|
0
|
|
|
0
|
0
|
|
my ($cof,$infh,%opts) = @_; |
|
233
|
0
|
0
|
|
|
|
|
$cof->logconfess("loadTextFhPP(): cannot load unopened database!") if (!$cof->opened); |
|
234
|
0
|
|
|
|
|
|
$cof->vlog($cof->{logXS}, "loadTextFH(): using pure-perl implementation"); |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
##-- common variables |
|
237
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
|
238
|
|
|
|
|
|
|
## $r2 : [$end3,$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
|
239
|
|
|
|
|
|
|
## $r3 : [$i2,$f12]* @ end3($d1-1)..(end3($d1+1)-1) |
|
240
|
0
|
|
0
|
|
|
|
my $fmin = $cof->{fmin} // 0; |
|
241
|
0
|
|
|
|
|
|
my ($r1,$r2,$r3,$rN) = @$cof{qw(r1 r2 r3 rN)}; |
|
242
|
0
|
|
|
|
|
|
my ($pack_r1,$pack_r2,$pack_r3) = map {$_->{packas}} ($r1,$r2,$r3); |
|
|
0
|
|
|
|
|
|
|
|
243
|
0
|
|
|
|
|
|
$r1->truncate(); |
|
244
|
0
|
|
|
|
|
|
$r2->truncate(); |
|
245
|
0
|
|
|
|
|
|
$r3->truncate(); |
|
246
|
0
|
|
|
|
|
|
$rN->truncate(); |
|
247
|
0
|
|
|
|
|
|
my ($fh1,$fh2,$fh3) = ($r1->{fh},$r2->{fh},$r3->{fh}); |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
##-- iteration variables |
|
250
|
0
|
|
|
|
|
|
my ($pos1,$pos2,$pos3) = (0,0,0); |
|
251
|
0
|
|
|
|
|
|
my ($i1_cur,$d1_cur,$f1) = (-1,undef,0); |
|
252
|
0
|
|
|
|
|
|
my ($f12,$i1,$d1,$i2,$d2,$f); |
|
253
|
0
|
|
|
|
|
|
my $N = 0; ##-- total marginal frequency as extracted from %f12 |
|
254
|
0
|
|
|
|
|
|
my $N1 = 0; ##-- total N as extracted from single-element records |
|
255
|
0
|
|
|
|
|
|
my %f12 = qw(); ##-- ($i2=>$f12, ...) for ($i1_cur,$d1_cur) |
|
256
|
0
|
|
|
|
|
|
my %fN = qw(); ##-- ($d=>$Nd, ...) |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
##-- guts for inserting records from $i1_cur,$d1_cur,%f12,$pos1,$pos2 : call on changed ($i1_cur,$d1_cur) |
|
259
|
|
|
|
|
|
|
my $insert = sub { |
|
260
|
0
|
0
|
|
0
|
|
|
if ($i1_cur >= 0) { |
|
261
|
0
|
0
|
|
|
|
|
if ($i1_cur != $pos1) { |
|
262
|
|
|
|
|
|
|
##-- we've skipped one or more $i1 because it had no collocates (e.g. kern01 i1=287123="Untier/1906") |
|
263
|
0
|
|
|
|
|
|
$fh1->print( pack($pack_r1,$pos2) x ($i1_cur-$pos1) ); |
|
264
|
0
|
|
|
|
|
|
$pos1 = $i1_cur; |
|
265
|
|
|
|
|
|
|
} |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
##-- dump r3-records for ($i1_cur,$d1_cur,*) |
|
268
|
0
|
|
|
|
|
|
$f1 = 0; |
|
269
|
0
|
|
|
|
|
|
foreach (sort {$a<=>$b} keys %f12) { |
|
|
0
|
|
|
|
|
|
|
|
270
|
0
|
|
|
|
|
|
$f = $f12{$_}; |
|
271
|
0
|
|
|
|
|
|
$f1 += $f; |
|
272
|
0
|
0
|
0
|
|
|
|
next if ($f < $fmin || $_ < 0); ##-- skip here so we can track "real" marginal frequencies |
|
273
|
0
|
|
|
|
|
|
$fh3->print(pack($pack_r3, $_,$f)); |
|
274
|
0
|
|
|
|
|
|
++$pos3; |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
##-- dump r2-record for ($i1_cur,$d1_cur), and track $fN by date |
|
278
|
0
|
0
|
|
|
|
|
if (defined($d1_cur)) { |
|
279
|
0
|
|
|
|
|
|
$fh2->print(pack($pack_r2, $pos3,$d1_cur,$f1)); |
|
280
|
0
|
|
|
|
|
|
$fN{$d1_cur} += $f1; |
|
281
|
0
|
|
|
|
|
|
++$pos2; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
##-- maybe dump r1-record for $i1_cur |
|
285
|
0
|
0
|
|
|
|
|
if ($i1 != $i1_cur) { |
|
286
|
0
|
|
|
|
|
|
$fh1->print(pack($pack_r1, $pos2)); |
|
287
|
0
|
|
|
|
|
|
$pos1 = $i1_cur+1; |
|
288
|
|
|
|
|
|
|
} |
|
289
|
0
|
|
|
|
|
|
$N += $f1; |
|
290
|
|
|
|
|
|
|
} |
|
291
|
0
|
|
|
|
|
|
$i1_cur = $i1; |
|
292
|
0
|
|
|
|
|
|
$d1_cur = $d1; |
|
293
|
0
|
|
|
|
|
|
%f12 = qw(); |
|
294
|
0
|
|
|
|
|
|
}; |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
##-- ye olde loope |
|
297
|
0
|
|
|
|
|
|
binmode($infh,':raw'); |
|
298
|
0
|
|
|
|
|
|
while (defined($_=<$infh>)) { |
|
299
|
0
|
|
|
|
|
|
chomp; |
|
300
|
0
|
|
|
|
|
|
($f12,$i1,$d1,$i2,$d2) = split(' ',$_,5); |
|
301
|
0
|
0
|
|
|
|
|
if (!defined($i1)) { |
|
|
|
0
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
#$cof->debug("N1 += $f12"); |
|
303
|
0
|
|
|
|
|
|
$N1 += $f12; ##-- load N values |
|
304
|
0
|
|
|
|
|
|
next; |
|
305
|
|
|
|
|
|
|
} |
|
306
|
|
|
|
|
|
|
elsif (!defined($d1)) { |
|
307
|
0
|
|
|
|
|
|
$cof->logconfess("loadTextFhPP(): failed to parse input line ", $infh->input_line_number); |
|
308
|
|
|
|
|
|
|
} |
|
309
|
0
|
0
|
0
|
|
|
|
$insert->() ##-- insert record(s) for ($i1_cur,$d1_cur) |
|
310
|
|
|
|
|
|
|
if ($i1 != $i1_cur || $d1 != $d1_cur); |
|
311
|
0
|
|
0
|
|
|
|
$f12{$i2//-1} += $f12; ##-- buffer co-frequencies for ($i1_cur,$d1_cur); track un-collocated frequencies as $i2=-1 |
|
312
|
|
|
|
|
|
|
} |
|
313
|
0
|
|
|
|
|
|
$i1 = -1; |
|
314
|
0
|
|
|
|
|
|
$insert->(); ##-- write record(s) for final ($i1_cur,$d1_cur) |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
##-- create $rN by date |
|
317
|
0
|
|
|
|
|
|
my @dates = sort {$a<=>$b} keys %fN; |
|
|
0
|
|
|
|
|
|
|
|
318
|
0
|
|
|
|
|
|
my $ymin = $cof->{ymin} = ($dates[0]+0); |
|
319
|
0
|
|
0
|
|
|
|
$rN->{fh}->print(pack("($rN->{packas})*", map {$fN{$_}//0} ($ymin..$dates[$#dates]))); |
|
|
0
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
##-- adopt final $N and sizes |
|
322
|
|
|
|
|
|
|
#$cof->debug("FINAL: N1=$N1, N=$N"); |
|
323
|
0
|
0
|
|
|
|
|
$cof->{N} = $N1>$N ? $N1 : $N; |
|
324
|
0
|
|
|
|
|
|
foreach (qw(1 2 3 N)) { |
|
325
|
0
|
|
|
|
|
|
my $r = $cof->{"r$_"}; |
|
326
|
0
|
|
|
|
|
|
$r->flush(); |
|
327
|
0
|
|
|
|
|
|
$cof->{"size$_"} = $r->size; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
0
|
|
|
|
|
|
return $cof; |
|
331
|
|
|
|
|
|
|
} |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
## $cof = $cof->loadTextFile_create($fh,%opts) |
|
334
|
|
|
|
|
|
|
## + old version of loadTextFile() which doesn't support N, semi-sorted input, or multiple ($i1,$i2) entries |
|
335
|
|
|
|
|
|
|
## + not useable by union() method |
|
336
|
|
|
|
|
|
|
## + obsolete; now just an alias for loadTextFile() |
|
337
|
|
|
|
|
|
|
sub loadTextFile_create { |
|
338
|
0
|
|
|
0
|
1
|
|
my $cof = shift; |
|
339
|
0
|
|
|
|
|
|
return $cof->loadTextFile(@_); |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
## $bool = $obj->saveTextFile($filename_or_handle, %opts) |
|
343
|
|
|
|
|
|
|
## + wraps saveTextFh() |
|
344
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Persistent |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
## $bool = $cof->saveTextFh($fh,%opts) |
|
347
|
|
|
|
|
|
|
## + save from text file with lines of the form: |
|
348
|
|
|
|
|
|
|
## N ##-- 1 field : N |
|
349
|
|
|
|
|
|
|
## FREQ ID1 DATE ##-- 3 fields: un-collocated portion of $f1 |
|
350
|
|
|
|
|
|
|
## FREQ ID1 DATE ID2 ##-- 4 fields: co-frequency pair (ID2 >= 0) |
|
351
|
|
|
|
|
|
|
## + %opts: |
|
352
|
|
|
|
|
|
|
## i2s => \&CODE, ##-- code-ref for formatting indices; called as $s=CODE($i) |
|
353
|
|
|
|
|
|
|
## i2s1 => \&CODE, ##-- code-ref for formatting item1 indices (overrides 'i2s') |
|
354
|
|
|
|
|
|
|
## i2s2 => \&CODE, ##-- code-ref for formatting item2 indices (overrides 'i2s') |
|
355
|
|
|
|
|
|
|
sub saveTextFh { |
|
356
|
0
|
|
|
0
|
1
|
|
my ($cof,$outfh,%opts) = @_; |
|
357
|
0
|
0
|
|
|
|
|
$cof->logconfess("saveTextFile(): cannot save unopened DB") if (!$cof->opened); |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
##-- common variables |
|
360
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
|
361
|
|
|
|
|
|
|
## $r2 : [$end3,$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
|
362
|
|
|
|
|
|
|
## $r3 : [$i2,$f12]* @ end3($d1-1)..(end3($d1+1)-1) |
|
363
|
0
|
|
|
|
|
|
my ($r1,$r2,$r3) = @$cof{qw(r1 r2 r3)}; |
|
364
|
0
|
|
|
|
|
|
my ($pack1,$pack2,$pack3) = map {$_->{packas}} ($r1,$r2,$r3); |
|
|
0
|
|
|
|
|
|
|
|
365
|
0
|
|
|
|
|
|
my $i2s = $opts{i2s}; |
|
366
|
0
|
0
|
|
|
|
|
my $i2s1 = exists($opts{i2s1}) ? $opts{i2s1} : $i2s; |
|
367
|
0
|
0
|
|
|
|
|
my $i2s2 = exists($opts{i2s2}) ? $opts{i2s2} : $i2s; |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
##-- iteration variables |
|
370
|
0
|
|
|
|
|
|
my ($buf1,$i1,$s1,$end2); |
|
371
|
0
|
|
|
|
|
|
my ($buf2,$off2,$end3,$d1,$f1); |
|
372
|
0
|
|
|
|
|
|
my ($buf3,$off3,$i2,$s2,$f12,$f12sum); |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
##-- ye olde loope |
|
375
|
0
|
|
|
|
|
|
binmode($outfh,':raw'); |
|
376
|
0
|
|
|
|
|
|
$outfh->print($cof->{N}, "\n"); |
|
377
|
0
|
|
|
|
|
|
for ($r1->seek($i1=0), $r2->seek($off2=0), $r3->seek($off3=0); !$r1->eof(); ++$i1) { |
|
378
|
0
|
0
|
|
|
|
|
$r1->read(\$buf1) or $cof->logconfess("saveTextFile(): failed to read record $i1 from $r1->{file}: $!"); |
|
379
|
0
|
|
|
|
|
|
$end2 = unpack($pack1,$buf1); |
|
380
|
0
|
0
|
|
|
|
|
$s1 = $i2s1 ? $i2s1->($i1) : $i1; |
|
381
|
|
|
|
|
|
|
|
|
382
|
0
|
|
0
|
|
|
|
for ( ; $off2 < $end2 && !$r2->eof(); ++$off2) { |
|
383
|
0
|
0
|
|
|
|
|
$r2->read(\$buf2) or $cof->logconfess("saveTextFile(): failed to read record $off2 from $r2->{file}: $!"); |
|
384
|
0
|
|
|
|
|
|
($end3,$d1,$f1) = unpack($pack2,$buf2); |
|
385
|
|
|
|
|
|
|
|
|
386
|
0
|
|
0
|
|
|
|
for ($f12sum=0; $off3 < $end3 && !$r3->eof(); ++$off3) { |
|
387
|
0
|
0
|
|
|
|
|
$r3->read(\$buf3) or $cof->logconfess("saveTextFile(): failed to read record $off3 from $r3->{file}: $!"); |
|
388
|
0
|
|
|
|
|
|
($i2,$f12) = unpack($pack3,$buf3); |
|
389
|
0
|
|
|
|
|
|
$f12sum += $f12; |
|
390
|
0
|
0
|
|
|
|
|
$s2 = $i2s2 ? $i2s2->($i2) : $i2; |
|
391
|
0
|
|
|
|
|
|
$outfh->print(join("\t", $f12, $s1, $d1, $s2), "\n"); |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
##-- track un-collocated portion of ($f1,$d1), if any |
|
395
|
0
|
0
|
|
|
|
|
$outfh->print(join("\t", $f1-$f12sum, $s1, $d1), "\n") if ($f12sum != $f1); |
|
396
|
|
|
|
|
|
|
} |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
|
|
399
|
0
|
|
|
|
|
|
return $cof; |
|
400
|
|
|
|
|
|
|
} |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
##============================================================================== |
|
403
|
|
|
|
|
|
|
## Cofreqs API: XS |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
## $bool = $CLASS_OR_OBJECT->wantXS() |
|
406
|
|
|
|
|
|
|
## + attempts to load and import DiaColloDB::XS::CofUtils |
|
407
|
|
|
|
|
|
|
## + returns true iff DiaColloDB::XS::CofUtils is loaded and imported |
|
408
|
|
|
|
|
|
|
sub wantXS { |
|
409
|
0
|
|
|
0
|
0
|
|
my $that = shift; |
|
410
|
0
|
0
|
0
|
|
|
|
if (!defined($WANT_XS) || ($WANT_XS && !$INC{'DiaColloDB/XS/CofUtils.pm'})) { |
|
|
|
|
0
|
|
|
|
|
|
411
|
0
|
|
|
|
|
|
$WANT_XS = eval "use DiaColloDB::XS::CofUtils qw(:cof); 1"; |
|
412
|
0
|
|
|
|
|
|
$@ = ''; |
|
413
|
|
|
|
|
|
|
} |
|
414
|
0
|
|
|
|
|
|
return $WANT_XS; |
|
415
|
|
|
|
|
|
|
} |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
##============================================================================== |
|
419
|
|
|
|
|
|
|
## Relation API: create |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
## $rel = $CLASS_OR_OBJECT->create($coldb,$tokdat_file,%opts) |
|
422
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
|
423
|
|
|
|
|
|
|
## a tt-style text file with lines of the form: |
|
424
|
|
|
|
|
|
|
## TID DATE ##-- single token |
|
425
|
|
|
|
|
|
|
## "\n" ##-- blank line: EOS |
|
426
|
|
|
|
|
|
|
## + %opts: clobber %$ug |
|
427
|
|
|
|
|
|
|
sub create { |
|
428
|
0
|
|
|
0
|
1
|
|
my ($cof,$coldb,$tokfile,%opts) = @_; |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
##-- create/clobber |
|
431
|
0
|
0
|
|
|
|
|
$cof = $cof->new() if (!ref($cof)); |
|
432
|
0
|
|
|
|
|
|
@$cof{keys %opts} = values %opts; |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
##-- ensure openend |
|
435
|
|
|
|
|
|
|
$cof->opened |
|
436
|
|
|
|
|
|
|
or $cof->open(undef,'rw') |
|
437
|
0
|
0
|
0
|
|
|
|
or $cof->logconfess("create(): failed to open co-frequency database '", ($cof->{base}//'-undef-'), "': $!"); |
|
|
|
|
0
|
|
|
|
|
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
##-- stage1: generate pairs |
|
440
|
0
|
|
|
|
|
|
my $datfile = "$cof->{base}.dat"; |
|
441
|
0
|
0
|
|
|
|
|
my $xspp = $cof->wantXS() ? 'XS' : 'PP'; |
|
442
|
0
|
|
0
|
|
|
|
$cof->{dmax} //= 1; |
|
443
|
0
|
|
|
|
|
|
$cof->vlog('trace', "create(): stage1: generate pairs ($xspp, dmax=$cof->{dmax})"); |
|
444
|
0
|
0
|
|
|
|
|
$cof->generatePairs($tokfile, $datfile) |
|
445
|
|
|
|
|
|
|
or $cof->logconfess("create(): failed to generate co-occurrence pairs"); |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
##-- stage2: load pair-frequencies |
|
448
|
0
|
|
|
|
|
|
$cof->vlog('trace', "create(): stage2: load pair frequencies ($xspp, fmin=$cof->{fmin})"); |
|
449
|
0
|
0
|
|
|
|
|
$cof->loadTextFile($datfile) |
|
450
|
|
|
|
|
|
|
or $cof->logconfess("create(): failed to load pair frequencies from $datfile: $!"); |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
##-- stage3: header |
|
453
|
0
|
0
|
|
|
|
|
$cof->saveHeader() |
|
454
|
|
|
|
|
|
|
or $cof->logconfess("create(): failed to save header: $!"); |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
##-- unlink temp file |
|
457
|
0
|
0
|
|
|
|
|
unlink($datfile) if (!$cof->{keeptmp}); |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
##-- done |
|
460
|
0
|
|
|
|
|
|
return $cof; |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
464
|
|
|
|
|
|
|
## Relation API: create: guts |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
## $cof_or_undef = $cof->generatePairs( $tokfile ) |
|
467
|
|
|
|
|
|
|
## $cof_or_undef = $cof->generatePairs( $tokfile, $outfile ) |
|
468
|
|
|
|
|
|
|
## + generates co-occurrence pairs for stage1 of Cofreqs compilation |
|
469
|
|
|
|
|
|
|
## + input: $tokfile : as passed to Cofreqs::create() (= "$dbdir/tokens.dat") |
|
470
|
|
|
|
|
|
|
## + output: $outfile : co-occurrence frequencies (= "$cof->{base}.dat"), as passed to stage2 |
|
471
|
|
|
|
|
|
|
## + really just wraps generatePairsXS() or generatePairsPP() |
|
472
|
|
|
|
|
|
|
sub generatePairs { |
|
473
|
0
|
|
|
0
|
0
|
|
my ($cof,$tokfile,$outfile) = @_; |
|
474
|
0
|
|
0
|
|
|
|
$cof->{dmax} //= 1; |
|
475
|
0
|
0
|
|
|
|
|
$outfile = "$cof->{base}.dat" if (!$outfile); |
|
476
|
0
|
0
|
|
|
|
|
return ($cof->wantXS |
|
477
|
|
|
|
|
|
|
? $cof->generatePairsXS($tokfile,$outfile) |
|
478
|
|
|
|
|
|
|
: $cof->generatePairsPP($tokfile,$outfile)); |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
## $cof_or_undef = $cof->generatePairsPP( $tokfile, $outfile ) |
|
482
|
|
|
|
|
|
|
## + pure-perl implementation of generatePairs() method |
|
483
|
|
|
|
|
|
|
sub generatePairsPP { |
|
484
|
0
|
|
|
0
|
0
|
|
my ($cof,$tokfile,$outfile) = @_; |
|
485
|
0
|
|
0
|
|
|
|
my $dmax = $cof->{dmax} // 1; |
|
486
|
0
|
0
|
|
|
|
|
$outfile = "$cof->{base}.dat" if (!$outfile); |
|
487
|
0
|
|
|
|
|
|
$cof->vlog($cof->{logXS}, "generatePairs(): using pure-perl implementation"); |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
##-- token reader fh |
|
490
|
0
|
0
|
|
|
|
|
CORE::open(my $tokfh, "<$tokfile") |
|
491
|
|
|
|
|
|
|
or $cof->logconfess("create(): open failed for token-file '$tokfile': $!"); |
|
492
|
0
|
|
|
|
|
|
binmode($tokfh,':raw'); |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
##-- temporary output file |
|
495
|
0
|
0
|
|
|
|
|
my $tmpfile = tmpfile("$outfile.tmp", UNLINK=>(!$cof->{keeptmp})) |
|
496
|
|
|
|
|
|
|
or $cof->logconfess("failed to create temp-file '$outfile.tmp': $!"); |
|
497
|
0
|
0
|
|
|
|
|
CORE::open(my $tmpfh, ">$tmpfile") |
|
498
|
|
|
|
|
|
|
or $cof->logconfess("failed to open temp-file '$outfile.tmp': $!"); |
|
499
|
0
|
|
|
|
|
|
binmode($tmpfh,':raw'); |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
##-- stage1: generate pairs |
|
502
|
0
|
|
|
|
|
|
my (@sent,$i,$j,$wi,$wj); |
|
503
|
0
|
|
|
|
|
|
while (!eof($tokfh)) { |
|
504
|
0
|
|
|
|
|
|
@sent = qw(); |
|
505
|
0
|
|
|
|
|
|
while (defined($_=<$tokfh>)) { |
|
506
|
0
|
|
|
|
|
|
chomp; |
|
507
|
0
|
0
|
|
|
|
|
last if (/^$/ ); |
|
508
|
0
|
|
|
|
|
|
push(@sent,$_); |
|
509
|
|
|
|
|
|
|
} |
|
510
|
0
|
0
|
|
|
|
|
next if (!@sent); |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
##-- get pairs |
|
513
|
0
|
|
|
|
|
|
foreach $i (0..$#sent) { |
|
514
|
0
|
|
|
|
|
|
$wi = $sent[$i]; |
|
515
|
|
|
|
|
|
|
print $tmpfh |
|
516
|
0
|
|
|
|
|
|
(map {"$wi\t$sent[$_]\n"} |
|
517
|
0
|
0
|
0
|
|
|
|
grep {$_>=0 && $_<=$#sent && $_ != $i} |
|
|
0
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
(($i-$dmax)..($i+$dmax)) |
|
519
|
|
|
|
|
|
|
); |
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
} |
|
522
|
0
|
0
|
|
|
|
|
CORE::close($tmpfh) |
|
523
|
|
|
|
|
|
|
or $cof->logconfess("close failed for temp-file '$tmpfile': $!"); |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
##-- sort & count |
|
526
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
|
527
|
0
|
0
|
|
|
|
|
runcmd(sortCmd()." -nk1 -nk2 -nk3 $tmpfile | uniq -c - $outfile")==0 |
|
528
|
|
|
|
|
|
|
or $cof->logconfess("create(): open failed for pipe to sort|uniq: $!"); |
|
529
|
0
|
|
|
|
|
|
env_pop(); |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
##-- cleanup |
|
532
|
0
|
0
|
|
|
|
|
CORE::unlink($tmpfile) if (!$cof->{keeptmp}); |
|
533
|
|
|
|
|
|
|
|
|
534
|
0
|
|
|
|
|
|
return $cof; |
|
535
|
|
|
|
|
|
|
} |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
##============================================================================== |
|
538
|
|
|
|
|
|
|
## Relation API: union |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
## $cof = CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
|
542
|
|
|
|
|
|
|
## + merge multiple cofreqs indices from \@pairs into new object |
|
543
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$cof,\@ti2u],...) |
|
544
|
|
|
|
|
|
|
## of cofreqs-objects $cof and tuple-id maps \@ti2u for $cof |
|
545
|
|
|
|
|
|
|
## - \@ti2u may also be a mapping object supporting a toArray() method |
|
546
|
|
|
|
|
|
|
## + %opts: clobber %$cof |
|
547
|
|
|
|
|
|
|
## + implicitly flushes the new index |
|
548
|
|
|
|
|
|
|
sub union { |
|
549
|
0
|
|
|
0
|
1
|
|
my ($cof,$coldb,$pairs,%opts) = @_; |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
##-- create/clobber |
|
552
|
0
|
0
|
|
|
|
|
$cof = $cof->new() if (!ref($cof)); |
|
553
|
0
|
|
|
|
|
|
@$cof{keys %opts} = values %opts; |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
##-- tempfile (input for sort) |
|
556
|
0
|
|
|
|
|
|
my $tmpfile = "$cof->{base}.udat"; |
|
557
|
0
|
0
|
|
|
|
|
my $tmpfh = IO::File->new(">$tmpfile") |
|
558
|
|
|
|
|
|
|
or $cof->logconfess("union(): open failed for tempfile $tmpfile: $!"); |
|
559
|
0
|
|
|
|
|
|
binmode($tmpfh,':raw'); |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
##-- union stage1: extract pairs and N |
|
562
|
0
|
|
|
|
|
|
$cof->vlog('trace', "union(): stage1: collect pairs"); |
|
563
|
0
|
|
|
|
|
|
my ($pair,$pcof,$pi2u); |
|
564
|
0
|
|
|
|
|
|
my $pairi=0; |
|
565
|
0
|
|
|
|
|
|
foreach $pair (@$pairs) { |
|
566
|
0
|
|
|
|
|
|
($pcof,$pi2u) = @$pair; |
|
567
|
0
|
0
|
|
|
|
|
$pi2u = $pi2u->toArray() if (UNIVERSAL::can($pi2u,'toArray')); |
|
568
|
0
|
|
|
0
|
|
|
$pcof->saveTextFh($tmpfh, i2s=>sub {$pi2u->[$_[0]]}) |
|
569
|
0
|
0
|
|
|
|
|
or $cof->logconfess("union(): failed to extract pairs for argument $pairi"); |
|
570
|
0
|
|
|
|
|
|
++$pairi; |
|
571
|
|
|
|
|
|
|
} |
|
572
|
|
|
|
|
|
|
$tmpfh->close() |
|
573
|
0
|
0
|
|
|
|
|
or $cof->logconfess("union(): failed to close tempfile $tmpfile: $!"); |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
##-- union stage2: sort & load tempfile |
|
576
|
0
|
|
|
|
|
|
env_push(LC_ALL=>'C'); |
|
577
|
0
|
|
|
|
|
|
$cof->vlog('trace', "union(): stage2: load pair frequencies (fmin=$cof->{fmin})"); |
|
578
|
0
|
0
|
|
|
|
|
my $sortfh = opencmd(sortCmd()." -n -k2 -k3 -k4 $tmpfile |") |
|
579
|
|
|
|
|
|
|
or $cof->logconfess("union(): open failed for pipe from sort: $!"); |
|
580
|
0
|
|
|
|
|
|
binmode($sortfh,':raw'); |
|
581
|
0
|
0
|
|
|
|
|
$cof->loadTextFh($sortfh) |
|
582
|
|
|
|
|
|
|
or $cof->logconfess("union(): failed to load pair frequencies from $tmpfile: $!"); |
|
583
|
0
|
0
|
|
|
|
|
$sortfh->close() |
|
584
|
|
|
|
|
|
|
or $cof->logconfess("union(): failed to close pipe from sort: $!"); |
|
585
|
0
|
|
|
|
|
|
env_pop(); |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
##-- stage3: header |
|
588
|
0
|
0
|
|
|
|
|
$cof->saveHeader() |
|
589
|
|
|
|
|
|
|
or $cof->logconfess("union(): failed to save header: $!"); |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
##-- cleanup: unlink temp file |
|
592
|
0
|
0
|
|
|
|
|
CORE::unlink($tmpfile) if (!$cof->{keeptmp}); |
|
593
|
|
|
|
|
|
|
|
|
594
|
0
|
|
|
|
|
|
return $cof; |
|
595
|
|
|
|
|
|
|
} |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
##============================================================================== |
|
598
|
|
|
|
|
|
|
## Relation API: dbinfo |
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
|
601
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
|
602
|
|
|
|
|
|
|
sub dbinfo { |
|
603
|
0
|
|
|
0
|
0
|
|
my $cof = shift; |
|
604
|
0
|
|
|
|
|
|
my $info = $cof->SUPER::dbinfo(); |
|
605
|
0
|
|
|
|
|
|
@$info{qw(fmin dmax size1 size2 size3 sizeN N)} = @$cof{qw(fmin dmax size1 size2 size3 sizeN N)}; |
|
606
|
0
|
|
|
|
|
|
return $info; |
|
607
|
|
|
|
|
|
|
} |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
##============================================================================== |
|
611
|
|
|
|
|
|
|
## Utilities: lookup |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
## $f = $cof->f1( @xids) |
|
614
|
|
|
|
|
|
|
## $f = $cof->f1(\@xids) |
|
615
|
|
|
|
|
|
|
## + get total marginal unigram frequency (db must be opened) |
|
616
|
|
|
|
|
|
|
## + no longer supported since v0.10.000 |
|
617
|
|
|
|
|
|
|
sub f1 { |
|
618
|
0
|
|
|
0
|
0
|
|
$_[0]->logconfess("f1(): method no longer supported"); |
|
619
|
|
|
|
|
|
|
} |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
## $f12 = $cof->f12($xid1,$xid2) |
|
622
|
|
|
|
|
|
|
## + return joint frequency for pair ($xid1,$xid2) |
|
623
|
|
|
|
|
|
|
## + no longer supported since v0.10.000 |
|
624
|
|
|
|
|
|
|
sub f12 { |
|
625
|
0
|
|
|
0
|
0
|
|
$_[0]->logconfess("f12(): method no longer supported"); |
|
626
|
|
|
|
|
|
|
} |
|
627
|
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
##============================================================================== |
|
629
|
|
|
|
|
|
|
## Relation API: default |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
632
|
|
|
|
|
|
|
## Relation API: default: sliceN |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
## $N = $rel->sliceN($sliceBy, $dateLo) |
|
635
|
|
|
|
|
|
|
## + get total slice-wise co-occurrence count for a slice of size $sliceBy starting at $dateLo |
|
636
|
|
|
|
|
|
|
## + INHERITED from DiaColloDB::Relation |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
639
|
|
|
|
|
|
|
## Relation API: default: profile |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile1(\@tids,\%opts) |
|
642
|
|
|
|
|
|
|
## + get slice-wise joint co-frequency profile(s) for @tids (db must be opened; f1 and f12 only) |
|
643
|
|
|
|
|
|
|
## + %opts: as for profile(), also: |
|
644
|
|
|
|
|
|
|
## coldb => $coldb, ##-- parent DiaColloDB object (for shared data, debugging) |
|
645
|
|
|
|
|
|
|
## dreq => \%dreq, ##-- parsed date request |
|
646
|
|
|
|
|
|
|
## extend => \%slice2keys_packed, ##-- packed extend |
|
647
|
|
|
|
|
|
|
sub subprofile1 { |
|
648
|
0
|
|
|
0
|
1
|
|
my ($cof,$tids,$opts) = @_; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
##-- common variables |
|
651
|
0
|
0
|
|
|
|
|
$tids = [$tids] if (!UNIVERSAL::isa($tids,'ARRAY')); |
|
652
|
0
|
|
|
|
|
|
my $coldb = $opts->{coldb}; |
|
653
|
0
|
|
|
|
|
|
my $slice = $opts->{slice}; |
|
654
|
0
|
|
|
|
|
|
my $dreq = $opts->{dreq}; |
|
655
|
0
|
|
|
|
|
|
my $dfilter = $dreq->{dfilter}; |
|
656
|
0
|
|
|
|
|
|
my $groupby = $opts->{groupby}{ti2g}; |
|
657
|
0
|
|
|
|
|
|
my $extend = $opts->{extend}; |
|
658
|
0
|
|
|
|
|
|
my $onepass = $opts->{onepass}; |
|
659
|
0
|
|
|
|
|
|
my $pack_id = $coldb->{pack_id}; |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
##-- vars: relation-wise |
|
662
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
|
663
|
|
|
|
|
|
|
## $r2 : [$end3,$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
|
664
|
|
|
|
|
|
|
## $r3 : [$i2,$f12]* @ end3($d1-1)..(end3($d1+1)-1) |
|
665
|
0
|
|
|
|
|
|
my ($r1,$r2,$r3) = @$cof{qw(r1 r2 r3)}; |
|
666
|
0
|
|
|
|
|
|
my ($pack1,$pack2,$pack3) = map {$_->{packas}} ($r1,$r2,$r3); |
|
|
0
|
|
|
|
|
|
|
|
667
|
0
|
|
|
|
|
|
my $pack2e = $cof->{pack_i}; |
|
668
|
0
|
|
|
|
|
|
my $pack2d = '@'.packsize("$cof->{pack_i}").$cof->{pack_d}; |
|
669
|
0
|
|
|
|
|
|
my $pack2f = '@'.packsize("$cof->{pack_i}$cof->{pack_d}").$cof->{pack_f}; |
|
670
|
0
|
|
0
|
|
|
|
my $size1 = $cof->{size1} // ($cof->{size1}=$r1->size); |
|
671
|
0
|
|
0
|
|
|
|
my $size2 = $cof->{size2} // ($cof->{size2}=$r2->size); |
|
672
|
0
|
|
0
|
|
|
|
my $size3 = $cof->{size3} // ($cof->{size3}=$r3->size); |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
##-- setup %slice2prf |
|
675
|
|
|
|
|
|
|
my %slice2prf = map { |
|
676
|
0
|
|
|
|
|
|
($_ => DiaColloDB::Profile->new(f1=>0, N=>$cof->sliceN($slice,$_))) |
|
677
|
0
|
0
|
|
|
|
|
} ($slice ? (map {$_*$slice} (($dreq->{slo}/$slice)..($dreq->{shi}/$slice))) : 0); |
|
|
0
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
##-- ye olde loope |
|
680
|
0
|
|
|
|
|
|
my ($i1,$beg2,$end2, $pos2,$beg3,$end3,$d1,$ds,$dprf,$f1, $pos3,$i2,$f12,$key2, $buf,%id2); |
|
681
|
0
|
|
|
|
|
|
my ($blo,$bhi,$bi); ##-- one-pass guts |
|
682
|
0
|
|
|
|
|
|
foreach $i1 (@$tids) { |
|
683
|
0
|
0
|
|
|
|
|
next if ($i1 >= $size1); |
|
684
|
0
|
0
|
|
|
|
|
$beg2 = ($i1==0 ? 0 : unpack($pack1,$r1->fetchraw($i1-1,\$buf))); |
|
685
|
0
|
|
|
|
|
|
$end2 = unpack($pack1, $r1->fetchraw($i1,\$buf)); |
|
686
|
|
|
|
|
|
|
|
|
687
|
0
|
0
|
|
|
|
|
next if ($beg2 >= $size2); |
|
688
|
0
|
|
|
|
|
|
for ($pos2=$beg2; $pos2 < $end2; ++$pos2) { |
|
689
|
0
|
0
|
|
|
|
|
$beg3 = ($pos2==0 ? 0 : unpack($pack2e, $r2->fetchraw($pos2-1,\$buf))); |
|
690
|
0
|
|
|
|
|
|
($end3,$d1,$f1) = unpack($pack2, $r2->fetchraw($pos2,\$buf)); |
|
691
|
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
##-- check date-filter & get slice-local profile $dprf |
|
693
|
0
|
0
|
0
|
|
|
|
next if ($dfilter && !$dfilter->($d1)); |
|
694
|
0
|
0
|
|
|
|
|
$ds = $slice ? int($d1/$slice)*$slice : 0; |
|
695
|
0
|
|
|
|
|
|
$dprf = $slice2prf{$ds}; |
|
696
|
0
|
|
|
|
|
|
$dprf->{f1} += $f1; |
|
697
|
|
|
|
|
|
|
|
|
698
|
0
|
0
|
|
|
|
|
next if ($beg3 >= $size3); |
|
699
|
0
|
|
|
|
|
|
for ($pos3=$beg3; $pos3 < $end3; ++$pos3) { |
|
700
|
0
|
|
|
|
|
|
($i2,$f12) = unpack($pack3, $r3->fetchraw($pos3,\$buf)); |
|
701
|
0
|
0
|
|
|
|
|
$key2 = $groupby ? $groupby->($i2) : pack($pack_id,$i2); |
|
702
|
|
|
|
|
|
|
next if (!defined($key2) ##-- item2 selection via groupby CODE-ref |
|
703
|
0
|
0
|
0
|
|
|
|
|| ($extend && !exists($extend->{$ds}{$key2}))); ##-- ... or via 'extend' parameter |
|
|
|
|
0
|
|
|
|
|
|
704
|
0
|
|
|
|
|
|
$dprf->{f12}{$key2} += $f12; |
|
705
|
|
|
|
|
|
|
|
|
706
|
0
|
0
|
0
|
|
|
|
if ($onepass && !exists($id2{"$i2 $d1"})) { |
|
707
|
|
|
|
|
|
|
##-- search for ($i2,$date) offset in r2 |
|
708
|
0
|
|
|
|
|
|
$id2{"$i2 $d1"} = undef; |
|
709
|
0
|
0
|
|
|
|
|
$blo = ($i2==0 ? 0 : unpack($pack1,$r1->fetchraw($i2-1,\$buf))); |
|
710
|
0
|
|
|
|
|
|
$bhi = unpack($pack1, $r1->fetchraw($i2,\$buf)); |
|
711
|
0
|
|
|
|
|
|
$bi = $r2->bsearch($d1,lo=>$blo,hi=>$bhi,packas=>$pack2d); |
|
712
|
0
|
|
|
|
|
|
$dprf->{f2}{$key2} += unpack($pack2f, $r2->fetchraw($bi,\$buf)); |
|
713
|
|
|
|
|
|
|
} |
|
714
|
|
|
|
|
|
|
} |
|
715
|
|
|
|
|
|
|
} |
|
716
|
|
|
|
|
|
|
} |
|
717
|
|
|
|
|
|
|
|
|
718
|
0
|
|
|
|
|
|
return \%slice2prf; |
|
719
|
|
|
|
|
|
|
} |
|
720
|
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
722
|
|
|
|
|
|
|
## Relation API: default: subprofile2 |
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
## \%slice2prf = $rel->subprofile2(\%slice2prf, \%opts) |
|
725
|
|
|
|
|
|
|
## + populate f2 frequencies for profiles in \%slice2prf |
|
726
|
|
|
|
|
|
|
## + %opts: as for subprofile1() |
|
727
|
|
|
|
|
|
|
sub subprofile2 { |
|
728
|
0
|
|
|
0
|
1
|
|
my ($cof,$slice2prf,$opts) = @_; |
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
##-- vars: common |
|
731
|
0
|
|
|
|
|
|
my $coldb = $opts->{coldb}; |
|
732
|
0
|
|
|
|
|
|
my $groupby = $opts->{groupby}; |
|
733
|
0
|
|
|
|
|
|
my $a2data = $opts->{a2data}; |
|
734
|
0
|
|
|
|
|
|
my $slice = $opts->{slice}; |
|
735
|
0
|
|
|
|
|
|
my $dfilter = $opts->{dreq}{dfilter}; |
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
##-- vars: relation-wise |
|
738
|
|
|
|
|
|
|
## $r1 : [$end2] @ $i1 |
|
739
|
|
|
|
|
|
|
## $r2 : [$end3,$d1,$f1]* @ end2($i1-1)..(end2($i1+1)-1) |
|
740
|
|
|
|
|
|
|
## #$r3 : [$i2,$f12]* @ end3($d1-1)..(end3($d1+1)-1) |
|
741
|
0
|
|
|
|
|
|
my ($r1,$r2) = @$cof{qw(r1 r2)}; |
|
742
|
0
|
|
|
|
|
|
my $pack1 = $r1->{packas}; |
|
743
|
0
|
|
|
|
|
|
my $pack2df = '@'.packsize("$cof->{pack_i}",0)."$cof->{pack_d}$cof->{pack_i}"; |
|
744
|
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
##-- optimize tightest loop for direct mmap buffer access if available |
|
746
|
0
|
0
|
|
|
|
|
my $bufr2 = UNIVERSAL::isa($r2,'DiaColloDB::PackedFile::MMap') ? $r2->{bufr} : undef; |
|
747
|
0
|
|
|
|
|
|
my $len2 = $r2->{reclen}; |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
##-- get "most specific projected attribute" ("MSPA"): that projected attribute with largest enum |
|
750
|
|
|
|
|
|
|
#my $gb1 = scalar(@{$groupby->{attrs}})==1; ##-- are we grouping by a single attribute? -->optimize! |
|
751
|
0
|
|
|
|
|
|
my $mspai = (sort {$b->[1]<=>$a->[1]} map {[$_,$a2data->{$groupby->{attrs}[$_]}{enum}->size]} (0..$#{$groupby->{attrs}}))[0][0]; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
752
|
0
|
|
|
|
|
|
my $mspa = $groupby->{attrs}[$mspai]; |
|
753
|
0
|
|
|
|
|
|
my $mspgpack = $groupby->{gpack}[$mspai]; |
|
754
|
0
|
|
|
|
|
|
my $msptpack = $groupby->{tpack}[$mspai]; |
|
755
|
0
|
|
|
|
|
|
my $msp2t = $a2data->{$mspa}{a2t}; |
|
756
|
0
|
|
|
|
|
|
my %mspv = qw(); ##-- checked MSPA-values ($mspvi) |
|
757
|
0
|
|
|
|
|
|
my $tenum = $coldb->{tenum}; |
|
758
|
0
|
|
|
|
|
|
my $ts2g = $groupby->{ts2g}; |
|
759
|
|
|
|
|
|
|
|
|
760
|
0
|
|
|
|
|
|
my ($prf1, $mspvi,$i2,$t2,$key2, $beg2,$end2,$pos2, $d2,$f2,$ds2,$prf2, $buf); |
|
761
|
0
|
|
|
|
|
|
foreach $prf1 (values %$slice2prf) { |
|
762
|
0
|
|
|
|
|
|
foreach (keys %{$prf1->{f12}}) { |
|
|
0
|
|
|
|
|
|
|
|
763
|
0
|
|
|
|
|
|
$mspvi = unpack($mspgpack,$_); |
|
764
|
0
|
0
|
|
|
|
|
next if (exists $mspv{$mspvi}); |
|
765
|
0
|
|
|
|
|
|
$mspv{$mspvi} = undef; |
|
766
|
|
|
|
|
|
|
|
|
767
|
0
|
|
|
|
|
|
foreach $i2 (@{$msp2t->fetch($mspvi)}) { |
|
|
0
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
##-- get item2 t-tuple |
|
769
|
0
|
|
|
|
|
|
$t2 = $tenum->i2s($i2); |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
##-- get groupby-key from tuple-string |
|
772
|
0
|
0
|
|
|
|
|
next if (!defined($key2 = $ts2g ? $ts2g->($t2) : pack($mspgpack, $i2))); ##-- having() failure |
|
|
|
0
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
##-- scan all dates for $i2 |
|
775
|
0
|
0
|
|
|
|
|
$beg2 = ($i2==0 ? 0 : unpack($pack1,$r1->fetchraw($i2-1,\$buf))); |
|
776
|
0
|
|
|
|
|
|
$end2 = unpack($pack1, $r1->fetchraw($i2,\$buf)); |
|
777
|
0
|
|
|
|
|
|
for ($pos2=$beg2; $pos2 < $end2; ++$pos2) { |
|
778
|
0
|
0
|
|
|
|
|
($d2,$f2) = unpack($pack2df, $bufr2 ? substr($$bufr2, $pos2*$len2, $len2) : $r2->fetchraw($pos2,\$buf)); |
|
779
|
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
##-- check date-filter & get slice |
|
781
|
0
|
0
|
0
|
|
|
|
next if ($dfilter && !$dfilter->($d2)); |
|
782
|
0
|
0
|
|
|
|
|
$ds2 = $slice ? int($d2/$slice)*$slice : 0; |
|
783
|
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
##-- ignore if item2 isn't in target slice |
|
785
|
0
|
|
|
|
|
|
$prf2 = $slice2prf->{$ds2}; |
|
786
|
0
|
0
|
|
|
|
|
next if (!exists($prf2->{f12}{$key2})); |
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
##-- add independent f2 |
|
789
|
0
|
|
|
|
|
|
$prf2->{f2}{$key2} += $f2; |
|
790
|
|
|
|
|
|
|
} |
|
791
|
|
|
|
|
|
|
} |
|
792
|
|
|
|
|
|
|
} |
|
793
|
|
|
|
|
|
|
} |
|
794
|
|
|
|
|
|
|
|
|
795
|
0
|
|
|
|
|
|
return $slice2prf; |
|
796
|
|
|
|
|
|
|
} |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
799
|
|
|
|
|
|
|
## Relation API: default: qinfo |
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
|
802
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
|
803
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
|
804
|
|
|
|
|
|
|
## ( |
|
805
|
|
|
|
|
|
|
## qreqs => \@qreqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
|
806
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
|
807
|
|
|
|
|
|
|
## ) |
|
808
|
|
|
|
|
|
|
sub qinfo { |
|
809
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
|
810
|
0
|
|
|
|
|
|
my ($q1strs,$q2strs,$qxstrs,$fstrs) = $rel->qinfoData($coldb,%opts); |
|
811
|
|
|
|
|
|
|
|
|
812
|
0
|
0
|
|
|
|
|
my $q1str = '('.(@$q1strs ? join(' WITH ', @$q1strs,@$qxstrs) : '*').') =1'; |
|
813
|
0
|
0
|
|
|
|
|
my $q2str = '('.(@$q2strs ? join(' WITH ', @$q2strs,@$qxstrs) : '*').') =2'; |
|
814
|
|
|
|
|
|
|
my $qstr = ( |
|
815
|
|
|
|
|
|
|
#"$q1str && $q2str" ##-- approximate with &&-query (especially buggy since #sep doesn't work right here; see mantis bug #654) |
|
816
|
0
|
0
|
|
|
|
|
"NEAR( $q1str, $q2str, ".(2*($rel->{dmax}-1)).")" |
|
817
|
|
|
|
|
|
|
.' #SEPARATE' |
|
818
|
|
|
|
|
|
|
.(@$fstrs ? (' '.join(' ',@$fstrs)) : ''), |
|
819
|
|
|
|
|
|
|
); |
|
820
|
|
|
|
|
|
|
return { |
|
821
|
|
|
|
|
|
|
fcoef => 2*$rel->{dmax}, |
|
822
|
0
|
|
|
|
|
|
qtemplate => $qstr, |
|
823
|
|
|
|
|
|
|
qcanon => $rel->qcanon($coldb,%opts), |
|
824
|
|
|
|
|
|
|
}; |
|
825
|
|
|
|
|
|
|
} |
|
826
|
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
##============================================================================== |
|
829
|
|
|
|
|
|
|
## Pacakge Alias(es) |
|
830
|
|
|
|
|
|
|
package DiaColloDB::Cofreqs; |
|
831
|
1
|
|
|
1
|
|
23
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
83
|
|
|
832
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Relation::Cofreqs); |
|
833
|
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
##============================================================================== |
|
836
|
|
|
|
|
|
|
## Footer |
|
837
|
|
|
|
|
|
|
1; |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
__END__ |