| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Compat::v0_09::Relation::Unigrams.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, profiling relation: unigram database (using DiaColloDB::PackedFile) |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Relation::Unigrams; |
|
7
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Compat::v0_09::Relation; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
26
|
|
|
8
|
1
|
|
|
1
|
|
4
|
use DiaColloDB::PackedFile; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
20
|
|
|
9
|
1
|
|
|
1
|
|
5
|
use DiaColloDB::Utils qw(:sort :env :run :pack :file); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
10
|
1
|
|
|
1
|
|
309
|
use Fcntl qw(:seek); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
25
|
|
|
11
|
1
|
|
|
1
|
|
177
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
573
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
##============================================================================== |
|
14
|
|
|
|
|
|
|
## Globals & Constants |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::PackedFile DiaColloDB::Compat::v0_09::Relation); |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##============================================================================== |
|
19
|
|
|
|
|
|
|
## Constructors etc. |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
## $ug = CLASS_OR_OBJECT->new(%args) |
|
22
|
|
|
|
|
|
|
## + %args, object structure: |
|
23
|
|
|
|
|
|
|
## ( |
|
24
|
|
|
|
|
|
|
## ##-- PackedFile: user options |
|
25
|
|
|
|
|
|
|
## file => $filename, ##-- default: undef (none) |
|
26
|
|
|
|
|
|
|
## flags => $flags, ##-- fcntl flags or open-mode (default='r') |
|
27
|
|
|
|
|
|
|
## perms => $perms, ##-- creation permissions (default=(0666 &~umask)) |
|
28
|
|
|
|
|
|
|
## reclen => $reclen, ##-- record-length in bytes: (default: guess from pack format if available) |
|
29
|
|
|
|
|
|
|
## packas => $packas, ##-- pack-format or array; see DiaColloDB::Utils::packFilterStore(); ##-- OVERRIDE default='N' |
|
30
|
|
|
|
|
|
|
## ## |
|
31
|
|
|
|
|
|
|
## ##-- PackedFile: filters |
|
32
|
|
|
|
|
|
|
## filter_fetch => $filter, ##-- DB_File-style filter for fetch |
|
33
|
|
|
|
|
|
|
## filter_store => $filter, ##-- DB_File-style filter for store |
|
34
|
|
|
|
|
|
|
## ## |
|
35
|
|
|
|
|
|
|
## ##-- PackedFile: low-level data |
|
36
|
|
|
|
|
|
|
## fh => $fh, ##-- underlying filehandle |
|
37
|
|
|
|
|
|
|
## ## |
|
38
|
|
|
|
|
|
|
## ##-- Unigrams: high-level data |
|
39
|
|
|
|
|
|
|
## N => $N, ##-- total frequency |
|
40
|
|
|
|
|
|
|
## ) |
|
41
|
|
|
|
|
|
|
sub new { |
|
42
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
43
|
0
|
|
|
|
|
|
my $ug = $that->DiaColloDB::PackedFile::new( |
|
44
|
|
|
|
|
|
|
N=>0, |
|
45
|
|
|
|
|
|
|
packas=>'N', |
|
46
|
|
|
|
|
|
|
@_ |
|
47
|
|
|
|
|
|
|
); |
|
48
|
0
|
|
|
|
|
|
return $ug; |
|
49
|
|
|
|
|
|
|
} |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
##============================================================================== |
|
52
|
|
|
|
|
|
|
## Persistent API: disk usage: INHERITED |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
##============================================================================== |
|
55
|
|
|
|
|
|
|
## API: open/close: mostly INHERITED |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
## $filename = $obj->headerFile() |
|
58
|
|
|
|
|
|
|
## + returns header filename; default returns "$obj->{base}.hdr" or "$obj->{dbdir}/header.json" |
|
59
|
|
|
|
|
|
|
sub headerFile { |
|
60
|
0
|
0
|
|
0
|
1
|
|
return undef if (!ref($_[0])); |
|
61
|
0
|
0
|
|
|
|
|
return "$_[0]{file}.hdr" if (defined($_[0]{file})); |
|
62
|
0
|
|
|
|
|
|
return undef; |
|
63
|
|
|
|
|
|
|
} |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
##============================================================================== |
|
66
|
|
|
|
|
|
|
## API: filters: INHERITED |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
##============================================================================== |
|
69
|
|
|
|
|
|
|
## PackedFile API: positioning: INHERITED |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
##============================================================================== |
|
72
|
|
|
|
|
|
|
## PackedFile API: record access: INHERITED |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
##============================================================================== |
|
75
|
|
|
|
|
|
|
## I/O: text |
|
76
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::PackedFile |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
## $bool = $ug->saveTextFh_v0_10($fh,%opts) |
|
79
|
|
|
|
|
|
|
## + save from text file in v0.10.x format: lines of the form: |
|
80
|
|
|
|
|
|
|
## N ##-- 1 field : N |
|
81
|
|
|
|
|
|
|
## FREQ ID1 DATE ##-- 3 fields: unigram frequency for (ID1,DATE) |
|
82
|
|
|
|
|
|
|
## + %opts: |
|
83
|
|
|
|
|
|
|
## i2s => \&CODE, ##-- code-ref for formatting indices; called as $s=CODE($i) |
|
84
|
|
|
|
|
|
|
sub saveTextFh_v0_10 { |
|
85
|
0
|
|
|
0
|
0
|
|
my ($ug,$outfh,%opts) = @_; |
|
86
|
0
|
0
|
|
|
|
|
$ug->logconfess("saveTextFile(): cannot save unopened DB") if (!$ug->opened); |
|
87
|
|
|
|
|
|
|
|
|
88
|
0
|
0
|
|
|
|
|
$outfh->print($ug->{N}, "\n") if (defined($ug->{N})); ##-- save N line |
|
89
|
0
|
|
|
|
|
|
my $size = $ug->size(); |
|
90
|
0
|
|
|
|
|
|
my $i2s = $opts{i2s}; |
|
91
|
0
|
|
|
|
|
|
my ($i,$val); |
|
92
|
0
|
|
|
|
|
|
for ($i=0, $ug->reset(); $i < $size; ++$i) { |
|
93
|
0
|
|
|
|
|
|
$val = $ug->get(); |
|
94
|
0
|
0
|
|
|
|
|
$outfh->print($val, "\t", ($i2s ? $i2s->($i) : $i), "\n"); |
|
95
|
|
|
|
|
|
|
} |
|
96
|
|
|
|
|
|
|
|
|
97
|
0
|
|
|
|
|
|
return $ug; |
|
98
|
|
|
|
|
|
|
} |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
##============================================================================== |
|
103
|
|
|
|
|
|
|
## PackedFile API: tie interface: INHERITED |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
##============================================================================== |
|
106
|
|
|
|
|
|
|
## Relation API: create |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
## $ug = $CLASS_OR_OBJECT->create($coldb,$tokdat_file,%opts) |
|
109
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
|
110
|
|
|
|
|
|
|
## a tt-style text file containing 1 token-id perl line with optional blank lines |
|
111
|
|
|
|
|
|
|
## + %opts: clobber %$ug, also: |
|
112
|
|
|
|
|
|
|
## ( |
|
113
|
|
|
|
|
|
|
## size=>$size, ##-- set initial size |
|
114
|
|
|
|
|
|
|
## ) |
|
115
|
|
|
|
|
|
|
## + DISABLED |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
##============================================================================== |
|
118
|
|
|
|
|
|
|
## Relation API: union |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $ug = CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
|
121
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
|
122
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$ug,\@xi2u],...) |
|
123
|
|
|
|
|
|
|
## of unigram-objects $ug and tuple-id maps \@xi2u for $ug |
|
124
|
|
|
|
|
|
|
## + %opts: clobber %$ug |
|
125
|
|
|
|
|
|
|
## + implicitly flushes the new index |
|
126
|
|
|
|
|
|
|
## + DISABLED |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
##============================================================================== |
|
129
|
|
|
|
|
|
|
## Relation API: dbinfo |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
|
132
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
|
133
|
|
|
|
|
|
|
sub dbinfo { |
|
134
|
0
|
|
|
0
|
0
|
|
my $ug = shift; |
|
135
|
0
|
|
|
|
|
|
my $info = $ug->SUPER::dbinfo(); |
|
136
|
0
|
|
|
|
|
|
$info->{N} = $ug->{N}; |
|
137
|
0
|
|
|
|
|
|
$info->{size} = $ug->size(); |
|
138
|
0
|
|
|
|
|
|
return $info; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
##============================================================================== |
|
143
|
|
|
|
|
|
|
## Relation API: default: profiling |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
## $prf = $ug->subprofile1(\@xids, %opts) |
|
146
|
|
|
|
|
|
|
## + get frequency profile for @xids (db must be opened) |
|
147
|
|
|
|
|
|
|
## + %opts: |
|
148
|
|
|
|
|
|
|
## groupby => \&gbsub, ##-- key-extractor $key2_or_undef = $gbsub->($i2) |
|
149
|
|
|
|
|
|
|
## coldb => $coldb, ##-- for debugging |
|
150
|
|
|
|
|
|
|
sub subprofile1 { |
|
151
|
0
|
|
|
0
|
1
|
|
my ($ug,$ids,%opts) = @_; |
|
152
|
0
|
0
|
|
|
|
|
$ids = [$ids] if (!UNIVERSAL::isa($ids,'ARRAY')); |
|
153
|
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
|
my $fh = $ug->{fh}; |
|
155
|
0
|
|
|
|
|
|
my $packf = $ug->{packas}; |
|
156
|
0
|
|
|
|
|
|
my $reclen = $ug->{reclen}; |
|
157
|
0
|
|
|
|
|
|
my $groupby = $opts{groupby}; |
|
158
|
0
|
|
|
|
|
|
my $pf1 = 0; |
|
159
|
0
|
|
|
|
|
|
my $pf2 = {}; |
|
160
|
0
|
|
|
|
|
|
my ($i,$f,$key2, $buf); |
|
161
|
|
|
|
|
|
|
|
|
162
|
0
|
|
|
|
|
|
foreach $i (@$ids) { |
|
163
|
0
|
0
|
|
|
|
|
CORE::seek($fh, $i*$reclen, SEEK_SET) or return undef; |
|
164
|
0
|
0
|
|
|
|
|
CORE::read($fh, $buf, $reclen)==$reclen or return undef; |
|
165
|
0
|
|
|
|
|
|
$f = unpack($packf,$buf); |
|
166
|
0
|
|
|
|
|
|
$pf1 += $f; |
|
167
|
0
|
0
|
|
|
|
|
$key2 = $groupby ? $groupby->($i) : $i; |
|
168
|
0
|
0
|
|
|
|
|
next if (!defined($key2)); |
|
169
|
0
|
|
|
|
|
|
$pf2->{$key2} += $f |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
return DiaColloDB::Profile->new( |
|
173
|
|
|
|
|
|
|
N=>$ug->{N}, |
|
174
|
0
|
|
|
|
|
|
f1=>$pf1, |
|
175
|
|
|
|
|
|
|
f2=>$pf2, |
|
176
|
|
|
|
|
|
|
f12=>{ %$pf2 }, |
|
177
|
|
|
|
|
|
|
); |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
##============================================================================== |
|
181
|
|
|
|
|
|
|
## Relation API: default: query info |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
|
184
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
|
185
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
|
186
|
|
|
|
|
|
|
## ( |
|
187
|
|
|
|
|
|
|
## qreqs => \@qreqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
|
188
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
|
189
|
|
|
|
|
|
|
## ) |
|
190
|
|
|
|
|
|
|
sub qinfo { |
|
191
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
|
192
|
0
|
|
|
|
|
|
my ($q1strs,$q2strs,$qxstrs,$fstrs) = $rel->qinfoData($coldb,%opts); |
|
193
|
|
|
|
|
|
|
|
|
194
|
0
|
|
|
|
|
|
my @qstrs = (@$q1strs, @$q2strs, @$qxstrs); |
|
195
|
0
|
0
|
|
|
|
|
@qstrs = ('*') if (!@qstrs); |
|
196
|
0
|
0
|
|
|
|
|
my $qstr = ('('.join(' WITH ', @qstrs).') =1' |
|
197
|
|
|
|
|
|
|
.' #SEPARATE' |
|
198
|
|
|
|
|
|
|
.(@$fstrs ? (' '.join(' ',@$fstrs)) : ''), |
|
199
|
|
|
|
|
|
|
); |
|
200
|
|
|
|
|
|
|
return { |
|
201
|
0
|
|
|
|
|
|
fcoef => 1, |
|
202
|
|
|
|
|
|
|
qtemplate => $qstr, |
|
203
|
|
|
|
|
|
|
}; |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
##============================================================================== |
|
207
|
|
|
|
|
|
|
## Pacakge Alias(es) |
|
208
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Unigrams; |
|
209
|
1
|
|
|
1
|
|
7
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
50
|
|
|
210
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Compat::v0_09::Relation::Unigrams); |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
##============================================================================== |
|
213
|
|
|
|
|
|
|
## Footer |
|
214
|
|
|
|
|
|
|
1; |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
__END__ |