line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Compat::v0_09::Relation::Unigrams.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, profiling relation: unigram database (using DiaColloDB::PackedFile) |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Relation::Unigrams; |
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Compat::v0_09::Relation; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
8
|
1
|
|
|
1
|
|
6
|
use DiaColloDB::PackedFile; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
9
|
1
|
|
|
1
|
|
5
|
use DiaColloDB::Utils qw(:sort :env :run :pack :file); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
73
|
|
10
|
1
|
|
|
1
|
|
402
|
use Fcntl qw(:seek); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
32
|
|
11
|
1
|
|
|
1
|
|
256
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
719
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
##============================================================================== |
14
|
|
|
|
|
|
|
## Globals & Constants |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::PackedFile DiaColloDB::Compat::v0_09::Relation); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##============================================================================== |
19
|
|
|
|
|
|
|
## Constructors etc. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
## $ug = CLASS_OR_OBJECT->new(%args) |
22
|
|
|
|
|
|
|
## + %args, object structure: |
23
|
|
|
|
|
|
|
## ( |
24
|
|
|
|
|
|
|
## ##-- PackedFile: user options |
25
|
|
|
|
|
|
|
## file => $filename, ##-- default: undef (none) |
26
|
|
|
|
|
|
|
## flags => $flags, ##-- fcntl flags or open-mode (default='r') |
27
|
|
|
|
|
|
|
## perms => $perms, ##-- creation permissions (default=(0666 &~umask)) |
28
|
|
|
|
|
|
|
## reclen => $reclen, ##-- record-length in bytes: (default: guess from pack format if available) |
29
|
|
|
|
|
|
|
## packas => $packas, ##-- pack-format or array; see DiaColloDB::Utils::packFilterStore(); ##-- OVERRIDE default='N' |
30
|
|
|
|
|
|
|
## ## |
31
|
|
|
|
|
|
|
## ##-- PackedFile: filters |
32
|
|
|
|
|
|
|
## filter_fetch => $filter, ##-- DB_File-style filter for fetch |
33
|
|
|
|
|
|
|
## filter_store => $filter, ##-- DB_File-style filter for store |
34
|
|
|
|
|
|
|
## ## |
35
|
|
|
|
|
|
|
## ##-- PackedFile: low-level data |
36
|
|
|
|
|
|
|
## fh => $fh, ##-- underlying filehandle |
37
|
|
|
|
|
|
|
## ## |
38
|
|
|
|
|
|
|
## ##-- Unigrams: high-level data |
39
|
|
|
|
|
|
|
## N => $N, ##-- total frequency |
40
|
|
|
|
|
|
|
## ) |
41
|
|
|
|
|
|
|
sub new { |
42
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
43
|
0
|
|
|
|
|
|
my $ug = $that->DiaColloDB::PackedFile::new( |
44
|
|
|
|
|
|
|
N=>0, |
45
|
|
|
|
|
|
|
packas=>'N', |
46
|
|
|
|
|
|
|
@_ |
47
|
|
|
|
|
|
|
); |
48
|
0
|
|
|
|
|
|
return $ug; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
##============================================================================== |
52
|
|
|
|
|
|
|
## Persistent API: disk usage: INHERITED |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
##============================================================================== |
55
|
|
|
|
|
|
|
## API: open/close: mostly INHERITED |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
## $filename = $obj->headerFile() |
58
|
|
|
|
|
|
|
## + returns header filename; default returns "$obj->{base}.hdr" or "$obj->{dbdir}/header.json" |
59
|
|
|
|
|
|
|
sub headerFile { |
60
|
0
|
0
|
|
0
|
1
|
|
return undef if (!ref($_[0])); |
61
|
0
|
0
|
|
|
|
|
return "$_[0]{file}.hdr" if (defined($_[0]{file})); |
62
|
0
|
|
|
|
|
|
return undef; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
##============================================================================== |
66
|
|
|
|
|
|
|
## API: filters: INHERITED |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
##============================================================================== |
69
|
|
|
|
|
|
|
## PackedFile API: positioning: INHERITED |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
##============================================================================== |
72
|
|
|
|
|
|
|
## PackedFile API: record access: INHERITED |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
##============================================================================== |
75
|
|
|
|
|
|
|
## I/O: text |
76
|
|
|
|
|
|
|
## + largely INHERITED from DiaColloDB::PackedFile |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
## $bool = $ug->saveTextFh_v0_10($fh,%opts) |
79
|
|
|
|
|
|
|
## + save from text file in v0.10.x format: lines of the form: |
80
|
|
|
|
|
|
|
## N ##-- 1 field : N |
81
|
|
|
|
|
|
|
## FREQ ID1 DATE ##-- 3 fields: unigram frequency for (ID1,DATE) |
82
|
|
|
|
|
|
|
## + %opts: |
83
|
|
|
|
|
|
|
## i2s => \&CODE, ##-- code-ref for formatting indices; called as $s=CODE($i) |
84
|
|
|
|
|
|
|
sub saveTextFh_v0_10 { |
85
|
0
|
|
|
0
|
0
|
|
my ($ug,$outfh,%opts) = @_; |
86
|
0
|
0
|
|
|
|
|
$ug->logconfess("saveTextFile(): cannot save unopened DB") if (!$ug->opened); |
87
|
|
|
|
|
|
|
|
88
|
0
|
0
|
|
|
|
|
$outfh->print($ug->{N}, "\n") if (defined($ug->{N})); ##-- save N line |
89
|
0
|
|
|
|
|
|
my $size = $ug->size(); |
90
|
0
|
|
|
|
|
|
my $i2s = $opts{i2s}; |
91
|
0
|
|
|
|
|
|
my ($i,$val); |
92
|
0
|
|
|
|
|
|
for ($i=0, $ug->reset(); $i < $size; ++$i) { |
93
|
0
|
|
|
|
|
|
$val = $ug->get(); |
94
|
0
|
0
|
|
|
|
|
$outfh->print($val, "\t", ($i2s ? $i2s->($i) : $i), "\n"); |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
0
|
|
|
|
|
|
return $ug; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
##============================================================================== |
103
|
|
|
|
|
|
|
## PackedFile API: tie interface: INHERITED |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
##============================================================================== |
106
|
|
|
|
|
|
|
## Relation API: create |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
## $ug = $CLASS_OR_OBJECT->create($coldb,$tokdat_file,%opts) |
109
|
|
|
|
|
|
|
## + populates current database from $tokdat_file, |
110
|
|
|
|
|
|
|
## a tt-style text file containing 1 token-id perl line with optional blank lines |
111
|
|
|
|
|
|
|
## + %opts: clobber %$ug, also: |
112
|
|
|
|
|
|
|
## ( |
113
|
|
|
|
|
|
|
## size=>$size, ##-- set initial size |
114
|
|
|
|
|
|
|
## ) |
115
|
|
|
|
|
|
|
## + DISABLED |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
##============================================================================== |
118
|
|
|
|
|
|
|
## Relation API: union |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $ug = CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) |
121
|
|
|
|
|
|
|
## + merge multiple co-frequency indices into new object |
122
|
|
|
|
|
|
|
## + @pairs : array of pairs ([$ug,\@xi2u],...) |
123
|
|
|
|
|
|
|
## of unigram-objects $ug and tuple-id maps \@xi2u for $ug |
124
|
|
|
|
|
|
|
## + %opts: clobber %$ug |
125
|
|
|
|
|
|
|
## + implicitly flushes the new index |
126
|
|
|
|
|
|
|
## + DISABLED |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
##============================================================================== |
129
|
|
|
|
|
|
|
## Relation API: dbinfo |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
## \%info = $rel->dbinfo($coldb) |
132
|
|
|
|
|
|
|
## + embedded info-hash for $coldb->dbinfo() |
133
|
|
|
|
|
|
|
sub dbinfo { |
134
|
0
|
|
|
0
|
0
|
|
my $ug = shift; |
135
|
0
|
|
|
|
|
|
my $info = $ug->SUPER::dbinfo(); |
136
|
0
|
|
|
|
|
|
$info->{N} = $ug->{N}; |
137
|
0
|
|
|
|
|
|
$info->{size} = $ug->size(); |
138
|
0
|
|
|
|
|
|
return $info; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
##============================================================================== |
143
|
|
|
|
|
|
|
## Relation API: default: profiling |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
## $prf = $ug->subprofile1(\@xids, %opts) |
146
|
|
|
|
|
|
|
## + get frequency profile for @xids (db must be opened) |
147
|
|
|
|
|
|
|
## + %opts: |
148
|
|
|
|
|
|
|
## groupby => \&gbsub, ##-- key-extractor $key2_or_undef = $gbsub->($i2) |
149
|
|
|
|
|
|
|
## coldb => $coldb, ##-- for debugging |
150
|
|
|
|
|
|
|
sub subprofile1 { |
151
|
0
|
|
|
0
|
1
|
|
my ($ug,$ids,%opts) = @_; |
152
|
0
|
0
|
|
|
|
|
$ids = [$ids] if (!UNIVERSAL::isa($ids,'ARRAY')); |
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
|
my $fh = $ug->{fh}; |
155
|
0
|
|
|
|
|
|
my $packf = $ug->{packas}; |
156
|
0
|
|
|
|
|
|
my $reclen = $ug->{reclen}; |
157
|
0
|
|
|
|
|
|
my $groupby = $opts{groupby}; |
158
|
0
|
|
|
|
|
|
my $pf1 = 0; |
159
|
0
|
|
|
|
|
|
my $pf2 = {}; |
160
|
0
|
|
|
|
|
|
my ($i,$f,$key2, $buf); |
161
|
|
|
|
|
|
|
|
162
|
0
|
|
|
|
|
|
foreach $i (@$ids) { |
163
|
0
|
0
|
|
|
|
|
CORE::seek($fh, $i*$reclen, SEEK_SET) or return undef; |
164
|
0
|
0
|
|
|
|
|
CORE::read($fh, $buf, $reclen)==$reclen or return undef; |
165
|
0
|
|
|
|
|
|
$f = unpack($packf,$buf); |
166
|
0
|
|
|
|
|
|
$pf1 += $f; |
167
|
0
|
0
|
|
|
|
|
$key2 = $groupby ? $groupby->($i) : $i; |
168
|
0
|
0
|
|
|
|
|
next if (!defined($key2)); |
169
|
0
|
|
|
|
|
|
$pf2->{$key2} += $f |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
return DiaColloDB::Profile->new( |
173
|
|
|
|
|
|
|
N=>$ug->{N}, |
174
|
0
|
|
|
|
|
|
f1=>$pf1, |
175
|
|
|
|
|
|
|
f2=>$pf2, |
176
|
|
|
|
|
|
|
f12=>{ %$pf2 }, |
177
|
|
|
|
|
|
|
); |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
##============================================================================== |
181
|
|
|
|
|
|
|
## Relation API: default: query info |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
## \%qinfo = $rel->qinfo($coldb, %opts) |
184
|
|
|
|
|
|
|
## + get query-info hash for profile administrivia (ddc hit links) |
185
|
|
|
|
|
|
|
## + %opts: as for profile(), additionally: |
186
|
|
|
|
|
|
|
## ( |
187
|
|
|
|
|
|
|
## qreqs => \@qreqs, ##-- as returned by $coldb->parseRequest($opts{query}) |
188
|
|
|
|
|
|
|
## gbreq => \%groupby, ##-- as returned by $coldb->groupby($opts{groupby}) |
189
|
|
|
|
|
|
|
## ) |
190
|
|
|
|
|
|
|
sub qinfo { |
191
|
0
|
|
|
0
|
1
|
|
my ($rel,$coldb,%opts) = @_; |
192
|
0
|
|
|
|
|
|
my ($q1strs,$q2strs,$qxstrs,$fstrs) = $rel->qinfoData($coldb,%opts); |
193
|
|
|
|
|
|
|
|
194
|
0
|
|
|
|
|
|
my @qstrs = (@$q1strs, @$q2strs, @$qxstrs); |
195
|
0
|
0
|
|
|
|
|
@qstrs = ('*') if (!@qstrs); |
196
|
0
|
0
|
|
|
|
|
my $qstr = ('('.join(' WITH ', @qstrs).') =1' |
197
|
|
|
|
|
|
|
.' #SEPARATE' |
198
|
|
|
|
|
|
|
.(@$fstrs ? (' '.join(' ',@$fstrs)) : ''), |
199
|
|
|
|
|
|
|
); |
200
|
|
|
|
|
|
|
return { |
201
|
0
|
|
|
|
|
|
fcoef => 1, |
202
|
|
|
|
|
|
|
qtemplate => $qstr, |
203
|
|
|
|
|
|
|
}; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
##============================================================================== |
207
|
|
|
|
|
|
|
## Pacakge Alias(es) |
208
|
|
|
|
|
|
|
package DiaColloDB::Compat::v0_09::Unigrams; |
209
|
1
|
|
|
1
|
|
37
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
59
|
|
210
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Compat::v0_09::Relation::Unigrams); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
##============================================================================== |
213
|
|
|
|
|
|
|
## Footer |
214
|
|
|
|
|
|
|
1; |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
__END__ |