line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
##-*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## |
3
|
|
|
|
|
|
|
## File: PDL::Ngrams.pm |
4
|
|
|
|
|
|
|
## Author: Bryan Jurish |
5
|
|
|
|
|
|
|
## Description: N-Gram utilities for PDL |
6
|
|
|
|
|
|
|
##====================================================================== |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package PDL::Ngrams; |
9
|
4
|
|
|
4
|
|
1964381
|
use strict; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
115
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
##====================================================================== |
12
|
|
|
|
|
|
|
## Export hacks |
13
|
4
|
|
|
4
|
|
22
|
use PDL; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
29
|
|
14
|
4
|
|
|
4
|
|
12323
|
use PDL::Exporter; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
21
|
|
15
|
4
|
|
|
4
|
|
3174
|
use PDL::VectorValued; |
|
4
|
|
|
|
|
30582
|
|
|
4
|
|
|
|
|
33
|
|
16
|
4
|
|
|
4
|
|
3252
|
use PDL::Ngrams::Utils; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
60
|
|
17
|
|
|
|
|
|
|
##-- inherit the import() machinery from PDL::Exporter
our @ISA = ('PDL::Exporter');

##-- symbols available on request:
##   everything PDL::Ngrams::Utils offers, the functions defined in this file,
##   and the backwards-compatibility alias _ng_qsortvec
our @EXPORT_OK = (
  @PDL::Ngrams::Utils::EXPORT_OK,    ##-- inherited
  'ng_cofreq', 'ng_rotate',
  '_ng_qsortvec',                    ##-- compat
);

##-- the ':Func' tag covers a snapshot of everything in @EXPORT_OK
our %EXPORT_TAGS = (
  Func => [@EXPORT_OK],              ##-- respect PDL conventions (hopefully)
);

our $VERSION = '0.09'; ##-- use perl-reversion to update
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
##====================================================================== |
32
|
|
|
|
|
|
|
## pod: header |
33
|
|
|
|
|
|
|
=pod |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 NAME |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
PDL::Ngrams - N-Gram utilities for PDL |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 SYNOPSIS |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
use PDL; |
42
|
|
|
|
|
|
|
use PDL::Ngrams; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
45
|
|
|
|
|
|
|
## Basic Data |
46
|
|
|
|
|
|
|
$toks = rint(10*random(10)); |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
49
|
|
|
|
|
|
|
## ... stuff happens |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
##====================================================================== |
55
|
|
|
|
|
|
|
## Description |
56
|
|
|
|
|
|
|
=pod |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head1 DESCRIPTION |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
PDL::Ngrams provides basic utilities for tracking N-grams over PDL vectors. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=cut |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
##====================================================================== |
65
|
|
|
|
|
|
|
## pod: Functions |
66
|
|
|
|
|
|
|
=pod |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head1 FUNCTIONS |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=cut |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##====================================================================== |
73
|
|
|
|
|
|
|
## backwards-compatibility aliases |
74
|
|
|
|
|
|
|
*PDL::_ng_qsortvec = *_ng_qsortvec = \&PDL::vv_qsortvec; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
##====================================================================== |
77
|
|
|
|
|
|
|
## Run-Length Encoding/Decoding: n-dimensional |
78
|
|
|
|
|
|
|
=pod |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 Counting N-Grams over PDLs |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=cut |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
85
|
|
|
|
|
|
|
## ng_cofreq() |
86
|
|
|
|
|
|
|
=pod |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 ng_cofreq |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=for sig |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Signature: (toks(@adims,N,NToks); %args) |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Returns: (int [o]ngramfreqs(NNgrams); [o]ngramids(@adims,N,NNgrams)) |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Keyword arguments (optional): |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
norotate => $bool, ##-- if true, $toks() will NOT be rotated along $N |
99
|
|
|
|
|
|
|
boffsets => $boffsets(NBlocks) ##-- block-offsets in $toks() along $NToks |
100
|
|
|
|
|
|
|
delims => $delims(@adims,N,NDelims) ##-- delimiters to splice in at block boundaries |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Count co-occurrences (esp. N-Grams) over a token vector $toks. |
103
|
|
|
|
|
|
|
This function really just wraps ng_delimit(), ng_rotate(), vv_qsortvec(), and rlevec(). |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
*PDL::ng_cofreq = \&ng_cofreq;

## ($ngramfreqs,$ngramids) = ng_cofreq($toks, %args)
##  + Purpose:
##     Count co-occurrences (esp. N-grams) over the token pdl $toks(@adims,N,NToks).
##     Wraps ng_delimit(), ng_rotate(), vv_qsortvec() and rlevec().
##  + Arguments:
##     $toks : token pdl; its last two dimensions are taken as (N,NToks),
##             all leading dimensions are kept as "attribute" dimensions @adims
##     %args : optional keywords
##       norotate => $bool      ##-- if true, $toks is NOT passed through ng_rotate()
##       boffsets => $boffsets  ##-- block offsets, handed to ng_delimit()
##       delims   => $delims    ##-- delimiters, handed to ng_delimit()
##     Delimiters are spliced in only if BOTH boffsets and delims are defined.
##  + Returns: the list ($freqs,$ids), restricted to entries with nonzero frequency
##     $freqs : frequencies as returned by rlevec() on the sorted n-gram vectors
##     $ids   : the corresponding n-gram vectors, reshaped to the dimensions
##              the n-gram pdl had before it was clumped for sorting
##  + Errors: barf()s with a usage message if $toks is undefined
sub ng_cofreq {
  my ($toks,%args) = @_;
  ##
  ##-- sanity checks
  barf('Usage: ng_cofreq($toks,%args)') if (!defined($toks));
  my @adims = $toks->dims;
  ##-- strip the trailing (N,NToks) pair; @adims keeps only the leading attribute dims
  my ($N) = splice(@adims, $#adims-1, 2);
  ##
  ##-- splice in some delimiters (maybe)
  my ($dtoks);
  if (defined($args{boffsets}) && defined($args{delims})) {
    ##-- slice spec "*d1,*d2,...," with one entry per attribute dimension;
    ##   NOTE(review): presumably inserts dummy dims so $boffsets matches (@adims,N) -- confirm against PDL::Slices
    my $adslice = (@adims ? join(',', (map {"*$_"} @adims),'') : '');
    ##-- ng_delimit() is documented over the token dimension only (toks(NToks)), so the
    ##   token dimension is moved to the front for the call and moved back afterwards
    $dtoks = ng_delimit($toks->mv(-1,0),
                        $args{boffsets}->slice(",${adslice}*$N"),
                        $args{delims}->mv(-1,0),
                       )->mv(0,-1);
  } else {
    $dtoks = $toks;
  }
  ##
  ##-- rotate components (maybe)
  my $ngvecs = ($args{norotate} ? $dtoks : ng_rotate($dtoks));
  ##
  ##-- sort 'em & count 'em
  my @ngvdims = $ngvecs->dims;
  ##
  ## RT bug #108472 (t/04_cofreq.t; PDL-Ngrams v0.05003, PDL v2.0.14, Thu, 05 Nov 2015 10:28:13 +0100):
  ##  + original line (v0.05003): $ngvecs = $ngvecs->clump(-2)->vv_qsortvec();
  ##  + Error message: 'Probably false alloc of over 1Gb PDL! (set $PDL::BIGPDL = 1 to enable)'
  ##  + CASE 1:
  ##    - input $ngvecs has dims [2,13]
  ##    - $ngvecs->clump(-2) should also have dims [2,13], but winds up with dims [1,0,0,2,13], which is just bizarre
  ##  + CASE 2:
  ##    - $ngvecs has dims [3,2,13]
  ##    - $ngvecs->clump(-2) should have dims [6,13], but gets dims [1,0,0,2,13],
  ##      which apparently leads to 'false alloc' error in later comparisons
  ##  + workaround: pass clump() the non-negative argument (1+ndims-2) == (ndims-1): this seems to work
  $ngvecs = $ngvecs->clump($ngvecs->ndims-1)->vv_qsortvec();
  my ($ngfreq,$ngelts) = rlevec($ngvecs);
  my $ngwhich = which($ngfreq);          ##-- indices of nonzero frequencies
  ##
  ##-- reshape results (using @ngvdims)
  $ngelts = $ngelts->reshape(@ngvdims);
  ##
  ##.... and return
  return ($ngfreq->index($ngwhich), $ngelts->dice_axis(-1,$ngwhich));
}
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
##====================================================================== |
159
|
|
|
|
|
|
|
## N-Gram construction: rotation |
160
|
|
|
|
|
|
|
=pod |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head2 ng_rotate |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Signature: (toks(@adims,N,NToks); [o]rtoks(@adims,N,NToks-N+1)) |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Create a co-occurrence matrix by rotating a (delimited) token vector $toks(). |
167
|
|
|
|
|
|
|
Returns a matrix $rtoks() suitable for passing to ng_cofreq(). |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
*PDL::ng_rotate = \&ng_rotate;

## $rtoks = ng_rotate($toks)
## $rtoks = ng_rotate($toks,$rtoks)
##  + Purpose:
##     Build a co-occurrence matrix from the (delimited) token pdl $toks(@adims,N,NToks):
##     component $i along the N dimension is rotated by -$i along the token dimension.
##  + Arguments:
##     $toks  : token pdl; the last two dimensions are (N,NToks)
##     $rtoks : optional work/output pdl; if undefined, a zero pdl with the type and
##              dimensions of $toks is allocated.
##              NOTE(review): a caller-supplied $rtoks is assigned to with the full
##              dimensions of $toks, so it presumably has to be that large -- confirm.
##  + Returns: a slice of $rtoks keeping indices 0..(NToks-N) of the token dimension,
##     i.e. a pdl with dims (@adims,N,NToks-N+1)
##  + Errors: barf()s with a usage message if $toks is undefined
sub ng_rotate {
  my ($toks,$rtoks) = @_;

  barf("Usage: ng_rotate (toks(NAttrs,N,NToks), [o]rtoks(NAttrs,N,NToks-N+1))")
    if (!defined($toks));

  my @adims = $toks->dims();
  $rtoks = zeroes($toks->type, @adims) if (!defined($rtoks));
  my $N = $adims[-2];   ##-- size of the second-to-last dimension (n-gram length)

  foreach my $i (0..($N-1)) {
    ##-- the following line pukes on cpan testers 5.15.x with: "Can't modify non-lvalue subroutine call at ..."
    #$rtoks->dice_axis(-2,$i) .= $toks->dice_axis(-2,$i)->xchg(-1,0)->rotate(-$i)->xchg(0,-1);
    ##
    ##-- ... so assign through a named temporary instead
    my $rtoks_i = $rtoks->dice_axis(-2,$i);
    ##-- rotate() is applied with the token dimension swapped to the front, then swapped back
    $rtoks_i .= $toks->dice_axis(-2,$i)->xchg(-1,0)->rotate(-$i)->xchg(0,-1);
  }

  ##-- keep only indices 0..(NToks-N) along the token dimension
  $rtoks = $rtoks->xchg(-1,0)->slice("0:-$N")->xchg(-1,0);

  return $rtoks;
}
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
##====================================================================== |
197
|
|
|
|
|
|
|
## Delimit / Splice |
198
|
|
|
|
|
|
|
=pod |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head1 Delimiter Insertion and Removal |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
The following functions can be used to add or remove delimiters to a PDL vector. |
203
|
|
|
|
|
|
|
This can be useful to add or remove beginning- and/or end-of-word markers to rsp. |
204
|
|
|
|
|
|
|
from a PDL vector, before rsp. after constructing a vector of N-gram vectors. |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
209
|
|
|
|
|
|
|
## ng_delimit() |
210
|
|
|
|
|
|
|
=pod |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 ng_delimit |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=for sig |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
Signature: (toks(NToks); indx boffsets(NBlocks); delims(NDelims); [o]dtoks(NDToks)) |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Add block-delimiters (e.g. BOS,EOS) to a vector of raw tokens. |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
See L<PDL::Ngrams::Utils/ng_delimit>. |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=cut |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
225
|
|
|
|
|
|
|
## ng_undelimit() |
226
|
|
|
|
|
|
|
=pod |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=head2 ng_undelimit |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Signature: (dtoks(NDToks); indx boffsets(NBlocks); int NDelims(); [o]toks(NToks)) |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
Remove block-delimiters (e.g. BOS,EOS) from a vector of delimited tokens. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
See L<PDL::Ngrams::Utils/ng_undelimit>. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
1; ##-- make perl happy |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
##====================================================================== |
243
|
|
|
|
|
|
|
## pod: Functions: low-level |
244
|
|
|
|
|
|
|
=pod |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=head2 Low-Level Functions |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Some additional low-level functions are provided in the |
249
|
|
|
|
|
|
|
PDL::Ngrams::ngutils |
250
|
|
|
|
|
|
|
package. |
251
|
|
|
|
|
|
|
See L<PDL::Ngrams::ngutils> for details. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=cut |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
##====================================================================== |
256
|
|
|
|
|
|
|
## pod: Footer |
257
|
|
|
|
|
|
|
=pod |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
perl by Larry Wall. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head1 AUTHOR |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Bryan Jurish E<lt>moocow@cpan.orgE<gt> |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head1 COPYRIGHT |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Copyright (c) 2007-2015, Bryan Jurish. All rights reserved. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This package is free software. You may redistribute it |
274
|
|
|
|
|
|
|
and/or modify it under the same terms as Perl itself. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head1 SEE ALSO |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
perl(1), PDL(3perl), PDL::Ngrams::ngutils(3perl) |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=cut |