| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
##-*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## |
|
3
|
|
|
|
|
|
|
## File: PDL::Ngrams.pm |
|
4
|
|
|
|
|
|
|
## Author: Bryan Jurish |
|
5
|
|
|
|
|
|
|
## Description: N-Gram utilities for PDL |
|
6
|
|
|
|
|
|
|
##====================================================================== |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package PDL::Ngrams; |
|
9
|
4
|
|
|
4
|
|
1964381
|
use strict; |
|
|
4
|
|
|
|
|
11
|
|
|
|
4
|
|
|
|
|
115
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
##====================================================================== |
|
12
|
|
|
|
|
|
|
## Export hacks |
|
13
|
4
|
|
|
4
|
|
22
|
use PDL; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
29
|
|
|
14
|
4
|
|
|
4
|
|
12323
|
use PDL::Exporter; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
21
|
|
|
15
|
4
|
|
|
4
|
|
3174
|
use PDL::VectorValued; |
|
|
4
|
|
|
|
|
30582
|
|
|
|
4
|
|
|
|
|
33
|
|
|
16
|
4
|
|
|
4
|
|
3252
|
use PDL::Ngrams::Utils; |
|
|
4
|
|
|
|
|
9
|
|
|
|
4
|
|
|
|
|
60
|
|
|
17
|
|
|
|
|
|
|
our @ISA = qw(PDL::Exporter); |
|
18
|
|
|
|
|
|
|
our @EXPORT_OK = |
|
19
|
|
|
|
|
|
|
( |
|
20
|
|
|
|
|
|
|
(@PDL::Ngrams::Utils::EXPORT_OK), ##-- inherited |
|
21
|
|
|
|
|
|
|
qw(ng_cofreq ng_rotate), |
|
22
|
|
|
|
|
|
|
qw(_ng_qsortvec), ##-- compat |
|
23
|
|
|
|
|
|
|
); |
|
24
|
|
|
|
|
|
|
our %EXPORT_TAGS = |
|
25
|
|
|
|
|
|
|
( |
|
26
|
|
|
|
|
|
|
Func => [@EXPORT_OK], ##-- respect PDL conventions (hopefully) |
|
27
|
|
|
|
|
|
|
); |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
our $VERSION = '0.09'; ##-- use perl-reversion to update |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
##====================================================================== |
|
32
|
|
|
|
|
|
|
## pod: header |
|
33
|
|
|
|
|
|
|
=pod |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 NAME |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
PDL::Ngrams - N-Gram utilities for PDL |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
use PDL; |
|
42
|
|
|
|
|
|
|
use PDL::Ngrams; |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
|
45
|
|
|
|
|
|
|
## Basic Data |
|
46
|
|
|
|
|
|
|
$toks = rint(10*random(10)); |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
|
49
|
|
|
|
|
|
|
## ... stuff happens |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
##====================================================================== |
|
55
|
|
|
|
|
|
|
## Description |
|
56
|
|
|
|
|
|
|
=pod |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
PDL::Ngrams provides basic utilities for tracking N-grams over PDL vectors. |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=cut |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
##====================================================================== |
|
65
|
|
|
|
|
|
|
## pod: Functions |
|
66
|
|
|
|
|
|
|
=pod |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=cut |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##====================================================================== |
|
73
|
|
|
|
|
|
|
## backwards-compatibility aliases |
|
74
|
|
|
|
|
|
|
*PDL::_ng_qsortvec = *_ng_qsortvec = \&PDL::vv_qsortvec; |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
##====================================================================== |
|
77
|
|
|
|
|
|
|
## Run-Length Encoding/Decoding: n-dimensional |
|
78
|
|
|
|
|
|
|
=pod |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 Counting N-Grams over PDLs |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=cut |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
85
|
|
|
|
|
|
|
## ng_cofreq() |
|
86
|
|
|
|
|
|
|
=pod |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 ng_cofreq |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=for sig |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Signature: (toks(@adims,N,NToks); %args) |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Returns: (int [o]ngramfreqs(NNgrams); [o]ngramids(@adims,N,NNgrams)) |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Keyword arguments (optional): |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
norotate => $bool, ##-- if true, $toks() will NOT be rotated along $N |
|
99
|
|
|
|
|
|
|
boffsets => $boffsets(NBlocks) ##-- block-offsets in $toks() along $NToks |
|
100
|
|
|
|
|
|
|
delims => $delims(@adims,N,NDelims) ##-- delimiters to splice in at block boundaries |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Count co-occurrences (esp. N-Grams) over a token vector $toks. |
|
103
|
|
|
|
|
|
|
This function really just wraps ng_delimit(), ng_rotate(), vv_qsortvec(), and rlevec(). |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
*PDL::ng_cofreq = \&ng_cofreq; |
|
108
|
|
|
|
|
|
|
sub ng_cofreq { |
|
109
|
2
|
|
|
2
|
1
|
2823
|
my ($toks,%args) = @_; |
|
110
|
|
|
|
|
|
|
## |
|
111
|
|
|
|
|
|
|
##-- sanity checks |
|
112
|
2
|
50
|
|
|
|
7
|
barf('Usage: ngrams($toks,%args)') if (!defined($toks)); |
|
113
|
2
|
|
|
|
|
8
|
my @adims = $toks->dims; |
|
114
|
2
|
|
|
|
|
58
|
my ($N,$NToks) = splice(@adims, $#adims-1, 2); |
|
115
|
|
|
|
|
|
|
## |
|
116
|
|
|
|
|
|
|
##-- splice in some delimiters (maybe) |
|
117
|
2
|
|
|
|
|
3
|
my ($dtoks); |
|
118
|
2
|
50
|
33
|
|
|
16
|
if (defined($args{boffsets}) && defined($args{delims})) { |
|
119
|
2
|
100
|
|
|
|
5
|
my $adslice = (@adims ? join(',', (map {"*$_"} @adims),'') : ''); |
|
|
1
|
|
|
|
|
5
|
|
|
120
|
|
|
|
|
|
|
$dtoks = ng_delimit($toks->mv(-1,0), |
|
121
|
|
|
|
|
|
|
$args{boffsets}->slice(",${adslice}*$N"), |
|
122
|
2
|
|
|
|
|
25
|
$args{delims}->mv(-1,0), |
|
123
|
|
|
|
|
|
|
)->mv(0,-1); |
|
124
|
|
|
|
|
|
|
} else { |
|
125
|
0
|
|
|
|
|
0
|
$dtoks = $toks; |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
## |
|
128
|
|
|
|
|
|
|
##-- rotate components (maybe) |
|
129
|
2
|
|
|
|
|
23
|
my $NDToks = $dtoks->dim(-1); |
|
130
|
2
|
|
|
|
|
3
|
my ($ngvecs); |
|
131
|
2
|
50
|
|
|
|
6
|
if ($args{norotate}) { $ngvecs=$dtoks; } |
|
|
0
|
|
|
|
|
0
|
|
|
132
|
2
|
|
|
|
|
5
|
else { $ngvecs=ng_rotate($dtoks); } |
|
133
|
|
|
|
|
|
|
## |
|
134
|
|
|
|
|
|
|
##-- sort 'em & count 'em |
|
135
|
2
|
|
|
|
|
7
|
my @ngvdims = $ngvecs->dims; |
|
136
|
|
|
|
|
|
|
## |
|
137
|
|
|
|
|
|
|
## ERRORS on next line (RT bug #108472) for t/04_cofreq.t (PDL-Ngrams v0.05003, PDL v2.0.14, Thu, 05 Nov 2015 10:28:13 +0100) |
|
138
|
|
|
|
|
|
|
## + Error message: 'Probably false alloc of over 1Gb PDL! (set $PDL::BIGPDL = 1 to enable) at ../blib/lib/PDL/Ngrams.pm line 136.' |
|
139
|
|
|
|
|
|
|
## + original line (v0.05003): $ngvecs = $ngvecs->clump(-2)->vv_qsortvec(); |
|
140
|
|
|
|
|
|
|
## + CASE 1: |
|
141
|
|
|
|
|
|
|
## - input $ngvecs has dims [2,13] |
|
142
|
|
|
|
|
|
|
## - $ngvecs->clump(-2) should also have dims [2,13], but winds up with dims [1,0,0,2,13], which is just bizarre |
|
143
|
|
|
|
|
|
|
## + CASE 2: |
|
144
|
|
|
|
|
|
|
## - $ngvecs has dims [3,2,13] |
|
145
|
|
|
|
|
|
|
## - $ngvecs->clump(-2) should have dims [6,13], but gets dims [1,0,0,2,13], which apparently leads to 'false alloc' error in later comparisons |
|
146
|
|
|
|
|
|
|
## + workaround: compute non-negative argument for clump() as (1+$ngvecs->ndims-2): this seems to work |
|
147
|
2
|
|
|
|
|
50
|
$ngvecs = $ngvecs->clump(1+$ngvecs->ndims-2)->vv_qsortvec(); |
|
148
|
2
|
|
|
|
|
108
|
my ($ngfreq,$ngelts) = rlevec($ngvecs); |
|
149
|
2
|
|
|
|
|
10
|
my $ngwhich = which($ngfreq); |
|
150
|
|
|
|
|
|
|
## |
|
151
|
|
|
|
|
|
|
##-- reshape results (using @ngvdims) |
|
152
|
2
|
|
|
|
|
83
|
$ngelts = $ngelts->reshape(@ngvdims); |
|
153
|
|
|
|
|
|
|
## |
|
154
|
|
|
|
|
|
|
##.... and return |
|
155
|
2
|
|
|
|
|
82
|
return ($ngfreq->index($ngwhich), $ngelts->dice_axis(-1,$ngwhich)); |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
##====================================================================== |
|
159
|
|
|
|
|
|
|
## N-Gram construction: rotation |
|
160
|
|
|
|
|
|
|
=pod |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head2 ng_rotate |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Signature: (toks(@adims,N,NToks); [o]rtoks(@adims,N,NToks-N+1)) |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Create a co-occurrence matrix by rotating a (delimited) token vector $toks(). |
|
167
|
|
|
|
|
|
|
Returns a matrix $rtoks() suitable for passing to ng_cofreq(). |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
*PDL::ng_rotate = \&ng_rotate; |
|
172
|
|
|
|
|
|
|
sub ng_rotate { |
|
173
|
5
|
|
|
5
|
1
|
2567
|
my ($toks,$rtoks) = @_; |
|
174
|
|
|
|
|
|
|
|
|
175
|
5
|
50
|
|
|
|
19
|
barf("Usage: ng_rotate (toks(NAttrs,N,NToks), [o]rtoks(NAttrs,N,NToks-N-1))") |
|
176
|
|
|
|
|
|
|
if (!defined($toks)); |
|
177
|
|
|
|
|
|
|
|
|
178
|
5
|
|
|
|
|
21
|
my @adims = $toks->dims(); |
|
179
|
5
|
50
|
|
|
|
165
|
$rtoks = zeroes($toks->type, @adims) if (!defined($rtoks)); |
|
180
|
5
|
|
|
|
|
498
|
my $NToks = pop(@adims); |
|
181
|
5
|
|
|
|
|
9
|
my $N = pop(@adims); |
|
182
|
5
|
|
|
|
|
9
|
my ($i); |
|
183
|
5
|
|
|
|
|
15
|
foreach $i (0..($N-1)) { |
|
184
|
|
|
|
|
|
|
##-- the following line pukes on cpan testers 5.15.x with: "Can't modify non-lvalue subroutine call at ..." |
|
185
|
|
|
|
|
|
|
#$rtoks->dice_axis(-2,$i) .= $toks->dice_axis(-2,$i)->xchg(-1,0)->rotate(-$i)->xchg(0,-1); |
|
186
|
|
|
|
|
|
|
## |
|
187
|
11
|
|
|
|
|
1046
|
my $rtoks_i = $rtoks->dice_axis(-2,$i); |
|
188
|
11
|
|
|
|
|
1084
|
$rtoks_i .= $toks->dice_axis(-2,$i)->xchg(-1,0)->rotate(-$i)->xchg(0,-1); |
|
189
|
|
|
|
|
|
|
} |
|
190
|
5
|
|
|
|
|
751
|
$rtoks = $rtoks->xchg(-1,0)->slice("0:-$N")->xchg(-1,0); |
|
191
|
|
|
|
|
|
|
|
|
192
|
5
|
|
|
|
|
130
|
return $rtoks; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
##====================================================================== |
|
197
|
|
|
|
|
|
|
## Delimit / Splice |
|
198
|
|
|
|
|
|
|
=pod |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head1 Delimiter Insertion and Removal |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
The following functions can be used to add delimiters to, or remove them from, a PDL vector. |
|
203
|
|
|
|
|
|
|
This can be useful for adding beginning- and/or end-of-word markers to a PDL vector |
|
204
|
|
|
|
|
|
|
before constructing a vector of N-gram vectors, or for removing them from it afterwards. |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
209
|
|
|
|
|
|
|
## ng_delimit() |
|
210
|
|
|
|
|
|
|
=pod |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 ng_delimit |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=for sig |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
Signature: (toks(NToks); indx boffsets(NBlocks); delims(NDelims); [o]dtoks(NDToks)) |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Add block-delimiters (e.g. BOS,EOS) to a vector of raw tokens. |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
See L<PDL::Ngrams::Utils/ng_delimit>. |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=cut |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
225
|
|
|
|
|
|
|
## ng_undelimit() |
|
226
|
|
|
|
|
|
|
=pod |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=head2 ng_undelimit |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Signature: (dtoks(NDToks); indx boffsets(NBlocks); int NDelims(); [o]toks(NToks)) |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
Remove block-delimiters (e.g. BOS,EOS) from a vector of delimited tokens. |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
See L<PDL::Ngrams::Utils/ng_undelimit>. |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
1; ##-- make perl happy |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
##====================================================================== |
|
243
|
|
|
|
|
|
|
## pod: Functions: low-level |
|
244
|
|
|
|
|
|
|
=pod |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=head2 Low-Level Functions |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Some additional low-level functions are provided in the |
|
249
|
|
|
|
|
|
|
PDL::Ngrams::ngutils |
|
250
|
|
|
|
|
|
|
package. |
|
251
|
|
|
|
|
|
|
See L<PDL::Ngrams::ngutils> for details. |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=cut |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
##====================================================================== |
|
256
|
|
|
|
|
|
|
## pod: Footer |
|
257
|
|
|
|
|
|
|
=pod |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
perl by Larry Wall. |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head1 AUTHOR |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Bryan Jurish E<lt>moocow@cpan.orgE<gt> |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others. |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Copyright (c) 2007-2015, Bryan Jurish. All rights reserved. |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This package is free software. You may redistribute it |
|
274
|
|
|
|
|
|
|
and/or modify it under the same terms as Perl itself. |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
perl(1), PDL(3perl), PDL::Ngrams::ngutils(3perl) |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=cut |