package Text::Summarizer;

use v5.10.0;
use strict;
use warnings;
use Moo;
use Types::Standard qw/ Bool Ref Str Int Num InstanceOf /;
use List::AllUtils qw/ max min sum sum0 singleton /;
use Algorithm::CurveFit;
use utf8;

binmode STDOUT, ':encoding(UTF-8)';

use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);

require Exporter;
@ISA         = qw(Exporter);
@EXPORT      = qw();
@EXPORT_OK   = qw();
%EXPORT_TAGS = (all => [@EXPORT_OK]);
$VERSION     = '1.053';

has permanent_path => (
    is      => 'rw',
    isa     => Str,
    default => 'data/permanent.stop',
);

has stopwords_path => (
    is      => 'rw',
    isa     => Str,
    default => 'data/stopwords.stop',
);

has articles_path => (
    is      => 'rw',
    isa     => Str,
    default => 'articles/*',
);

has store_scanner => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has print_scanner => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has print_summary => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has return_count => (
    is      => 'rw',
    isa     => Num,
    default => 20,
);

has phrase_thresh => (
    is      => 'rw',
    isa     => Int,
    default => 2,
);

has phrase_radius => (
    is      => 'rw',
    isa     => Int,
    default => 5,
);

has freq_constant => (
    is      => 'rw',
    isa     => Num,
    default => 0.004,
);

has watch_count => (
    is      => 'rwp',
    isa     => Int,
    default => 0,
);

has stopwords => (
    is  => 'lazy',
    isa => Ref['HASH'],
);

has watchlist => (
    is   => 'rwp',
    isa  => Ref['HASH'],
    lazy => 1,
);

has article_length => (
    is      => 'rwp',
    isa     => Int,
    default => 0,
    lazy    => 1,
);

has full_text => (
    is  => 'rwp',
    isa => Str,
);

has sentences => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has sen_words => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has word_list => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has freq_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has clst_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has phrs_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has sigma_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has inter_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has score_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has phrs_list => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has frag_list => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has summary => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has file_name => (
    is  => 'rwp',
    isa => Str,
);

has text_hint => (
    is  => 'rwp',
    isa => Str,
);

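# Construction sketch: every attribute above has a usable default, so a bare
# new() works. The values below are illustrative, not prescriptive:
#
#   use Text::Summarizer;
#
#   my $summarizer = Text::Summarizer->new(
#       articles_path => 'articles/*',   #glob of articles for scan_each/summarize_each
#       return_count  => 10,             #how many items print_summary reports per chart
#       print_summary => 1,
#   );
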
sub _build_watchlist {
    my $self = shift;
    my %watchlist;

    open( my $stopwords_file, '<', $self->stopwords_path )
        or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
    chomp and $watchlist{ $_ } = 1 for (<$stopwords_file>);
    close $stopwords_file;

    return \%watchlist;
}

sub _build_stopwords {
    my $self = shift;
    my %stopwords;

    open( my $permanent_file, '<', $self->permanent_path )
        or die "Can't open stopwords permanent file " . $self->permanent_path . ": $!";
    chomp and $stopwords{ $_ } = 1 for (<$permanent_file>);
    close $permanent_file;

    open( my $stopwords_file, '<', $self->stopwords_path )
        or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
    chomp and $stopwords{ $_ } = 1 for (<$stopwords_file>);
    close $stopwords_file;

    return \%stopwords;
}

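# Note on file format: both stopword files are plain text, one token per line;
# each chomped line becomes a key of the hash built above. A minimal
# data/permanent.stop might read (contents illustrative):
#   the
#   and
#   of
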
sub _store_stopwords {
    my $self = shift;

    #bail out before opening the file for writing, so that an existing
    # stopwords file is not truncated when storing is disabled
    return $self unless $self->store_scanner;

    open( my $stopwords_file, '>', $self->stopwords_path )
        or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
    print $stopwords_file "$_\n" for sort keys %{ $self->watchlist };
    close $stopwords_file;

    return $self;
}

sub scan_text {
    my ($self, $text, $path) = @_;

    $self->_set_file_name( '' );
    $self->_set_text_hint( '' );

    if ( ref $text ) {  #a reference is treated as an open filehandle
        $self->_set_file_name( $path );
        $text = join "\n" => <$text>;
    }

    $self->_set_text_hint( '"' . substr($text,0,50) . '...' . substr($text,-30) . '"' );
    $self->tokenize( $text );  #breaks the provided text into sentences and individual words

    $self->_build_stopwords;
    $self->_build_freq_hash;
    $self->_build_clst_hash;
    $self->_build_phrs_hash;
    $self->_build_sigma_hash;
    $self->_build_frag_list;

    $self->develop_stopwords;  #analyzes the frequency and clustering of words within the provided text
    $self->_store_stopwords;

    return $self->watchlist;
}

sub scan_file {
    my ($self, $file_path) = @_;

    open( my $file, '<:encoding(UTF-8)', $file_path )
        or die "Can't open file $file_path for scanning: $!";

    return $self->scan_text( $file, $file_path );
}

sub scan_each {
    my ($self, $dir_path) = @_;
    return map { $self->scan_file( $_ ) } glob( $dir_path // $self->articles_path );
}

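# Scanning sketch: scan_file() opens a UTF-8 file and returns the updated
# watchlist hashref; scan_each() maps it over a glob (paths illustrative):
#
#   my $watchlist  = $summarizer->scan_file('articles/sample.txt');
#   my @watchlists = $summarizer->scan_each('articles/*.txt');
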
sub summarize_text {
    my ($self, $text, $path) = @_;

    $self->_set_file_name( '' );

    if ( ref $text ) {  #a reference is treated as an open filehandle
        $self->_set_file_name( $path );
        $text = join "\n" => <$text>;
    }

    $self->_set_text_hint( '"' . substr($text,0,50) . '...' . substr($text,-30) . '"' );
    $self->tokenize( $text );  #breaks the provided text into sentences and individual words

    $self->_build_stopwords;
    $self->_build_freq_hash;
    $self->_build_clst_hash;
    $self->_build_phrs_hash;
    $self->_build_sigma_hash;
    $self->_build_frag_list;

    $self->analyze_phrases;  #analyzes the frequency and clustering of words within the provided text

    return $self->summary;
}

#summarizing extracts common phrase fragments from a given text file
sub summarize_file {
    my ($self, $file_path) = @_;

    open( my $file, '<:encoding(UTF-8)', $file_path )
        or die "Can't open file $file_path for summarizing: $!";

    return $self->summarize_text( $file, $file_path );
}

sub summarize_each {
    my ($self, $dir_path) = @_;
    return map { $self->summarize_file( $_ ) } glob( $dir_path // $self->articles_path );
}

#short-form aliases for the summarize_* methods
sub summ_text { return shift->summarize_text(@_); }
sub summ_file { return shift->summarize_file(@_); }
sub summ_each { return shift->summarize_each(@_); }

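# Summarizing sketch: summarize_file() returns the summary hashref built by
# analyze_phrases(), keyed on 'sentences', 'fragments', and 'words', each
# mapping an item to its score (path illustrative):
#
#   my $summary = $summarizer->summ_file('articles/sample.txt');
#   my ($top)   = sort { $summary->{sentences}{$b} <=> $summary->{sentences}{$a} }
#                      keys %{ $summary->{sentences} };
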
sub tokenize {
    my ( $self, $text ) = @_;

    my $full_text = $text;  #contains the full body of text
    #splits *full_text* into sentences at terminal punctuation and at blank
    # lines; the sentence-boundary lookbehind was truncated in the source
    # listing, so the first alternative below is a plausible reconstruction
    my @sentences = split qr/(?| (?<= [.!?] ) \s+ (?= [A-Z] )
                                | (?: \n+ | ^\s+ | \s+$ )
                              )/mx => $full_text;  # array of sentences

    my @word_list;  #array of all the words in the entire text body
    my @sen_words;  #array of array-refs holding the tokens of each sentence
    for (@sentences) {  #creates an array of each word in the current sentence
        my @words = map { /\b (?: \w \. (?: ['’-] \w+ )?)+ | (?: \w+ ['’-]? )+ (?=\s|\b)/gx } lc $_;
            #tokenizes each sentence into complete words (single-quotes are considered part of the word they attach to)
        push @word_list => @words;
        push @sen_words => \@words;
    }

    $self->_set_article_length( scalar @word_list );  #the total number of words in the article

    $self->_set_full_text( $full_text );
    $self->_set_sentences( \@sentences );
    $self->_set_word_list( \@word_list );
    $self->_set_sen_words( \@sen_words );

    return $self;
}

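# Tokenization example (illustrative): the lowercased input
#   "the u.s.a. won't move."
# tokenizes to ('the', 'u.s.a.', "won't", 'move'); dotted abbreviations
# survive via the first alternative of the word pattern, and apostrophes
# bind to the word they attach to.
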
sub _build_freq_hash {
    my $self = shift;

    #estimates a minimum threshold of occurrence for frequently occurring words
    # ('||' rather than '//' so that very short articles fall back to a threshold of 1)
    my $min_freq_thresh = int($self->article_length * $self->freq_constant) || 1;
    my %freq_hash;  #counts the number of times each word appears in the *word_list* array
    for my $word (@{$self->word_list}) {
        $freq_hash{$word}++ unless $self->stopwords->{$word};
    }
    delete $freq_hash{$_} for grep { $freq_hash{$_} < $min_freq_thresh } keys %freq_hash;
        #remove words that appear fewer than *$min_freq_thresh* times
    $self->_set_freq_hash( \%freq_hash );

    return $self;
}

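# Threshold example (illustrative): a 5,000-word article with the default
# freq_constant of 0.004 gives int(5000 * 0.004) = 20, so only words
# appearing at least 20 times survive the pruning above.
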
sub _build_clst_hash {
    my $self = shift;

    my (%cluster_hash, %cluster_count);
    my $abs_pos = 0;
    for my $sen_index (0..scalar @{$self->sentences} - 1) {  #gives the index of each sentence in the article
        my @sen_words = @{$self->sen_words->[$sen_index]};  #the words of the given sentence

        for my $position (0..scalar @sen_words - 1) {  #iterates across each word in the sentence
            $abs_pos++;

            if ( exists $self->freq_hash->{$sen_words[$position]} ) {  #true if the word at index *position* appears in the *freq_hash*
                my %word = ( abs => $abs_pos, sen => $sen_index, rel => $position, cnt => $cluster_count{$sen_words[$position]}++ );
                    # hash-vector of the following elements:
                    #   abs => absolute position of the current word within the entire token-stream
                    #   sen => the index of the current sentence
                    #   rel => position of the current word within the current sentence
                    #   cnt => number of times the given word has appeared in the text so far
                push @{$cluster_hash{$sen_words[$position]}} => \%word;
            }
        }
    }

    $self->_set_clst_hash( \%cluster_hash );

    return $self;
}

sub _build_phrs_hash {
    my $self = shift;

    #create long-form phrases around frequently used words by tracking forward and backward *phrase_radius* tokens from any given *c_word*
    my %phrase_hash;
    for my $c_word (keys %{$self->clst_hash}) {
        for my $c_vector (@{$self->clst_hash->{$c_word}}) {

            my ($sen, $pos, $cnt) = @$c_vector{'sen', 'rel', 'cnt'};
                # *sen* indicates which sentence the current *c_word* appears in
                # *pos* indicates the position of the *c_word* within that sentence (see above)
                # *cnt* counts the total number of times the word has been detected thus far

            my @phrase = [ @{$self->sen_words->[$sen]}[ max($pos - $self->phrase_radius, 0) .. min($pos + $self->phrase_radius, scalar(@{$self->sen_words->[$sen]}) - 1) ] ];
                #array slice containing only tokens within *phrase_radius* of the *c_word* within the given sentence

            unshift @phrase => \$self->sentences->[$sen];  #begins the *phrase* array with a complete, unedited sentence (for reference only)
            push @{$phrase_hash{$c_word}} => \@phrase if scalar @{$phrase[-1]} > $self->phrase_thresh + 1;
                #the *phrase_hash* only keeps a given *phrase* array if it is longer than *phrase_thresh* + 1 tokens (3 with the default threshold of 2)
        }
    }

    $self->_set_phrs_hash( \%phrase_hash );

    return $self;
}

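# Window example (illustrative): with the default phrase_radius of 5, a
# *c_word* at position 8 of a 20-word sentence yields the slice [3..13],
# eleven tokens; at position 1 the window clamps to [0..6].
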
sub _build_sigma_hash {
    my $self = shift;

    #determine the population standard deviation of the gaps between occurrences of each clustered word
    my %sigma_hash;
    for my $c_word (keys %{$self->clst_hash}) {
        for my $c_vector (@{$self->clst_hash->{$c_word}}) {

            #create a list of the distances between successive instances of the current *c_word*
            my %dist_list;
            my ($L_pos, $R_pos);
            for (my $i = 0; $i < scalar @{$self->clst_hash->{$c_word}}; $i++) {
                $R_pos = $self->clst_hash->{$c_word}->[$i]->{abs};

                my $dist = $R_pos - ($L_pos // $R_pos);
                push @{$dist_list{$c_word}} => $dist if $dist >= 0;

                $L_pos = $R_pos;
            }

            #used only to determine the *sigma* score (population standard deviation) of the given *c_word*;
            # '||' (not the low-precedence 'or') guards the division against an empty distance list
            my $pop_size = scalar @{$dist_list{$c_word}} || 1;
            my $pop_ave  = sum0( @{$dist_list{$c_word}} ) / $pop_size;
            $sigma_hash{$c_word} = int sqrt( sum0( map { ($_ - $pop_ave)**2 } @{$dist_list{$c_word}} ) / $pop_size );  #pop. std. deviation
        }
    }
    $self->_set_sigma_hash( \%sigma_hash );

    return $self;
}

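# Sigma example (illustrative): a word at absolute positions 10, 14, and 30
# collects gaps (0, 4, 16); their mean is 20/3, and
#   int sqrt( ((0 - 20/3)**2 + (4 - 20/3)**2 + (16 - 20/3)**2) / 3 ) = 6
# becomes that word's sigma score.
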
sub _build_frag_list {
    my $self = shift;

    my @frag_list;
    F_WORD: for my $f_word (keys %{$self->phrs_hash}) {

        #find common phrase-fragments
        my (@hash_list, %sums_hash, %words_count);
            #*hash_list* contains ordered, formatted lists of each word in the phrase fragment;
            #*sums_hash* contains the total number of times each word appears in all phrases for the given *f_word*
        ORDER: for my $phrase (@{$self->phrs_hash->{$f_word}}) {
            my $sentence_ref = $phrase->[0];
            my %ordered_words = map { $sums_hash{$phrase->[-1]->[$_]}++; ($_ => $phrase->[-1]->[$_]) } (0..scalar @{$phrase->[-1]} - 1);
                # *ordered_words* maps each word of the given phrase fragment to its index, e.g.:
                #   0 => 'some'
                #   1 => 'word'
                #   2 => 'goes'
                #   3 => 'here'
            $words_count{$_}++ for values %ordered_words;
            push @hash_list => { f_word => $f_word, sentence => $sentence_ref, counts => \%words_count, ordered => \%ordered_words };
        }

        #removes each word from the *word_hash* unless it occurs more than once amongst all phrases
        SCRAP: for my $word_hash (@hash_list) {
            for my $index ( keys %{$word_hash->{'ordered'}} ) {
                delete $word_hash->{'ordered'}->{$index} unless $sums_hash{$word_hash->{'ordered'}->{$index}} > 1;
            }
        }

        #break phrase fragments into "scraps" (consecutive runs of words within the fragment)
        FRAG: for my $word_hash (@hash_list) {
            my (%L_scrap, %R_scrap);  #a "scrap" is a sub-fragment
            my ($prev, $curr, $next) = (-1,0,0);  #used to find consecutive sequences of words
            my $real = 0;  #flag for stopwords identification

            my @word_keys = sort { $a <=> $b } keys %{$word_hash->{'ordered'}};  # *word_keys* contains a sorted series of index-values

            for (my $i = 0; $i < scalar @word_keys; $i++ ) {
                $curr = $word_keys[$i];
                $next = $word_keys[$i+1] if $i < scalar @word_keys - 1;  #the guard prevents an out-of-bounds read

                if ( $next == $curr + 1 or $curr == $prev + 1 ) {
                    unless ($curr == $prev + 1) {  #resets *R_scrap* when the *curr* index skips over a number (i.e. a new scrap is encountered)
                        %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap;  #chooses the longest or most recent scrap
                        %R_scrap = ();  #resets the *R_scrap*
                    }
                    $R_scrap{$curr} = $word_hash->{'ordered'}->{$curr};
                    $real = 1 unless $self->stopwords->{$R_scrap{$curr}};  #ensures that scraps consisting only of stopwords are ignored
                } else {
                    %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap;  #chooses the longest or most recent scrap
                    %R_scrap = ();  #resets the *R_scrap*
                }
                $prev = $curr;
            }
            %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap;  #chooses the longest or most recent scrap
            %R_scrap = ();  #resets the *R_scrap*
            push @frag_list => { %{$word_hash}, scrap => \%L_scrap } if $real and scalar keys %L_scrap >= $self->phrase_thresh;
        }
    }

    $self->_set_frag_list( \@frag_list );

    return $self;
}

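# Scrap example (illustrative): if the surviving indices of a fragment are
# (2, 3, 4, 7), the consecutive run (2, 3, 4) is collected into a scrap while
# the isolated index 7 starts none; the run is kept as *L_scrap* provided it
# contains a non-stopword and spans at least *phrase_thresh* words.
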
sub develop_stopwords {
    my $self = shift;

    my %score_hash;  #*score_hash* contains score values for words found in the collected phrases

    $score_hash{$_}++ for keys %{$self->phrs_hash};

    JOIN: for my $fragment (@{$self->frag_list}) {
        #compile scraps for scoring
        my $scrap = join ' ' => map { $score_hash{$fragment->{'scrap'}->{$_}}++;
                                      $fragment->{'scrap'}->{$_} } sort { $a <=> $b } keys %{$fragment->{'scrap'}};

        for my $word (split ' ' => $scrap) {
            $score_hash{$word} += $self->freq_hash->{$word}      // 0;
            $score_hash{$word} += $self->sigma_hash->{$word}     // 0;
            $score_hash{$word} -= $fragment->{'counts'}->{$word} // 0;
        }
    }

    delete $score_hash{$_} for grep { $self->stopwords->{$_} } keys %score_hash;  #known stopwords are never candidates

    my @word_keys = sort { $score_hash{$b} <=> $score_hash{$a} or $a cmp $b } keys %score_hash;
    my $highest = $score_hash{$word_keys[0]};
    my $longest = max map { length } @word_keys;

    $score_hash{$_} = 40 * $score_hash{$_} / $highest for keys %score_hash;  #scales all scores onto a 0-40 range
    @word_keys = reverse grep { $score_hash{$_} >= 1 } @word_keys;

    #Tukey-style fences: quartiles of the scaled scores define an acceptance band
    my @scores = map { $score_hash{$_} } @word_keys;
    my @low  = @scores[ 0..(int scalar @scores / 2 - 1.5) ];
    my @high = @scores[ (int scalar @scores / 2 + 1)..(int scalar @scores - 1) ];
    my @LM = @low[ (int scalar @low / 2 - 0.5)..(int scalar @low / 2) ];
    my @UM = @high[ (int scalar @high / 2 - 0.5)..(int scalar @high / 2) ];
    my $Q1 = sum( @LM ) / scalar @LM;
    my $Q3 = sum( @UM ) / scalar @UM;
    my $IQR = $Q3 - $Q1;
    my $lower = $Q1;
    my $upper = $Q3 + 1.5 * $IQR;

    my @graph_data = grep { $_ > $lower and $_ < $upper } map { $score_hash{$_} } @word_keys;
    my $n = scalar @graph_data;

    if ($n > 4) {
        my $average = sum( @graph_data ) / $n;
        my @xdata = 1..$n;        #the rank of each retained score
        my @ydata = @graph_data;  #the retained scores themselves
        my $max_iter = 100;       #maximum iterations for the fit
        my @params_line = (
            # Name  Guess      Accuracy
            ['a',   0,         0.00001],
            ['b',   $average,  0.00001],
            ['c',   $highest,  0.00001],
        );
        Algorithm::CurveFit->curve_fit(
            formula => 'a + b * x + c * x^2',  #fits a quadratic to the rank/score curve
            params  => \@params_line,
            xdata   => \@xdata,
            ydata   => \@ydata,
            maximum_iterations => $max_iter,
        );
        my ($a, $b, $c) = ($params_line[0]->[1], $params_line[1]->[1], $params_line[2]->[1]);

        my %watchlist = %{ $self->watchlist // {} };
        KEY: for my $index ( reverse 1..scalar @word_keys ) {
            my $score = $a + $b * $index + $c * $index**2;
            $watchlist{$word_keys[$index - 1]}++ if $score >= $lower and $score < $score_hash{$word_keys[$index - 1]};
        }
        $self->_set_watchlist( \%watchlist );

        if ($self->print_scanner) {
            say "\n\n———————————————————————————————————————————\n\n";

            say "[file name] " . $self->file_name if $self->file_name;
            say "[text hint] " . $self->text_hint;

            say "\n---SCANNER GRAPHS---\n";

            say "KNOWN:";
            KNOWN: for my $index ( reverse 0..scalar @word_keys - 1 ) {
                my $format = "%" . $longest . "s|%s\n";
                my $score  = $score_hash{$word_keys[$index]};

                my $score_string = sprintf " %5.2f |" => $score;
                for (0..max($score, $upper)) {
                    if ($score > $lower and $score < $upper) {
                        $score_string .= '+' if $_ <= $score;
                    } else {
                        $score_string .= ']' if $_ == int $upper;
                        $score_string .= '-' if $_ <= int $score;
                        $score_string .= ' ' if $_ >  int $score;
                        $score_string .= '[' if $_ == int $lower;
                    }
                }

                printf $format => ($word_keys[$index], $score_string);
            }
            printf "\n[whiskers] lower = %.2f; upper = %.2f\n\n" => ($lower, $upper);

            say "CALCULATED:";
            CALC: for my $index ( reverse 1..scalar @word_keys ) {
                my $format = "%" . $longest . "s|%s\n";
                my $score  = $a + $b * $index + $c * $index**2;
                my $score_string = sprintf " %5.2f |%s" => $score, ($score >= $lower and $score < $score_hash{$word_keys[$index - 1]} ? '-' x $score : '+' x $score);
                printf $format => $word_keys[$index - 1], $score_string;
            }
        }
    }

    return $self;
}

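# Fence example (illustrative): for the seven ascending scaled scores
# (2, 4, 6, 10, 12, 20, 40), the lower-half median gives Q1 = 4 and the
# upper-half median gives Q3 = 20, so IQR = 16 and the acceptance band runs
# from lower = 4 to upper = 20 + 1.5 * 16 = 44; only scores strictly inside
# that band feed the curve fit.
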
sub analyze_phrases {
    my $self = shift;

    #find common phrase-fragments
    my (%inter_hash, %score_hash, %bare_phrase, %full_phrase);
        #*inter_hash* contains phrase fragments; *score_hash* contains score values for words in those phrases
    F_WORD: for my $f_word (keys %{$self->phrs_hash}) {

        #compile scraps for scoring
        JOIN: for my $fragment (@{$self->frag_list}) {
            my $scrap = join ' ' => map { $score_hash{$_}++;
                                          $fragment->{'scrap'}->{$_} } sort { $a <=> $b } keys %{$fragment->{'scrap'}};
            my @bare = map  { $fragment->{'scrap'}->{$_} }
                       grep { !$self->stopwords->{$fragment->{'scrap'}->{$_}} }
                       sort { $a <=> $b } keys %{$fragment->{'scrap'}};

            $score_hash{$f_word}++;  #scores each *f_word*
            $inter_hash{$scrap}++;   #counts the final *L_scrap*

            my $score = 1;
            for my $word (split ' ' => $scrap) {
                $score += $self->freq_hash->{$word}  // 0;
                $score += $self->sigma_hash->{$word} // 0;
                $score += $score_hash{$word}         // 0;
            }

            $full_phrase{ ${$fragment->{'sentence'}} } += $score;  #scores the full sentence from which the *L_scrap* was drawn
            $bare_phrase{ $scrap } = \@bare if scalar @bare;  #the final *L_scrap* without any stopwords
        }
    }

    #each phrase's score is multiplied by the compound score of each word within the phrase
    for my $scrap (keys %inter_hash) {
        for my $word (split ' ' => $scrap) {
            my $score = 1;
            $score += $self->freq_hash->{$word}  // 0;
            $score += $self->sigma_hash->{$word} // 0;
            $score += $score_hash{$word}         // 0;

            $inter_hash{$scrap} *= $score;
        }
    }

    #combine scraps — if scrap "a" contains scrap "b", add the value of "b" to "a" and delete "b"
    CLEAR: for my $scrap (sort { $inter_hash{$b} <=> $inter_hash{$a} or $a cmp $b } keys %inter_hash) {
        my $compare = qr/\b$scrap\b/;
        TEST: for my $test (keys %inter_hash) {
            if ($test ne $scrap) {
                if ($test =~ /$compare/) {  #true iff *scrap* ∈ *test*
                    $inter_hash{$test} += $inter_hash{$scrap};
                    delete $inter_hash{$scrap} and next CLEAR;
                } elsif (not scalar singleton (@{$bare_phrase{$test}}, @{$bare_phrase{$scrap}}) ) {  #true iff *bare_phrase{test}* and *bare_phrase{scrap}* contain the same words
                    next TEST unless scalar @{$bare_phrase{$test}} > 1;

                    my $joined = join '|' => @{$bare_phrase{$test}};
                    $inter_hash{"($joined)"} = $inter_hash{$test} + $inter_hash{$scrap};
                    $inter_hash{$test} += $inter_hash{$scrap};
                    delete $inter_hash{$scrap} and next CLEAR;
                }
            }
        }
    }

    $self->_set_score_hash( \%score_hash );
    $self->_set_inter_hash( \%inter_hash );
    $self->_set_phrs_list( \%full_phrase );

    #builds a summary for the given text, in the form of a hash of hash-refs:
    #   sentences => full sentences from the given article, scored on the scores of the words contained therein
    #   fragments => phrase fragments from the given article, scored as above
    #   words     => all scored words in the article, rated by a three-factor system of
    #                frequency of appearance, population standard deviation, and use in important phrase fragments

    my %sort_list;
    for (keys %{$self->freq_hash}) {
        $sort_list{$_} += $self->freq_hash->{$_}  // 0;
        $sort_list{$_} += $self->sigma_hash->{$_} // 0;
        $sort_list{$_} += $self->score_hash->{$_} // 0;
    }

    my %sentences = map { ($_ => $self->phrs_list->{$_}) } sort { $self->phrs_list->{$b} <=> $self->phrs_list->{$a} } keys %{$self->phrs_list};
    my %fragments = map { ($_ => $self->inter_hash->{$_}) } sort { $self->inter_hash->{$b} <=> $self->inter_hash->{$a} or $a cmp $b } keys %{$self->inter_hash};
    my %singleton = map { ($_ => $sort_list{$_}) } sort { $sort_list{$b} <=> $sort_list{$a} or $a cmp $b } keys %sort_list;

    my %summary = ( sentences => \%sentences, fragments => \%fragments, words => \%singleton );

    $self->_set_summary( \%summary );

    if ($self->print_summary) {
        say "\n\n———————————————————————————————————————————\n\n";

        say "[file name] " . $self->file_name if $self->file_name;
        say "[text hint] " . $self->text_hint;

        say "\n---SUMMARY CHARTS---\n";

        my ($sentences, $fragments, $words) = @{$self->summary}{'sentences','fragments','words'};

        say "SUMMARY:";
        my @sentence_keys = sort { $sentences->{$b} <=> $sentences->{$a} or $a cmp $b } keys %$sentences;
        for my $sen ( @sentence_keys[0..min($self->return_count, scalar @sentence_keys - 1)] ) {
            printf "%4d => %s\n" => $sentences->{$sen}, $sen;
        }
        say "\n";

        say "PHRASES:";
        my @phrase_keys = sort { $fragments->{$b} <=> $fragments->{$a} or $a cmp $b } keys %$fragments;
        for my $phrase ( @phrase_keys[0..min($self->return_count, scalar @phrase_keys - 1)] ) {
            printf "%8d => %s\n" => $fragments->{$phrase}, $phrase;
        }
        say "\n";

        say " WORDS:";
        my @word_keys = sort { $words->{$b} <=> $words->{$a} or $a cmp $b } keys %$words;
        my $highest = $words->{$word_keys[0]};
        my $longest = max map { length } @word_keys;
        KEY: for my $word ( @word_keys[0..min($self->return_count, scalar @word_keys - 1)] ) {
            my $format = "%" . $longest . "s|%s\n";
            my $score  = int(40 * $words->{$word} / $highest);
            printf $format => ( $word, "-" x $score ) if $score > 2;
        }
        say "\n";
    }

    return $self;
}

1;

__END__
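=pod

=head1 NAME

Text::Summarizer - extract scored sentences, phrase fragments, and words from a body of text

=head1 SYNOPSIS

A minimal usage sketch; the file path is illustrative, and the method names
and summary structure are taken from the code above:

    use Text::Summarizer;

    my $summarizer = Text::Summarizer->new;
    my $summary    = $summarizer->summarize_file('articles/sample.txt');

    # $summary holds three hashrefs, each mapping an item to its score:
    #   $summary->{sentences}, $summary->{fragments}, $summary->{words}

=cut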