package Text::Summarizer;

use v5.10.0;
use strict;
use warnings;
use Moo;
use Types::Standard qw/ Bool Ref Str Int Num InstanceOf /;
use List::AllUtils qw/ max min sum sum0 singleton /;
use Algorithm::CurveFit;
use utf8;

binmode STDOUT, ':encoding(UTF-8)';

use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);

require Exporter;
@ISA         = qw(Exporter);
@EXPORT      = qw();
@EXPORT_OK   = qw();
%EXPORT_TAGS = (all => [@EXPORT_OK]);
$VERSION     = '1.053';

has permanent_path => (
    is      => 'rw',
    isa     => Str,
    default => 'data/permanent.stop',
);

has stopwords_path => (
    is      => 'rw',
    isa     => Str,
    default => 'data/stopwords.stop',
);

has articles_path => (
    is      => 'rw',
    isa     => Str,
    default => 'articles/*',
);

has store_scanner => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has print_scanner => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has print_summary => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has return_count => (
    is      => 'rw',
    isa     => Num,
    default => 20,
);

has phrase_thresh => (
    is      => 'rw',
    isa     => Int,
    default => 2,
);

has phrase_radius => (
    is      => 'rw',
    isa     => Int,
    default => 5,
);

has freq_constant => (
    is      => 'rw',
    isa     => Num,
    default => 0.004,
);

has watch_count => (
    is      => 'rwp',
    isa     => Int,
    default => 0,
);

has stopwords => (
    is  => 'lazy',
    isa => Ref['HASH'],
);

has watchlist => (
    is   => 'rwp',
    isa  => Ref['HASH'],
    lazy => 1,
);

has article_length => (
    is      => 'rwp',
    isa     => Int,
    default => 0,
    lazy    => 1,
);

has full_text => (
    is  => 'rwp',
    isa => Str,
);

has sentences => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has sen_words => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has word_list => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has freq_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has clst_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has phrs_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has sigma_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has inter_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has score_hash => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has phrs_list => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has frag_list => (
    is  => 'rwp',
    isa => Ref['ARRAY'],
);

has summary => (
    is  => 'rwp',
    isa => Ref['HASH'],
);

has file_name => (
    is  => 'rwp',
    isa => Str,
);

has text_hint => (
    is  => 'rwp',
    isa => Str,
);

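# Construction sketch: every attribute above has a usable default, so a bare
# new() works. The values below are illustrative, not prescriptive:
#
#   use Text::Summarizer;
#
#   my $summarizer = Text::Summarizer->new(
#       articles_path => 'articles/*',   #glob of articles for scan_each/summarize_each
#       return_count  => 10,             #how many items print_summary reports per chart
#       print_summary => 1,
#   );
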
sub _build_watchlist {
    my $self = shift;
    my %watchlist;

    open( my $stopwords_file, '<', $self->stopwords_path )
        or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
    chomp and $watchlist{ $_ } = 1 for (<$stopwords_file>);
    close $stopwords_file;

    return \%watchlist;
}

sub _build_stopwords {
    my $self = shift;
    my %stopwords;

    open( my $permanent_file, '<', $self->permanent_path )
        or die "Can't open stopwords permanent file " . $self->permanent_path . ": $!";
    chomp and $stopwords{ $_ } = 1 for (<$permanent_file>);
    close $permanent_file;

    open( my $stopwords_file, '<', $self->stopwords_path )
        or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
    chomp and $stopwords{ $_ } = 1 for (<$stopwords_file>);
    close $stopwords_file;

    return \%stopwords;
}

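# Note on file format: both stopword files are plain text, one token per line;
# each chomped line becomes a key of the hash built above. A minimal
# data/permanent.stop might read (contents illustrative):
#   the
#   and
#   of
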
sub _store_stopwords {
    my $self = shift;

    #bail out before opening the file for writing, so that an existing
    # stopwords file is not truncated when storing is disabled
    return $self unless $self->store_scanner;

    open( my $stopwords_file, '>', $self->stopwords_path )
        or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
    print $stopwords_file "$_\n" for sort keys %{ $self->watchlist };
    close $stopwords_file;

    return $self;
}

sub scan_text {
    my ($self, $text, $path) = @_;

    $self->_set_file_name( '' );
    $self->_set_text_hint( '' );

    if ( ref $text ) {  #a reference is treated as an open filehandle
        $self->_set_file_name( $path );
        $text = join "\n" => <$text>;
    }

    $self->_set_text_hint( '"' . substr($text,0,50) . '...' . substr($text,-30) . '"' );
    $self->tokenize( $text );  #breaks the provided text into sentences and individual words

    $self->_build_stopwords;
    $self->_build_freq_hash;
    $self->_build_clst_hash;
    $self->_build_phrs_hash;
    $self->_build_sigma_hash;
    $self->_build_frag_list;

    $self->develop_stopwords;  #analyzes the frequency and clustering of words within the provided text
    $self->_store_stopwords;

    return $self->watchlist;
}

sub scan_file {
    my ($self, $file_path) = @_;

    open( my $file, '<:encoding(UTF-8)', $file_path )
        or die "Can't open file $file_path for scanning: $!";

    return $self->scan_text( $file, $file_path );
}

sub scan_each {
    my ($self, $dir_path) = @_;
    return map { $self->scan_file( $_ ) } glob( $dir_path // $self->articles_path );
}

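# Scanning sketch: scan_file() opens a UTF-8 file and returns the updated
# watchlist hashref; scan_each() maps it over a glob (paths illustrative):
#
#   my $watchlist  = $summarizer->scan_file('articles/sample.txt');
#   my @watchlists = $summarizer->scan_each('articles/*.txt');
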
sub summarize_text {
    my ($self, $text, $path) = @_;

    $self->_set_file_name( '' );

    if ( ref $text ) {  #a reference is treated as an open filehandle
        $self->_set_file_name( $path );
        $text = join "\n" => <$text>;
    }

    $self->_set_text_hint( '"' . substr($text,0,50) . '...' . substr($text,-30) . '"' );
    $self->tokenize( $text );  #breaks the provided text into sentences and individual words

    $self->_build_stopwords;
    $self->_build_freq_hash;
    $self->_build_clst_hash;
    $self->_build_phrs_hash;
    $self->_build_sigma_hash;
    $self->_build_frag_list;

    $self->analyze_phrases;  #analyzes the frequency and clustering of words within the provided text

    return $self->summary;
}

#summarizing extracts common phrase fragments from a given text file
sub summarize_file {
    my ($self, $file_path) = @_;

    open( my $file, '<:encoding(UTF-8)', $file_path )
        or die "Can't open file $file_path for summarizing: $!";

    return $self->summarize_text( $file, $file_path );
}

sub summarize_each {
    my ($self, $dir_path) = @_;
    return map { $self->summarize_file( $_ ) } glob( $dir_path // $self->articles_path );
}

#short-form aliases for the summarize_* methods
sub summ_text { return shift->summarize_text(@_); }
sub summ_file { return shift->summarize_file(@_); }
sub summ_each { return shift->summarize_each(@_); }

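# Summarizing sketch: summarize_file() returns the summary hashref built by
# analyze_phrases(), keyed on 'sentences', 'fragments', and 'words', each
# mapping an item to its score (path illustrative):
#
#   my $summary = $summarizer->summ_file('articles/sample.txt');
#   my ($top)   = sort { $summary->{sentences}{$b} <=> $summary->{sentences}{$a} }
#                      keys %{ $summary->{sentences} };
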
sub tokenize {
    my ( $self, $text ) = @_;

    my $full_text = $text;  #contains the full body of text
    #splits *full_text* into sentences at terminal punctuation and at blank
    # lines; the sentence-boundary lookbehind was truncated in the source
    # listing, so the first alternative below is a plausible reconstruction
    my @sentences = split qr/(?| (?<= [.!?] ) \s+ (?= [A-Z] )
                                | (?: \n+ | ^\s+ | \s+$ )
                              )/mx => $full_text;  # array of sentences

    my @word_list;  #array of all the words in the entire text body
    my @sen_words;  #array of array-refs holding the tokens of each sentence
    for (@sentences) {  #creates an array of each word in the current sentence
        my @words = map { /\b (?: \w \. (?: ['’-] \w+ )?)+ | (?: \w+ ['’-]? )+ (?=\s|\b)/gx } lc $_;
            #tokenizes each sentence into complete words (single-quotes are considered part of the word they attach to)
        push @word_list => @words;
        push @sen_words => \@words;
    }

    $self->_set_article_length( scalar @word_list );  #the total number of words in the article

    $self->_set_full_text( $full_text );
    $self->_set_sentences( \@sentences );
    $self->_set_word_list( \@word_list );
    $self->_set_sen_words( \@sen_words );

    return $self;
}

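# Tokenization example (illustrative): the lowercased input
#   "the u.s.a. won't move."
# tokenizes to ('the', 'u.s.a.', "won't", 'move'); dotted abbreviations
# survive via the first alternative of the word pattern, and apostrophes
# bind to the word they attach to.
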
sub _build_freq_hash {
    my $self = shift;

    #estimates a minimum threshold of occurrence for frequently occurring words
    # ('||' rather than '//' so that very short articles fall back to a threshold of 1)
    my $min_freq_thresh = int($self->article_length * $self->freq_constant) || 1;
    my %freq_hash;  #counts the number of times each word appears in the *word_list* array
    for my $word (@{$self->word_list}) {
        $freq_hash{$word}++ unless $self->stopwords->{$word};
    }
    delete $freq_hash{$_} for grep { $freq_hash{$_} < $min_freq_thresh } keys %freq_hash;
        #remove words that appear fewer than *$min_freq_thresh* times
    $self->_set_freq_hash( \%freq_hash );

    return $self;
}

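# Threshold example (illustrative): a 5,000-word article with the default
# freq_constant of 0.004 gives int(5000 * 0.004) = 20, so only words
# appearing at least 20 times survive the pruning above.
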
sub _build_clst_hash {
    my $self = shift;

    my (%cluster_hash, %cluster_count);
    my $abs_pos = 0;
    for my $sen_index (0..scalar @{$self->sentences} - 1) {  #gives the index of each sentence in the article
        my @sen_words = @{$self->sen_words->[$sen_index]};  #the words of the given sentence

        for my $position (0..scalar @sen_words - 1) {  #iterates across each word in the sentence
            $abs_pos++;

            if ( exists $self->freq_hash->{$sen_words[$position]} ) {  #true if the word at index *position* appears in the *freq_hash*
                my %word = ( abs => $abs_pos, sen => $sen_index, rel => $position, cnt => $cluster_count{$sen_words[$position]}++ );
                    # hash-vector of the following elements:
                    #   abs => absolute position of the current word within the entire token-stream
                    #   sen => the index of the current sentence
                    #   rel => position of the current word within the current sentence
                    #   cnt => number of times the given word has appeared in the text so far
                push @{$cluster_hash{$sen_words[$position]}} => \%word;
            }
        }
    }

    $self->_set_clst_hash( \%cluster_hash );

    return $self;
}

sub _build_phrs_hash {
    my $self = shift;

    #create long-form phrases around frequently used words by tracking forward and backward *phrase_radius* tokens from any given *c_word*
    my %phrase_hash;
    for my $c_word (keys %{$self->clst_hash}) {
        for my $c_vector (@{$self->clst_hash->{$c_word}}) {

            my ($sen, $pos, $cnt) = @$c_vector{'sen', 'rel', 'cnt'};
                # *sen* indicates which sentence the current *c_word* appears in
                # *pos* indicates the position of the *c_word* within that sentence (see above)
                # *cnt* counts the total number of times the word has been detected thus far

            my @phrase = [ @{$self->sen_words->[$sen]}[ max($pos - $self->phrase_radius, 0) .. min($pos + $self->phrase_radius, scalar(@{$self->sen_words->[$sen]}) - 1) ] ];
                #array slice containing only tokens within *phrase_radius* of the *c_word* within the given sentence

            unshift @phrase => \$self->sentences->[$sen];  #begins the *phrase* array with a complete, unedited sentence (for reference only)
            push @{$phrase_hash{$c_word}} => \@phrase if scalar @{$phrase[-1]} > $self->phrase_thresh + 1;
                #the *phrase_hash* only keeps a given *phrase* array if it is longer than *phrase_thresh* + 1 tokens (3 with the default threshold of 2)
        }
    }

    $self->_set_phrs_hash( \%phrase_hash );

    return $self;
}

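# Window example (illustrative): with the default phrase_radius of 5, a
# *c_word* at position 8 of a 20-word sentence yields the slice [3..13],
# eleven tokens; at position 1 the window clamps to [0..6].
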
sub _build_sigma_hash {
    my $self = shift;

    #determine the population standard deviation of the gaps between occurrences of each clustered word
    my %sigma_hash;
    for my $c_word (keys %{$self->clst_hash}) {
        for my $c_vector (@{$self->clst_hash->{$c_word}}) {

            #create a list of the distances between successive instances of the current *c_word*
            my %dist_list;
            my ($L_pos, $R_pos);
            for (my $i = 0; $i < scalar @{$self->clst_hash->{$c_word}}; $i++) {
                $R_pos = $self->clst_hash->{$c_word}->[$i]->{abs};

                my $dist = $R_pos - ($L_pos // $R_pos);
                push @{$dist_list{$c_word}} => $dist if $dist >= 0;

                $L_pos = $R_pos;
            }

            #used only to determine the *sigma* score (population standard deviation) of the given *c_word*;
            # '||' (not the low-precedence 'or') guards the division against an empty distance list
            my $pop_size = scalar @{$dist_list{$c_word}} || 1;
            my $pop_ave  = sum0( @{$dist_list{$c_word}} ) / $pop_size;
            $sigma_hash{$c_word} = int sqrt( sum0( map { ($_ - $pop_ave)**2 } @{$dist_list{$c_word}} ) / $pop_size );  #pop. std. deviation
        }
    }
    $self->_set_sigma_hash( \%sigma_hash );

    return $self;
}

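# Sigma example (illustrative): a word at absolute positions 10, 14, and 30
# collects gaps (0, 4, 16); their mean is 20/3, and
#   int sqrt( ((0 - 20/3)**2 + (4 - 20/3)**2 + (16 - 20/3)**2) / 3 ) = 6
# becomes that word's sigma score.
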
sub _build_frag_list {
    my $self = shift;

    my @frag_list;
    F_WORD: for my $f_word (keys %{$self->phrs_hash}) {

        #find common phrase-fragments
        my (@hash_list, %sums_hash, %words_count);
            #*hash_list* contains ordered, formatted lists of each word in the phrase fragment;
            #*sums_hash* contains the total number of times each word appears in all phrases for the given *f_word*
        ORDER: for my $phrase (@{$self->phrs_hash->{$f_word}}) {
            my $sentence_ref = $phrase->[0];
            my %ordered_words = map { $sums_hash{$phrase->[-1]->[$_]}++; ($_ => $phrase->[-1]->[$_]) } (0..scalar @{$phrase->[-1]} - 1);
                # *ordered_words* maps each word of the given phrase fragment to its index, e.g.:
                #   0 => 'some'
                #   1 => 'word'
                #   2 => 'goes'
                #   3 => 'here'
            $words_count{$_}++ for values %ordered_words;
            push @hash_list => { f_word => $f_word, sentence => $sentence_ref, counts => \%words_count, ordered => \%ordered_words };
        }

        #removes each word from the *word_hash* unless it occurs more than once amongst all phrases
        SCRAP: for my $word_hash (@hash_list) {
            for my $index ( keys %{$word_hash->{'ordered'}} ) {
                delete $word_hash->{'ordered'}->{$index} unless $sums_hash{$word_hash->{'ordered'}->{$index}} > 1;
            }
        }

        #break phrase fragments into "scraps" (consecutive runs of words within the fragment)
        FRAG: for my $word_hash (@hash_list) {
            my (%L_scrap, %R_scrap);  #a "scrap" is a sub-fragment
            my ($prev, $curr, $next) = (-1,0,0);  #used to find consecutive sequences of words
            my $real = 0;  #flag for stopwords identification

            my @word_keys = sort { $a <=> $b } keys %{$word_hash->{'ordered'}};  # *word_keys* contains a sorted series of index-values

            for (my $i = 0; $i < scalar @word_keys; $i++ ) {
                $curr = $word_keys[$i];
                $next = $word_keys[$i+1] if $i < scalar @word_keys - 1;  #the guard prevents an out-of-bounds read

                if ( $next == $curr + 1 or $curr == $prev + 1 ) {
                    unless ($curr == $prev + 1) {  #resets *R_scrap* when the *curr* index skips over a number (i.e. a new scrap is encountered)
                        %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap;  #chooses the longest or most recent scrap
                        %R_scrap = ();  #resets the *R_scrap*
                    }
                    $R_scrap{$curr} = $word_hash->{'ordered'}->{$curr};
                    $real = 1 unless $self->stopwords->{$R_scrap{$curr}};  #ensures that scraps consisting only of stopwords are ignored
                } else {
                    %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap;  #chooses the longest or most recent scrap
                    %R_scrap = ();  #resets the *R_scrap*
                }
                $prev = $curr;
            }
            %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap;  #chooses the longest or most recent scrap
            %R_scrap = ();  #resets the *R_scrap*
            push @frag_list => { %{$word_hash}, scrap => \%L_scrap } if $real and scalar keys %L_scrap >= $self->phrase_thresh;
        }
    }

    $self->_set_frag_list( \@frag_list );

    return $self;
}

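# Scrap example (illustrative): if the surviving indices of a fragment are
# (2, 3, 4, 7), the consecutive run (2, 3, 4) is collected into a scrap while
# the isolated index 7 starts none; the run is kept as *L_scrap* provided it
# contains a non-stopword and spans at least *phrase_thresh* words.
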
sub develop_stopwords {
    my $self = shift;

    my %score_hash;  #*score_hash* contains score values for words found in the collected phrases

    $score_hash{$_}++ for keys %{$self->phrs_hash};

    JOIN: for my $fragment (@{$self->frag_list}) {
        #compile scraps for scoring
        my $scrap = join ' ' => map { $score_hash{$fragment->{'scrap'}->{$_}}++;
                                      $fragment->{'scrap'}->{$_} } sort { $a <=> $b } keys %{$fragment->{'scrap'}};

        for my $word (split ' ' => $scrap) {
            $score_hash{$word} += $self->freq_hash->{$word}      // 0;
            $score_hash{$word} += $self->sigma_hash->{$word}     // 0;
            $score_hash{$word} -= $fragment->{'counts'}->{$word} // 0;
        }
    }

    delete $score_hash{$_} for grep { $self->stopwords->{$_} } keys %score_hash;  #known stopwords are never candidates

    my @word_keys = sort { $score_hash{$b} <=> $score_hash{$a} or $a cmp $b } keys %score_hash;
    my $highest = $score_hash{$word_keys[0]};
    my $longest = max map { length } @word_keys;

    $score_hash{$_} = 40 * $score_hash{$_} / $highest for keys %score_hash;  #scales all scores onto a 0-40 range
    @word_keys = reverse grep { $score_hash{$_} >= 1 } @word_keys;

    #Tukey-style fences: quartiles of the scaled scores define an acceptance band
    my @scores = map { $score_hash{$_} } @word_keys;
    my @low  = @scores[ 0..(int scalar @scores / 2 - 1.5) ];
    my @high = @scores[ (int scalar @scores / 2 + 1)..(int scalar @scores - 1) ];
    my @LM = @low[ (int scalar @low / 2 - 0.5)..(int scalar @low / 2) ];
    my @UM = @high[ (int scalar @high / 2 - 0.5)..(int scalar @high / 2) ];
    my $Q1 = sum( @LM ) / scalar @LM;
    my $Q3 = sum( @UM ) / scalar @UM;
    my $IQR = $Q3 - $Q1;
    my $lower = $Q1;
    my $upper = $Q3 + 1.5 * $IQR;

    my @graph_data = grep { $_ > $lower and $_ < $upper } map { $score_hash{$_} } @word_keys;
    my $n = scalar @graph_data;

    if ($n > 4) {
        my $average = sum( @graph_data ) / $n;
        my @xdata = 1..$n;        #the rank of each retained score
        my @ydata = @graph_data;  #the retained scores themselves
        my $max_iter = 100;       #maximum iterations for the fit
        my @params_line = (
            # Name  Guess      Accuracy
            ['a',   0,         0.00001],
            ['b',   $average,  0.00001],
            ['c',   $highest,  0.00001],
        );
        Algorithm::CurveFit->curve_fit(
            formula => 'a + b * x + c * x^2',  #fits a quadratic to the rank/score curve
            params  => \@params_line,
            xdata   => \@xdata,
            ydata   => \@ydata,
            maximum_iterations => $max_iter,
        );
        my ($a, $b, $c) = ($params_line[0]->[1], $params_line[1]->[1], $params_line[2]->[1]);

        my %watchlist = %{ $self->watchlist // {} };
        KEY: for my $index ( reverse 1..scalar @word_keys ) {
            my $score = $a + $b * $index + $c * $index**2;
            $watchlist{$word_keys[$index - 1]}++ if $score >= $lower and $score < $score_hash{$word_keys[$index - 1]};
        }
        $self->_set_watchlist( \%watchlist );

        if ($self->print_scanner) {
            say "\n\n———————————————————————————————————————————\n\n";

            say "[file name] " . $self->file_name if $self->file_name;
            say "[text hint] " . $self->text_hint;

            say "\n---SCANNER GRAPHS---\n";

            say "KNOWN:";
            KNOWN: for my $index ( reverse 0..scalar @word_keys - 1 ) {
                my $format = "%" . $longest . "s|%s\n";
                my $score  = $score_hash{$word_keys[$index]};

                my $score_string = sprintf " %5.2f |" => $score;
                for (0..max($score, $upper)) {
                    if ($score > $lower and $score < $upper) {
                        $score_string .= '+' if $_ <= $score;
                    } else {
                        $score_string .= ']' if $_ == int $upper;
                        $score_string .= '-' if $_ <= int $score;
                        $score_string .= ' ' if $_ >  int $score;
                        $score_string .= '[' if $_ == int $lower;
                    }
                }

                printf $format => ($word_keys[$index], $score_string);
            }
            printf "\n[whiskers] lower = %.2f; upper = %.2f\n\n" => ($lower, $upper);

            say "CALCULATED:";
            CALC: for my $index ( reverse 1..scalar @word_keys ) {
                my $format = "%" . $longest . "s|%s\n";
                my $score  = $a + $b * $index + $c * $index**2;
                my $score_string = sprintf " %5.2f |%s" => $score, ($score >= $lower and $score < $score_hash{$word_keys[$index - 1]} ? '-' x $score : '+' x $score);
                printf $format => $word_keys[$index - 1], $score_string;
            }
        }
    }

    return $self;
}

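# Fence example (illustrative): for the seven ascending scaled scores
# (2, 4, 6, 10, 12, 20, 40), the lower-half median gives Q1 = 4 and the
# upper-half median gives Q3 = 20, so IQR = 16 and the acceptance band runs
# from lower = 4 to upper = 20 + 1.5 * 16 = 44; only scores strictly inside
# that band feed the curve fit.
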
sub analyze_phrases {
    my $self = shift;

    #find common phrase-fragments
    my (%inter_hash, %score_hash, %bare_phrase, %full_phrase);
        #*inter_hash* contains phrase fragments; *score_hash* contains score values for words in those phrases
    F_WORD: for my $f_word (keys %{$self->phrs_hash}) {

        #compile scraps for scoring
        JOIN: for my $fragment (@{$self->frag_list}) {
            my $scrap = join ' ' => map { $score_hash{$_}++;
                                          $fragment->{'scrap'}->{$_} } sort { $a <=> $b } keys %{$fragment->{'scrap'}};
            my @bare = map  { $fragment->{'scrap'}->{$_} }
                       grep { !$self->stopwords->{$fragment->{'scrap'}->{$_}} }
                       sort { $a <=> $b } keys %{$fragment->{'scrap'}};

            $score_hash{$f_word}++;  #scores each *f_word*
            $inter_hash{$scrap}++;   #counts the final *L_scrap*

            my $score = 1;
            for my $word (split ' ' => $scrap) {
                $score += $self->freq_hash->{$word}  // 0;
                $score += $self->sigma_hash->{$word} // 0;
                $score += $score_hash{$word}         // 0;
            }

            $full_phrase{ ${$fragment->{'sentence'}} } += $score;  #scores the full sentence from which the *L_scrap* was drawn
            $bare_phrase{ $scrap } = \@bare if scalar @bare;  #the final *L_scrap* without any stopwords
        }
    }

    #each phrase's score is multiplied by the compound score of each word within the phrase
    for my $scrap (keys %inter_hash) {
        for my $word (split ' ' => $scrap) {
            my $score = 1;
            $score += $self->freq_hash->{$word}  // 0;
            $score += $self->sigma_hash->{$word} // 0;
            $score += $score_hash{$word}         // 0;

            $inter_hash{$scrap} *= $score;
        }
    }

    #combine scraps — if scrap "a" contains scrap "b", add the value of "b" to "a" and delete "b"
    CLEAR: for my $scrap (sort { $inter_hash{$b} <=> $inter_hash{$a} or $a cmp $b } keys %inter_hash) {
        my $compare = qr/\b$scrap\b/;
        TEST: for my $test (keys %inter_hash) {
            if ($test ne $scrap) {
                if ($test =~ /$compare/) {  #true iff *scrap* ∈ *test*
                    $inter_hash{$test} += $inter_hash{$scrap};
                    delete $inter_hash{$scrap} and next CLEAR;
                } elsif (not scalar singleton (@{$bare_phrase{$test}}, @{$bare_phrase{$scrap}}) ) {  #true iff *bare_phrase{test}* and *bare_phrase{scrap}* contain the same words
                    next TEST unless scalar @{$bare_phrase{$test}} > 1;

                    my $joined = join '|' => @{$bare_phrase{$test}};
                    $inter_hash{"($joined)"} = $inter_hash{$test} + $inter_hash{$scrap};
                    $inter_hash{$test} += $inter_hash{$scrap};
                    delete $inter_hash{$scrap} and next CLEAR;
                }
            }
        }
    }

    $self->_set_score_hash( \%score_hash );
    $self->_set_inter_hash( \%inter_hash );
    $self->_set_phrs_list( \%full_phrase );

    #builds a summary for the given text, in the form of a hash of hash-refs:
    #   sentences => full sentences from the given article, scored on the scores of the words contained therein
    #   fragments => phrase fragments from the given article, scored as above
    #   words     => all scored words in the article, rated by a three-factor system of
    #                frequency of appearance, population standard deviation, and use in important phrase fragments

    my %sort_list;
    for (keys %{$self->freq_hash}) {
        $sort_list{$_} += $self->freq_hash->{$_}  // 0;
        $sort_list{$_} += $self->sigma_hash->{$_} // 0;
        $sort_list{$_} += $self->score_hash->{$_} // 0;
    }

    my %sentences = map { ($_ => $self->phrs_list->{$_}) } sort { $self->phrs_list->{$b} <=> $self->phrs_list->{$a} } keys %{$self->phrs_list};
    my %fragments = map { ($_ => $self->inter_hash->{$_}) } sort { $self->inter_hash->{$b} <=> $self->inter_hash->{$a} or $a cmp $b } keys %{$self->inter_hash};
    my %singleton = map { ($_ => $sort_list{$_}) } sort { $sort_list{$b} <=> $sort_list{$a} or $a cmp $b } keys %sort_list;

    my %summary = ( sentences => \%sentences, fragments => \%fragments, words => \%singleton );

    $self->_set_summary( \%summary );

    if ($self->print_summary) {
        say "\n\n———————————————————————————————————————————\n\n";

        say "[file name] " . $self->file_name if $self->file_name;
        say "[text hint] " . $self->text_hint;

        say "\n---SUMMARY CHARTS---\n";

        my ($sentences, $fragments, $words) = @{$self->summary}{'sentences','fragments','words'};

        say "SUMMARY:";
        my @sentence_keys = sort { $sentences->{$b} <=> $sentences->{$a} or $a cmp $b } keys %$sentences;
        for my $sen ( @sentence_keys[0..min($self->return_count, scalar @sentence_keys - 1)] ) {
            printf "%4d => %s\n" => $sentences->{$sen}, $sen;
        }
        say "\n";

        say "PHRASES:";
        my @phrase_keys = sort { $fragments->{$b} <=> $fragments->{$a} or $a cmp $b } keys %$fragments;
        for my $phrase ( @phrase_keys[0..min($self->return_count, scalar @phrase_keys - 1)] ) {
            printf "%8d => %s\n" => $fragments->{$phrase}, $phrase;
        }
        say "\n";

        say " WORDS:";
        my @word_keys = sort { $words->{$b} <=> $words->{$a} or $a cmp $b } keys %$words;
        my $highest = $words->{$word_keys[0]};
        my $longest = max map { length } @word_keys;
        KEY: for my $word ( @word_keys[0..min($self->return_count, scalar @word_keys - 1)] ) {
            my $format = "%" . $longest . "s|%s\n";
            my $score  = int(40 * $words->{$word} / $highest);
            printf $format => ( $word, "-" x $score ) if $score > 2;
        }
        say "\n";
    }

    return $self;
}

1;

__END__
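=pod

=head1 NAME

Text::Summarizer - extract scored sentences, phrase fragments, and words from a body of text

=head1 SYNOPSIS

A minimal usage sketch; the file path is illustrative, and the method names
and summary structure are taken from the code above:

    use Text::Summarizer;

    my $summarizer = Text::Summarizer->new;
    my $summary    = $summarizer->summarize_file('articles/sample.txt');

    # $summary holds three hashrefs, each mapping an item to its score:
    #   $summary->{sentences}, $summary->{fragments}, $summary->{words}

=cut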