File Coverage

lib/Text/Summarizer.pm
Criterion Covered Total %
statement 351 412 85.1
branch 64 106 60.3
condition 38 58 65.5
subroutine 30 31 96.7
pod 0 12 0.0
total 483 619 78.0


line stmt bran cond sub pod time code
1             package Text::Summarizer;
2              
3 1     1   2611 use v5.10.0;
  1         6  
4 1     1   9 use strict;
  1         3  
  1         31  
5 1     1   11 use warnings;
  1         2  
  1         35  
6 1     1   807 use Moo;
  1         18939  
  1         8  
7 1     1   3009 use Types::Standard qw/ Bool Ref Str Int Num InstanceOf /;
  1         79895  
  1         11  
8 1     1   2088 use List::AllUtils qw/ max min sum sum0 singleton /;
  1         14053  
  1         105  
9 1     1   510 use Algorithm::CurveFit;
  1         141828  
  1         58  
10 1     1   784 use utf8;
  1         14  
  1         6  
11              
12 1     1   980 binmode STDOUT, ':encoding(UTF-8)';
  1         15  
  1         6  
13              
14 1     1   51 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
  1         8  
  1         5445  
15             require Exporter;
16              
17             @ISA = qw(Exporter);
18             @EXPORT = qw();
19             @EXPORT_OK = qw();
20             %EXPORT_TAGS = (all => [@EXPORT_OK]);
21             $VERSION = '1.053';
22              
23              
24             has permanent_path => (
25             is => 'rw',
26             isa => Str,
27             default => 'data/permanent.stop',
28             );
29              
30             has stopwords_path => (
31             is => 'rw',
32             isa => Str,
33             default => 'data/stopwords.stop',
34             );
35              
36             has articles_path => (
37             is => 'rw',
38             isa => Str,
39             default => 'articles/*'
40             );
41              
42             has store_scanner => (
43             is => 'rw',
44             isa => Bool,
45             default => 0,
46             );
47              
48             has print_scanner => (
49             is => 'rw',
50             isa => Bool,
51             default => 0,
52             );
53              
54             has print_summary => (
55             is => 'rw',
56             isa => Bool,
57             default => 0,
58             );
59              
60             has return_count => (
61             is => 'rw',
62             isa => Num,
63             default => 20,
64             );
65              
66             has phrase_thresh => (
67             is => 'rw',
68             isa => Int,
69             default => 2,
70             );
71              
72             has phrase_radius => (
73             is => 'rw',
74             isa => Int,
75             default => 5,
76             );
77              
78             has freq_constant => (
79             is => 'rw',
80             isa => Num,
81             default => 0.004,
82             );
83              
84             has watch_count => (
85             is => 'rwp',
86             isa => Int,
87             default => 0,
88             );
89              
90             has stopwords => (
91             is => 'lazy',
92             isa => Ref['HASH'],
93             );
94              
95             has watchlist => (
96             is => 'rwp',
97             isa => Ref['HASH'],
98             lazy => 1,
99             );
100              
101             has article_length => (
102             is => 'rwp',
103             isa => Int,
104             default => 0,
105             lazy => 1,
106             );
107              
108             has full_text => (
109             is => 'rwp',
110             isa => Str,
111             );
112              
113             has sentences => (
114             is => 'rwp',
115             isa => Ref['ARRAY'],
116             );
117              
118             has sen_words => (
119             is => 'rwp',
120             isa => Ref['ARRAY'],
121             );
122              
123             has word_list => (
124             is => 'rwp',
125             isa => Ref['ARRAY'],
126             );
127              
128             has freq_hash => (
129             is => 'rwp',
130             isa => Ref['HASH'],
131             );
132              
133             has clst_hash => (
134             is => 'rwp',
135             isa => Ref['HASH'],
136             );
137              
138             has phrs_hash => (
139             is => 'rwp',
140             isa => Ref['HASH'],
141             );
142              
143             has sigma_hash => (
144             is => 'rwp',
145             isa => Ref['HASH'],
146             );
147              
148             has inter_hash => (
149             is => 'rwp',
150             isa => Ref['HASH'],
151             );
152              
153             has score_hash => (
154             is => 'rwp',
155             isa => Ref['HASH'],
156             );
157              
158             has phrs_list => (
159             is => 'rwp',
160             isa => Ref['HASH'],
161             );
162              
163             has frag_list => (
164             is => 'rwp',
165             isa => Ref['ARRAY'],
166             );
167              
168             has summary => (
169             is => 'rwp',
170             isa => Ref['HASH'],
171             );
172              
173             has file_name => (
174             is => 'rwp',
175             isa => Str,
176             );
177              
178             has text_hint => (
179             is => 'rwp',
180             isa => Str,
181             );
182              
183             sub _build_watchlist {
184 0     0   0 my $self = shift;
185 0         0 my %watchlist;
186              
187 0 0       0 open( my $stopwords_file, '<', $self->stopwords_path )
188             or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
189 0   0     0 chomp and $watchlist{ $_ } = 1 for (<$stopwords_file>);
190 0         0 close $stopwords_file;
191              
192 0         0 return \%watchlist;
193             }
194              
195             sub _build_stopwords {
196 75     75   173 my $self = shift;
197 75         114 my %stopwords;
198              
199 75 50       2027 open( my $permanent_file, '<', $self->permanent_path )
200             or die "Can't open stopwords permanent file " . $self->permanent_path . ": $!";
201 75   50     31206 chomp and $stopwords{ $_ } = 1 for (<$permanent_file>);
202 75         1688 close $permanent_file;
203              
204 75 50       2070 open( my $stopwords_file, '<', $self->stopwords_path )
205             or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
206 75   0     4398 chomp and $stopwords{ $_ } = 1 for (<$stopwords_file>);
207 75         615 close $stopwords_file;
208              
209 75         453 return \%stopwords;
210             }
211              
212             sub _store_stopwords {
213 37     37   89 my $self = shift;
214              
215 37 50       763 open( my $stopwords_file, ">", $self->stopwords_path)
216             or die "Can't open stopwords scanner file " . $self->stopwords_path . ": $!";
217 37 50       4167 grep { print $stopwords_file "$_\n" } sort keys %{$self->watchlist} if $self->store_scanner;
  0         0  
  0         0  
218 37         634 close $stopwords_file;
219              
220 37         221 return $self;
221             }
222              
223              
224              
225             sub scan_text {
226 37     37 0 143 my ($self, $text, $path) = @_;
227              
228 37         811 $self->_set_file_name( '' );
229 37         1684 $self->_set_text_hint( '' );
230              
231 37 100       1139 if ( ref $text ) {
232 36         525 $self->_set_file_name( $path );
233              
234 36         2377 $text = join "\n" => map { $_ } <$text>;
  1041         5076  
235             }
236              
237 37         2380 $self->_set_text_hint( '"' . substr($text,0,50) . '...' . substr($text,-30) . '"' );
238 37         949 $self->tokenize( $text ); #breaks the provided file into sentences and individual words
239              
240 37         201 $self->_build_stopwords;
241 37         661 $self->_build_freq_hash;
242 37         186 $self->_build_clst_hash;
243 37         146 $self->_build_phrs_hash;
244 37         179 $self->_build_sigma_hash;
245 37         175 $self->_build_frag_list;
246              
247 37         179 $self->develop_stopwords; #analyzes the frequency and clustering of words within the provided file
248 37         195 $self->_store_stopwords;
249              
250 37         812 return $self->watchlist;
251             }
252              
253             sub scan_file {
254 36     36 0 165 my ($self, $file_path) = @_;
255              
256 36 50       1277 open( my $file, '<:encoding(UTF-8)', $file_path )
257             or die "Can't open file $file_path for scanning: $!";
258              
259 36         3200 return $self->scan_text( $file, $file_path );
260             }
261              
262             sub scan_each {
263 1     1 0 60213 my ($self, $dir_path) = @_;
264 1   33     38 return map { $self->scan_file( $_ ) } glob( $dir_path // $self->articles_path );
  35         592  
265             }
266              
267              
268              
269             sub summarize_text {
270 37     37 0 127 my ($self, $text, $path) = @_;
271              
272 37         915 $self->_set_file_name( '' );
273              
274 37 100       1327 if ( ref $text ) {
275 36         558 $self->_set_file_name( $path );
276            
277 36         3437 $text = join "\n" => map { $_ } <$text>;
  1041         6162  
278             }
279              
280 37         2637 $self->_set_text_hint( '"' . substr($text,0,50) . '...' . substr($text,-30) . '"' );
281 37         1340 $self->tokenize($text); #breaks the provided file into sentences and individual words
282              
283 37         158 $self->_build_stopwords;
284 37         665 $self->_build_freq_hash;
285 37         180 $self->_build_clst_hash;
286 37         153 $self->_build_phrs_hash;
287 37         161 $self->_build_sigma_hash;
288 37         124 $self->_build_frag_list;
289              
290 37         178 $self->analyze_phrases; #analyzes the frequency and clustering of words within the provided file
291              
292 37         1231 return $self->summary;
293             }
294              
295             #summarizing is used to extract common phrase fragments from a given text file.
296             sub summarize_file {
297 36     36 0 123 my ($self, $file_path) = @_;
298              
299 36 50       1768 open( my $file, '<:encoding(UTF-8)', $file_path )
300             or die "Can't open file $file_path for summarizing: $!";
301              
302 36         3585 return $self->summarize_text( $file, $file_path );
303             }
304              
305             sub summarize_each {
306 1     1 0 4 my ($self, $dir_path) = @_;
307 1   33     32 return map { $self->summarize_file( $_ ) } glob( $dir_path // $self->articles_path );
  35         636  
308             }
309              
310              
311              
312 1     1 0 11 sub summ_text { return shift->summarize_text(@_); }
313 1     1 0 10 sub summ_file { return shift->summarize_file(@_); }
314 1     1 0 21293 sub summ_each { return shift->summarize_each(@_); }
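
The summarize_*/scan_* methods and the summ_* aliases above form the module's public entry points. The sketch below shows one way they might be used; the article path is hypothetical, and the default data/permanent.stop and data/stopwords.stop files are assumed to exist.

    use Text::Summarizer;

    my $summarizer = Text::Summarizer->new(
        articles_path => 'articles/*',   # the default glob shown above
        return_count  => 10,
    );

    # summarize a single file; the result has sentences/fragments/words keys
    my $summary = $summarizer->summarize_file('articles/example.txt');
    my ($best)  = sort { $summary->{sentences}{$b} <=> $summary->{sentences}{$a} }
                  keys %{ $summary->{sentences} };
    print "$best\n";

    # or grow the scanner's stopword list from every matching article
    my @watchlists = $summarizer->scan_each('articles/*');
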
315              
316              
317              
318             sub tokenize {
319 74     74 0 214 my ( $self, $text ) = @_;
320              
321 74         126 my $full_text = $text;
322             #contains the full body of text
323 74         268920 my @sentences = split qr/(?| (?<=(?<!\b[A-Z])[.!?])\s+(?=[A-Z])
324             | (?: \n+ | ^\s+ | \s+$ )
325             )/mx => $full_text;
326             # array of sentences
327              
328 74         374 my @word_list; # array literal of all the words in the entire text body
329             my @sen_words; # array reference to all of the tokens in each sentence
330 74         213 for (@sentences) { #creates an array of each word in the current article
331 1     1   11 my @words = map { /\b (?: \w \. (?: ['’-] \w+ )?)+ | (?: \w+ ['’-]? )+ (?=\s|\b)/gx } lc $_; #tokenizes each sentence into complete words (single-quotes are considered part of the word they attach to)
  1         1  
  1         16  
  4880         17478  
  4880         166281  
332 4880         31191 push @word_list => @words;
333 4880         8661 push @sen_words => \@words;
334             }
335              
336 74         2014 $self->_set_article_length( scalar @word_list );
337             #counts the total number of words in the article
338              
339 74         3983 $self->_set_full_text( $full_text );
340 74         3588 $self->_set_sentences( \@sentences );
341 74         3897 $self->_set_word_list( \@word_list );
342 74         17673 $self->_set_sen_words( \@sen_words );
343              
344 74         15278 return $self;
345             }
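
The word-level pattern in tokenize keeps dotted abbreviations and apostrophes attached to the word they belong to. A stand-alone illustration with a made-up input sentence:

    use utf8;

    my @tokens = map { /\b (?: \w \. (?: ['’-] \w+ )?)+ | (?: \w+ ['’-]? )+ (?=\s|\b)/gx }
                 lc "Smith's U.S. visit wasn't planned.";
    # @tokens => ("smith's", "u.s.", "visit", "wasn't", "planned")
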
346              
347              
348              
349             sub _build_freq_hash {
350 74     74   150 my $self = shift;
351              
352 74   50     1737 my $min_freq_thresh = int($self->article_length * $self->freq_constant) || 1; #estimates a minimum threshold of occurrence for frequently occurring words (at least 1)
353 74         2400 my %freq_hash; #counts the number of times each word appears in the *%word_list* hash
354 74         122 for my $word (@{$self->word_list}) {
  74         351  
355 102810 100       1705259 $freq_hash{$word}++ unless $self->stopwords->{$word};
356             }
357 74 100       12790 grep { delete $freq_hash{$_} if $freq_hash{$_} < $min_freq_thresh } keys %freq_hash;
  34904         67853  
358             #remove words that appear less than the *$min_freq_thresh*
359              
360 74         3806 $self->_set_freq_hash( \%freq_hash );
361              
362 74         3222 return $self;
363             }
364              
365             sub _build_clst_hash {
366 74     74   198 my $self = shift;
367              
368 74         151 my (%cluster_hash, %cluster_count);
369 74         155 my $abs_pos = 0;
370 74         134 for my $sen_index (0..scalar @{$self->sentences} - 1) { #gives the index of each sentence in the article
  74         482  
371 4880         4291 my @sen_words = @{$self->sen_words->[$sen_index]}; # creates an array of each word in the given sentence
  4880         27649  
372            
373 4880         6318 for my $position (0..scalar @sen_words - 1) { #iterates across each word in the sentence
374 102810         78484 $abs_pos++;
375              
376 102810 100       145682 if ( exists $self->freq_hash->{$sen_words[$position]}) { ## true if the given word at index *position* appears in the *freq_hash*
377 9136         19403 my %word = ( abs => $abs_pos, sen => $sen_index, rel => $position, cnt => $cluster_count{$sen_words[$position]}++ );
378             # hash-vector of the following elements:
379             # abs => absolute position of the current word within the entire token-stream
380             # sen => the index of the current sentence
381             # rel => position of the current word within the current sentence
382             # cnt => number of times the given word has appeared in the entire text file
383 9136         7774 push @{$cluster_hash{$sen_words[$position]}} => \%word;
  9136         17948  
384             }
385             }
386             }
387              
388 74         1929 $self->_set_clst_hash( \%cluster_hash );
389              
390 74         11393 return $self;
391             }
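
For reference, the per-occurrence hash-vectors described in the comments above give *clst_hash* roughly this shape (the word and numbers here are hypothetical):

    my $clst_hash = {
        'fragment' => [                                       # one entry per occurrence of a frequent word
            { abs => 112, sen => 4,  rel => 7, cnt => 0 },    # first occurrence
            { abs => 389, sen => 15, rel => 2, cnt => 1 },    # second occurrence
        ],
    };
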
392              
393             sub _build_phrs_hash {
394 74     74   150 my $self = shift;
395              
396             #create long-form phrases around frequently used words by tracking forward and backward *phrase_radius* from any given *c_word*
397 74         142 my %phrase_hash;
398 74         141 for my $c_word (keys %{$self->clst_hash}) {
  74         985  
399 2522         2479 for my $c_vector (@{$self->clst_hash->{$c_word}}) {
  2522         4914  
400              
401 9136         15823 my ($sen, $pos, $cnt) = @$c_vector{'sen', 'rel', 'cnt'};
402             # *sen* indicates which sentence the current *c_word* appears in
403             # *pos* indicates the position of the *c_word* within the sentence (see above)
404             # *cnt* counts the total number of times the word has been detected thus far
405              
406 9136         116341 my @phrase = [ @{$self->sen_words->[$sen]}[ max($pos - $self->phrase_radius, 0) .. min($pos + $self->phrase_radius, scalar(@{$self->sen_words->[$sen]}) - 1) ] ];
  9136         35752  
  9136         68709  
407             #array slice containing only tokens within *phrase_radius* of the *c_word* within the given sentence
408              
409 9136         18314 unshift @phrase => \$self->sentences->[$sen]; #begins the *phrase* array with a complete, unedited sentence (for reference only)
410 9136 100       7536 push @{$phrase_hash{$c_word}} => \@phrase if scalar @{$phrase[-1]} > $self->phrase_thresh + 1;
  9046         52565  
  9136         118365  
411             #the *phrase_hash* can only contain a given *phrase* array if it is longer than the defined *phrase_thresh* + 1 (defaults to 3)
412             }
413             }
414              
415 74         1562 $self->_set_phrs_hash( \%phrase_hash );
416              
417 74         22842 return $self;
418             }
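
The max/min bounds in the slice above simply clamp the *phrase_radius* window to the ends of the sentence. A self-contained sketch with assumed values:

    use List::AllUtils qw/ max min /;

    my @sen    = qw/ the quick brown fox jumps over the lazy dog /;
    my $pos    = 4;       # index of the frequent word ("jumps")
    my $radius = 2;       # the module itself defaults phrase_radius to 5
    my @phrase = @sen[ max($pos - $radius, 0) .. min($pos + $radius, $#sen) ];
    # @phrase => ("brown", "fox", "jumps", "over", "the")
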
419              
420             sub _build_sigma_hash {
421 74     74   155 my $self = shift;
422              
423             #determine population standard deviation for word clustering
424 74         115 my %sigma_hash;
425 74         109 for my $c_word (keys %{$self->clst_hash}) {
  74         698  
426 2522         2639 for my $c_vector (@{$self->clst_hash->{$c_word}}) {
  2522         4379  
427              
428             #create a list of the distances between each instance of the current *c_word*
429 9136         8326 my %dist_list;
430 9136         7732 my ($L_pos, $R_pos);
431 9136         8655 for (my $i = 0; $i < scalar @{$self->clst_hash->{$c_word}}; $i++) {
  101936         148646  
432 92800         114616 $R_pos = $self->clst_hash->{$c_word}->[$i]->{abs};
433              
434 92800   66     110093 my $dist = $R_pos - ($L_pos // $R_pos);
435 92800 50       97356 push @{$dist_list{$c_word}} => $dist if $dist >= 0;
  92800         102862  
436              
437 92800         88358 $L_pos = $R_pos;
438             }
439              
440              
441             #the following is used for scoring purposes, and is used only to determine the *sigma* score (population standard deviation) of the given *c_word*
442 9136 50       7994 my $pop_size = scalar @{$dist_list{$c_word}} || 1;
  9136         12677  
443 9136         7749 my $pop_ave = sum0( @{$dist_list{$c_word}} ) / $pop_size;
  9136         16672  
444 9136         8351 $sigma_hash{$c_word} = int sqrt( sum( map { ($_ - $pop_ave)**2 } @{$dist_list{$c_word}} ) / $pop_size ); #pop. std. deviation
  92800         119460  
  9136         10578  
445             }
446             }
447 74         2357 $self->_set_sigma_hash( \%sigma_hash );
448              
449 74         2905 return $self;
450             }
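
_build_sigma_hash scores each frequent word by the population standard deviation of the gaps between its occurrences. The same calculation in isolation, with made-up gap values:

    use List::AllUtils qw/ sum0 /;

    my @gaps     = (3, 7, 2, 8);                 # distances between successive occurrences
    my $mean     = sum0(@gaps) / scalar @gaps;
    my $variance = sum0( map { ($_ - $mean)**2 } @gaps ) / scalar @gaps;
    my $sigma    = int sqrt $variance;           # truncated to an integer, as above
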
451              
452             sub _build_frag_list {
453 74     74   110 my $self = shift;
454              
455 74         107 my @frag_list;
456 74         105 F_WORD: for my $f_word (keys %{$self->phrs_hash}) {
  74         955  
457             #find common phrase-fragments
458 2516         3098 my %full_phrase; #unused in this sub; full-phrase scoring happens in analyze_phrases
459 2516         2472 my (@hash_list, %sums_hash, %words_count); #*hash_list* contains ordered, formatted lists of each word in the phrase fragment; *sums_hash* contains the total number of times each word appears in all phrases for the given *f_word*
460 2516         2057 ORDER: for my $phrase (@{$self->phrs_hash->{$f_word}}) {
  2516         6257  
461 9046         11059 my $sentence_ref = $phrase->[0];
462 9046         7996 my %ordered_words = map { $sums_hash{$phrase->[-1]->[$_]}++; ($_ => $phrase->[-1]->[$_]) } (0..scalar @{$phrase->[-1]} - 1);
  86336         112804  
  86336         126124  
  9046         13360  
463             # *ordered_words* maps each zero-based index to the word at that position in the phrase fragment, e.g.:
464             # 0 => 'some'
465             # 1 => 'word'
466             # 2 => 'goes'
467             # 3 => 'here'
468 9046         62391 $words_count{$_}++ for values %ordered_words;
469 9046         25165 push @hash_list => { f_word => $f_word, sentence => $sentence_ref, counts => \%words_count, ordered => \%ordered_words };
470             }
471              
472              
473             #removes each word from the *word_hash* unless it occurs more than once amongst all phrases
474 2516         3004 SCRAP: for my $word_hash (@hash_list) {
475 9046         7745 for my $index ( keys %{$word_hash->{'ordered'}} ) {
  9046         17573  
476 86336 100       132869 delete $word_hash->{'ordered'}->{$index} unless $sums_hash{$word_hash->{'ordered'}->{$index}} > 1
477             }
478             }
479              
480              
481             #break phrases fragments into "scraps" (consecutive runs of words within the fragment)
482 2516         2542 FRAG: for my $word_hash (@hash_list) {
483 9046         10346 my (%L_scrap, %R_scrap); #a "scrap" is a sub-fragment
484 9046         10762 my ($prev, $curr, $next) = (-1,0,0); #used to find consecutive sequences of words
485 9046         7831 my $real = 0; #flag for stopwords identification
486              
487 9046         7830 my @word_keys = sort { $a <=> $b } keys %{$word_hash->{'ordered'}}; # *word_keys* contains a series of index-values
  52004         55050  
  9046         22210  
488 9046         15749 for (my $i = 0; $i < scalar @word_keys; $i++ ) {
489 35944         34410 $curr = $word_keys[$i];
490 35944 100       48798 $next = $word_keys[$i+1] if $i < scalar @word_keys - 1; # if-statement prevents out-of-bounds error
491              
492 35944 100 100     66389 if ( $next == $curr + 1 or $curr == $prev + 1 ) {
493 27474 100       32430 unless ($curr == $prev + 1) { #resets *R_scrap* when the *curr* index skips over a number (i.e. a new scrap is encountered)
494 6550 100       13730 %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap; #chooses the longest or most recent scrap
495 6550         7168 %R_scrap = (); #resets the *R_scrap*
496             }
497 27474         35679 $R_scrap{$curr} = $word_hash->{'ordered'}->{$curr};
498 27474 100       360719 $real = 1 unless $self->stopwords->{$R_scrap{$curr}}; #ensures that scraps consisting only of stopwords are ignored
499             } else {
500 8470 100       18215 %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap; #chooses the longest or most recent scrap
501 8470         9367 %R_scrap = (); #resets the *R_scrap*
502             }
503 35944         192084 $prev = $curr;
504             }
505 9046 100       21121 %L_scrap = %R_scrap if keys %L_scrap <= keys %R_scrap; #chooses the longest or most recent scrap
506 9046         10996 %R_scrap = (); #resets the *R_scrap*
507 9046 100 100     99204 push @frag_list => { %{$word_hash}, scrap => \%L_scrap } if $real and scalar keys %L_scrap >= $self->phrase_thresh;
  6300         65371  
508             }
509             }
510            
511 74         1876 $self->_set_frag_list( \@frag_list );
512              
513 74         37268 return $self;
514             }
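
The FRAG loop above reduces each fragment to its longest (or, on a tie, most recent) run of consecutive surviving indices, its "scrap". The core idea, stripped of the word bookkeeping and with hypothetical indices:

    my @idx = (2, 3, 4, 8, 9);      # indices that survived the SCRAP pass
    my (@best, @run);
    for my $i (@idx) {
        @run  = () if @run and $i != $run[-1] + 1;   # a gap starts a new run
        push @run, $i;
        @best = @run if @run >= @best;               # ties favour the most recent run
    }
    # @best => (2, 3, 4)
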
515              
516              
517              
518             sub develop_stopwords {
519 37     37 0 86 my $self = shift;
520              
521 37         61 my %score_hash; #*score_hash* contains score values for words in those phrases
522              
523 37         61 $score_hash{$_}++ for keys %{$self->phrs_hash};
  37         1022  
524              
525 37         134 JOIN: for my $fragment (@{$self->frag_list}) {
  37         159  
526             #compile scraps for scoring
527              
528 10977         13467 my $scrap = join ' ' => map { $score_hash{$fragment->{'scrap'}->{$_}}++;
529 10977         13726 $fragment->{'scrap'}->{$_} } sort { $a <=> $b
  13344         12710  
530 3150         2868 } keys %{$fragment->{'scrap'}};
  3150         7639  
531              
532 3150         6562 for my $word (split ' ' => $scrap) {
533 10977   100     20979 $score_hash{$word} += $self->freq_hash->{$word} // 0;
534 10977   100     19600 $score_hash{$word} += $self->sigma_hash->{$word} // 0;
535 10977   50     19610 $score_hash{$word} -= $fragment->{'counts'}->{$word} // 0;
536             }
537             }
538              
539 37 100       519 grep { delete $score_hash{$_} if $self->stopwords->{$_} } keys %score_hash;
  2531         43401  
540              
541              
542              
543              
544 37 50       834 my @word_keys = sort { $score_hash{$b} <=> $score_hash{$a} or $a cmp $b } keys %score_hash;
  8302         11532  
545 37         189 my $highest = $score_hash{$word_keys[0]};
546 37         90 my $longest = max map { length } @word_keys;
  1724         2051  
547              
548 37         1492 $score_hash{$_} = 40 * $score_hash{$_} / $highest for keys %score_hash;
549 37         165 @word_keys = reverse grep { $score_hash{$_} >= 1 } @word_keys;
  1724         2403  
550              
551 37         292 my $score_ave = sum( values %score_hash ) / keys %score_hash;
552              
553 37         83 my @scores = map { $score_hash{$_} } @word_keys;
  640         830  
554 37         273 my @low = @scores[ 0..(int scalar @scores / 2 - 1.5) ];
555 37         231 my @high = @scores[ (int scalar @scores / 2 + 1)..(int scalar @scores - 1) ];
556 37         174 my @LM = @low[ (int scalar @low / 2 - 0.5)..(int scalar @low / 2) ];
557 37         156 my @UM = @high[ (int scalar @high / 2 - 0.5)..(int scalar @high / 2) ];
558 37         142 my $Q1 = sum( @LM ) / scalar @LM;
559 37         106 my $Q3 = sum( @UM ) / scalar @UM;
560 37         72 my $IQR = $Q3 - $Q1;
561 37         78 my $lower = $Q1;
562 37         71 my $upper = $Q3 + 1.5 * $IQR;
563              
564              
565 37 100       91 my @graph_data = grep { $_ > $lower and $_ < $upper } map { $score_hash{$_} } @word_keys;
  640         1345  
  640         763  
566 37         81 my $n = scalar @graph_data;
567              
568 37 100       131 if ($n > 4) {
569 33         104 my $average = sum( @graph_data ) / $n;
570 33         112 my @xdata = 1..$n; # The data corresponding to the curve's x variable
571 33         106 my @ydata = @graph_data; # The data on the other axis
572 33         67 my $max_iter = 100; # maximum iterations
573 33         189 my @params_line = (
574             # Name Guess Accuracy
575             ['a', 0, 0.00001],
576             ['b', $average, 0.00001],
577             ['c', $highest, 0.00001],
578             );
579 33         361 Algorithm::CurveFit->curve_fit(
580             formula => 'a + b * x + c * x^2',
581             params => \@params_line,
582             xdata => \@xdata,
583             ydata => \@ydata,
584             maximum_iterations => $max_iter,
585             );
586 33         2460238 my ($a, $b, $c) = ($params_line[0]->[1],$params_line[1]->[1],$params_line[2]->[1]);
587              
588              
589 33   100     96 my %watchlist = %{$self->watchlist // {} };
  33         522  
590 33         157 KEY: for my $index ( reverse 1..scalar @word_keys ) {
591 623         820 my $score = $a + $b * $index + $c * $index**2;
592 623 100 100     2125 $watchlist{$word_keys[$index - 1]}++ if $score >= $lower and $score < $score_hash{$word_keys[$index - 1]};
593             }
594 33         787 $self->_set_watchlist( \%watchlist );
595              
596              
597 33 50       1560 if ($self->print_scanner) {
598 0         0 say "\n\n———————————————————————————————————————————\n\n";
599              
600              
601 0 0       0 say "[file name] " . $self->file_name if $self->file_name;
602 0         0 say "[text hint] " . $self->text_hint;
603              
604 0         0 say "\n---SCANNER GRAPHS---\n";
605              
606 0         0 say "KNOWN:";
607 0         0 KEY: for my $index ( reverse 0..scalar @word_keys - 1 ) {
608 0         0 my $format = "%" . $longest . "s|%s\n";
609 0         0 my $score = $score_hash{$word_keys[$index]};
610              
611 0         0 my $score_string = sprintf " %5.2f |" => $score;
612 0         0 for (0..max($score, $upper)) {
613 0 0 0     0 if ($score > $lower and $score < $upper) {
614 0 0       0 $score_string .= '+' if $_ <= $score;
615             } else {
616 0 0       0 $score_string .= ']' if $_ == int $upper;
617 0 0       0 $score_string .= '-' if $_ <= int $score;
618 0 0       0 $score_string .= ' ' if $_ > int $score;
619 0 0       0 $score_string .= '[' if $_ == int $lower;
620             }
621             }
622              
623 0         0 printf $format => ($word_keys[$index], $score_string);
624             }
625 0         0 printf "\n[whiskers] lower = %.2f; upper = %.2f\n\n" => ($lower, $upper);
626              
627 0         0 say "CALCULATED:";
628 0         0 KEY: for my $index ( reverse 1..scalar @word_keys ) {
629 0         0 my $format = "%" . $longest . "s|%s\n";
630 0         0 my $score = $a + $b * $index + $c * $index**2;
631 0   0     0 my $score_string = sprintf " %5.2f |%s" => $score, ($score >= $lower and $score < $score_hash{$word_keys[$index - 1]} ? '-' x $score : '+' x $score);
632 0         0 printf $format => $word_keys[$index - 1], $score_string;
633             }
634             }
635             }
636              
637              
638 37         1184 return $self;
639             }
640              
641              
642              
643             sub analyze_phrases {
644 37     37 0 73 my $self = shift;
645              
646             #find common phrase-fragments
647 37         78 my (%inter_hash, %score_hash, %bare_phrase, %full_phrase); #*inter_hash* contains phrase fragments; *score_hash* contains score values for words in those phrases
648 37         49 F_WORD: for my $f_word (keys %{$self->phrs_hash}) {
  37         427  
649              
650             #compile scraps for scoring
651 1258         1339 JOIN: for my $fragment (@{$self->frag_list}) {
  1258         2777  
652 317645         275855 my $scrap = join ' ' => map { $score_hash{$_}++;
653 94226         91334 $fragment->{'scrap'}->{$_} } sort { $a <=> $b } keys %{$fragment->{'scrap'}};
  317645         480807  
  371733         401366  
  94226         256131  
654 94226         132407 my @bare = map { $fragment->{'scrap'}->{$_} } grep { !$self->stopwords->{$fragment->{'scrap'}->{$_}} } sort { $a <=> $b } keys %{$fragment->{'scrap'}};
  177564         735503  
  317645         5187344  
  371733         380332  
  94226         170487  
655              
656 94226         160792 $score_hash{$f_word}++; #scores each *f_word*
657 94226         124428 $inter_hash{$scrap}++; #contains the final *L_scrap*
658              
659              
660 94226         85969 my $score = 1;
661 94226         185178 for my $word (split ' ' => $scrap) {
662 317645   100     621865 $score += $self->freq_hash->{$word} // 0;
663 317645   100     537695 $score += $self->sigma_hash->{$word} // 0;
664 317645   100     573372 $score += $score_hash{$word} // 0;
665             }
666              
667 94226         109710 $full_phrase{ ${$fragment->{'sentence'}} } += $score; #contains the full phrase from which the *L_scrap* was drawn
  94226         185846  
668 94226 100       249076 $bare_phrase{ $scrap } = \@bare if scalar @bare; #contains the final *L_scrap* without any stopwords
669             }
670             }
671              
672              
673             #each phrases' score is multiplied by the sum of the compound score of each word within the phrase
674 37         1202 for my $scrap (keys %inter_hash) {
675 2324         4121 for my $word (split ' ' => $scrap) {
676 8350         6821 my $score = 1;
677 8350   100     14584 $score += $self->freq_hash->{$word} // 0;
678 8350   100     13962 $score += $self->sigma_hash->{$word} // 0;
679 8350   100     13098 $score += $score_hash{$word} // 0;
680              
681 8350         10511 $inter_hash{$scrap} *= $score;
682             }
683             }
684              
685              
686             #combine scraps — if scrap "a" contains scrap "b", add the value of "b" to "a" and delete "b"
687 37 50       677 CLEAR: for my $scrap (sort { $inter_hash{$b} <=> $inter_hash{$a} or $a cmp $b } keys %inter_hash) {
  11571         17319  
688 2324         19709 my $compare = qr/\b$scrap\b/;
689 2324         2962 my $delete = 0;
690 2324         25093 TEST: for my $test (keys %inter_hash) {
691 120849 100       147692 if ($test ne $scrap) {
692 119009 100       203814 if ($test =~ /$compare/) { #true iff *scrap* ∈ *test*
    100          
693 611         1262 $inter_hash{$test} += $inter_hash{$scrap};
694 611 50       3674 delete $inter_hash{$scrap} and next CLEAR;
695 118398         160460 } elsif (not scalar singleton (@{$bare_phrase{$test}}, @{$bare_phrase{$scrap}}) ) { #true iff *bare_phrase{test}* == *bare_phrase{scrap}*
  118398         469252  
696 3116 100       2662 next TEST unless scalar @{$bare_phrase{$test}} > 1;
  3116         6974  
697              
698 228         290 my $joined = join '|' => @{$bare_phrase{$test}};
  228         465  
699 228         749 $inter_hash{"($joined)"} = $inter_hash{$test} + $inter_hash{$scrap};
700 228         477 $inter_hash{$test} += $inter_hash{$scrap};
701 228 50       1547 delete $inter_hash{$scrap} and next CLEAR;
702             }
703             }
704             }
705             }
706              
707              
708 37         1674 $self->_set_score_hash( \%score_hash );
709 37         2542 $self->_set_inter_hash( \%inter_hash );
710 37         2074 $self->_set_phrs_list( \%full_phrase );
711              
712              
713              
714              
715              
716             #returns a summary array for the given text, in the form of a hash of array-refs:
717             # sentences => a list of full sentences from the given article, scored based on the scores of the words contained therein
718             # fragments => a list of phrase fragments from the given article, scored as above
719             # words => a list of all words in the article, scored by a three-factor system consisting of
720             # (frequency of appearance, population standard deviation, and use in important phrase fragments)
721              
722 37         1389 my %sort_list;
723 37         51 for (keys %{$self->freq_hash}) {
  37         675  
724 1261   50     2464 $sort_list{$_} += $self->freq_hash->{$_} // 0;
725 1261   50     2139 $sort_list{$_} += $self->sigma_hash->{$_} // 0;
726 1261   100     2289 $sort_list{$_} += $self->score_hash->{$_} // 0;
727             }
728              
729 37         136 my %sentences = map { ($_ => $self->phrs_list->{$_}) } sort { $self->phrs_list->{$b} <=> $self->phrs_list->{$a} } keys %{$self->phrs_list};
  1295         2908  
  5431         9278  
  37         1163  
730 37 50       241 my %fragments = map { ($_ => $self->inter_hash->{$_}) } sort { $self->inter_hash->{$b} <=> $self->inter_hash->{$a} or $a cmp $b } keys %{$self->inter_hash};
  1645         3230  
  7316         12632  
  37         504  
731 37 50       567 my %singleton = map { ($_ => $sort_list{$_}) } sort { $sort_list{$b} <=> $sort_list{$a} or $a cmp $b } keys %sort_list;
  1261         2180  
  5994         8557  
732              
733 37         348 my %summary = ( sentences => \%sentences, fragments => \%fragments, words => \%singleton );
734              
735 37         877 $self->_set_summary( \%summary );
736              
737              
738 37 50       1750 if ($self->print_summary) {
739 0         0 say "\n\n———————————————————————————————————————————\n\n";
740              
741              
742 0 0       0 say "[file name] " . $self->file_name if $self->file_name;
743 0         0 say "[text hint] " . $self->text_hint;
744              
745 0         0 say "\n---SUMMARY CHARTS---\n";
746              
747 0         0 my ($sentences, $fragments, $words) = @{$self->summary}{'sentences','fragments','words'};
  0         0  
748              
749 0         0 say "SUMMARY:";
750 0 0       0 my @sentence_keys = sort { $sentences->{$b} <=> $sentences->{$a} or $a cmp $b} keys %$sentences;
  0         0  
751 0         0 for my $sen ( @sentence_keys[0..min($self->return_count,scalar @sentence_keys - 1)] ) {
752 0         0 printf "%4d => %s\n" => $sentences->{$sen}, $sen;
753             }
754 0         0 say "\n";
755              
756              
757 0         0 say "PHRASES:";
758 0 0       0 my @phrase_keys = sort { $fragments->{$b} <=> $fragments->{$a} or $a cmp $b } keys %$fragments;
  0         0  
759 0         0 for my $phrase ( @phrase_keys[0..min($self->return_count,scalar @phrase_keys - 1)] ) {
760 0         0 printf "%8d => %s\n" => $fragments->{$phrase}, $phrase;
761             }
762 0         0 say "\n";
763              
764              
765 0         0 say " WORDS:";
766 0 0       0 my @word_keys = sort { $words->{$b} <=> $words->{$a} or $a cmp $b } keys %$words;
  0         0  
767 0         0 my $highest = $words->{$word_keys[0]};
768 0         0 my $longest = max map {length} @word_keys;
  0         0  
769 0         0 KEY: for my $word ( @word_keys[0..min($self->return_count,scalar @word_keys - 1)] ) {
770 0         0 my $format = "%" . $longest . "s|%s\n";
771 0         0 my $score = int(40*$words->{$word}/$highest);
772 0 0       0 printf $format => ( $word , "-" x $score ) if $score > 2;
773             }
774 0         0 say "\n";
775             }
776              
777              
778 37         1752 return $self;
779             }
780              
781              
782              
783              
784             1;
785             __END__