File Coverage

blib/lib/Text/NGrammer.pm
Criterion Covered Total %
statement 58 67 86.5
branch 8 16 50.0
condition n/a
subroutine 9 11 81.8
pod 7 7 100.0
total 82 101 81.1


line stmt bran cond sub pod time code
1             # Copyright 2018 Francesco Nidito. All rights reserved.
2             #
3             # This library is free software; you can redistribute it and/or
4             # modify it under the same terms as Perl itself.
5              
6             package Text::NGrammer;
7              
8 1     1   5982 use strict;
  1         3  
  1         30  
9 1     1   5 use Carp;
  1         3  
  1         49  
10 1     1   527 use Lingua::Sentence;
  1         72387  
  1         43  
11              
12 1     1   7 use vars qw($VERSION);
  1         3  
  1         322  
13             $VERSION = '0.04';
14              
15             sub new {
16 1     1 1 121 my $class = shift;
17 1         4 my %config = (lang => 'en', # used by the sentencer
18             );
19              
20 1         3 my %param = @_;
21              
22 1         5 for my $opt (keys %param) {
23 0 0       0 croak "option $opt unsupported by version $VERSION of Text::NGrammer" unless exists $config{$opt};
24 0         0 $config{$opt} = $param{$opt};
25             }
26 1         3 $config{version} = $VERSION;
27 1         4 return bless \%config, $class;
28             }
29              
30             ##
31             # Skip-grams
32              
33             sub skipgrams_array {
34 11     11 1 17 my $self = shift;
35 11         15 my $n = shift;
36 11         13 my $k = shift;
37 11         26 my $length = scalar(@_);
38              
39 11 50       21 croak "the n-gram length cannot be lesser than 1" if $n < 1;
40 11 50       19 croak "the tokens to be skipped cannot be lesser than 0" if $k < 0;
41              
42 11         16 my @ngrams = ();
43 11         17 my $step = $k + 1;
44 11         31 for (my $i = 0; $i <= ($length - ($n+($k*($n-1)))); $i += 1) {
45 25         70 my @tokens = ();
46 25         35 my $at = $i;
47 25         44 while (@tokens < $n) {
48 64         100 push @tokens, $_[$at];
49 64         105 $at += $step;
50             }
51 25         61 push @ngrams, \@tokens;
52             }
53              
54 11         36 return @ngrams;
55             }
56              
57             sub skipgrams_sentence {
58 11     11 1 18 my $self = shift;
59 11         15 my $n = shift;
60 11         16 my $k = shift;
61 11         16 my $sentence = shift;
62              
63 11 50       28 croak "the n-gram length cannot be lesser than 1" if $n < 1;
64 11 50       20 croak "the tokens to be skipped cannot be lesser than 0" if $k < 0;
65              
66             # splits a string -- assumed to be a sentence -- according to spaces, control chars, etc.
67 11         172 my @tokens = grep /\S+/, split(/(?:\p{C}|\p{M}|\p{P}|\p{S}|\p{Z})+/, $sentence);
68 11 50       35 return () if @tokens < $n;
69 11         28 return $self->skipgrams_array($n, $k, @tokens);
70             }
71              
72             sub skipgrams_text {
73 12     12 1 1475 my $self = shift;
74 12         19 my $n = shift;
75 12         16 my $k = shift;
76 12         20 my $text = shift;
77              
78 12 50       29 croak "the n-gram length cannot be lesser than 1" if $n < 1;
79 12 100       213 croak "the tokens to be skipped cannot be lesser than 0" if $k < 0;
80              
81 11         17 my @ngrams = ();
82              
83 11         39 my $splitter = Lingua::Sentence->new($self->{lang});
84 11         15225 for my $sentence ($splitter->split_array($text)) {
85 11         708 push @ngrams, $self->skipgrams_sentence($n, $k, $sentence);
86             }
87              
88 11         157 return @ngrams;
89             }
90              
91             ##
92             # N-Grams
93              
94             sub ngrams_array {
95 0     0 1 0 my $self = shift;
96 0         0 my $n = shift;
97              
98 0         0 return $self->skipgrams_array($n, 0, @_);
99             }
100              
101             sub ngrams_sentence {
102 0     0 1 0 my $self = shift;
103 0         0 my $n = shift;
104 0         0 my $sentence = shift;
105              
106 0         0 return $self->skipgrams_sentencey($n, 0, $sentence);
107             }
108              
109             sub ngrams_text {
110 3     3 1 333 my $self = shift;
111 3         5 my $n = shift;
112 3         5 my $text = shift;
113              
114 3         8 return $self->skipgrams_text($n, 0, $text);
115             }
116              
117              
118             1;
119              
120             __END__