File Coverage

blib/lib/Text/Shingle.pm
Criterion Covered Total %
statement 38 47 80.8
branch 5 10 50.0
condition 1 3 33.3
subroutine 8 10 80.0
pod 4 4 100.0
total 56 74 75.6


line stmt bran cond sub pod time code
1             # Copyright 2018 Francesco Nidito. All rights reserved.
2             #
3             # This library is free software; you can redistribute it and/or
4             # modify it under the same terms as Perl itself.
5              
6             package Text::Shingle;
7              
8 1     1   5980 use strict;
  1         2  
  1         29  
9 1     1   17 use Carp;
  1         2  
  1         56  
10 1     1   656 use Unicode::Normalize;
  1         2421  
  1         63  
11 1     1   596 use Text::NGrammer;
  1         78313  
  1         42  
12              
13 1     1   8 use vars qw($VERSION);
  1         3  
  1         491  
14             $VERSION = '0.07';
15              
16             sub new {
17 4     4 1 535 my $class = shift;
18 4         17 my %config = ( w => 2, # shingles length
19             norm => 1, # by default, enable the normalization
20             lang => 'en', # used by the sentencer
21             );
22              
23 4         9 my %param = @_;
24              
25 4         12 for my $opt (keys %param) {
26 2 50       22 croak "option $opt unsupported by version $VERSION of Text::Shingle" unless exists $config{$opt};
27 2 50 33     19 croak "window size cannot be negative or zero" if $opt eq 'w' && $param{w} < 1;
28 2         6 $config{$opt} = $param{$opt};
29             }
30 4         10 $config{version} = $VERSION;
31 4         14 return bless \%config, $class;
32             }
33              
34             sub _shingles_from_ngrams {
35 3     3   5832 my $self = shift;
36 3         10 my @ngrams = @_;
37              
38 3         7 my %shingles = ();
39 3         8 for my $ngram (@ngrams) {
40 16         39 my $shingle = join(' ', sort {$a cmp $b}
41 25 50       122 map { ($self->{norm}) ? NFKC($_) : $_ }
42 11         16 @{$ngram}
  11         19  
43             );
44 11 100       39 $shingles{$shingle} = 1 if (!exists $shingles{$shingle});
45             }
46 3         27 return keys %shingles;
47             }
48              
49             sub shingle_array {
50 0     0 1 0 my $self = shift;
51 0         0 my $n = scalar(@_);
52              
53 0 0       0 return () if $n < $self->{w};
54              
55 0         0 my $ngrammer = Text::NGrammer->new(lang => $self->{lang});
56 0         0 return $self->_shingles_from_ngrams($ngrammer->ngrams_array($self->{w}, @_));
57             }
58              
59             sub shingle_sentence {
60 0     0 1 0 my $self = shift;
61 0         0 my $sentence = shift;
62              
63 0         0 my $ngrammer = Text::NGrammer->new(lang => $self->{lang});
64 0         0 return $self->_shingles_from_ngrams($ngrammer->ngrams_sentence($self->{w}, $sentence));
65             }
66              
67             sub shingle_text {
68 3     3 1 13 my $self = shift;
69 3         7 my $text = shift;
70              
71 3         19 my $ngrammer = Text::NGrammer->new(lang => $self->{lang});
72 3         57 return $self->_shingles_from_ngrams($ngrammer->ngrams_text($self->{w}, $text));
73             }
74              
75             1;
76              
77             __END__