File Coverage

blib/lib/Algorithm/LossyCount.pm
Criterion Covered Total %
statement 52 56 92.8
branch 11 16 68.7
condition 3 5 60.0
subroutine 15 15 100.0
pod 5 11 45.4
total 86 103 83.5


line stmt bran cond sub pod time code
1             package Algorithm::LossyCount;
2              
3             # ABSTRACT: Memory-efficient approximate frequency count.
4              
5 1     1   1005 use v5.10;
  1         3  
  1         52  
6 1     1   597 use Algorithm::LossyCount::Entry;
  1         2  
  1         26  
7 1     1   6 use Carp;
  1         2  
  1         68  
8 1     1   909 use POSIX qw//;
  1         8123  
  1         721  
9              
10             our $VERSION = 0.03;
11              
12             sub new {
13 2     2 1 232893 my ($class, %params) = @_;
14              
15 2   66     286 my $max_error_ratio = delete $params{max_error_ratio}
16             // Carp::croak('Missing mandatory parameter: "max_error_ratio"');
17 1 50       5 if (%params) {
18 0         0 Carp::croak(
19             'Unknown parameter(s): ',
20 0         0 join ', ', map { qq/"$_"/ } sort keys %params,
21             )
22             }
23              
24 1 50       7 Carp::croak('max_error_ratio must be positive.') if $max_error_ratio <= 0;
25              
26 1         32 my $self = bless +{
27             bucket_size => POSIX::ceil(1 / $max_error_ratio),
28             current_bucket => 1,
29             entries => +{},
30             max_error_ratio => $max_error_ratio,
31             num_samples => 0,
32             num_samples_in_current_bucket => 0,
33             } => $class;
34              
35 1         5 return $self;
36             }
37              
38             sub add_sample {
39 14715     14715 1 91927 my ($self, $sample) = @_;
40              
41 14715 50       32143 Carp::croak('add_sample() requires 1 parameter.') unless defined $sample;
42              
43 14715 100       26269 if (defined (my $entry = $self->entries->{$sample})) {
44 8963         26555 $entry->increment_frequency;
45 8963         15395 $entry->num_allowed_errors($self->current_bucket - 1);
46             } else {
47 5752         10020 $self->entries->{$sample} = Algorithm::LossyCount::Entry->new(
48             num_allowed_errors => $self->current_bucket - 1,
49             );
50             }
51              
52 14715         22126 ++$self->{num_samples};
53 14715         31671 ++$self->{num_samples_in_current_bucket};
54 14715 100       30869 $self->clear_bucket if $self->bucket_is_full;
55             }
56              
57             sub bucket_is_full {
58 14715     14715 0 30240 my ($self) = @_;
59              
60 14715         24503 $self->num_samples_in_current_bucket >= $self->bucket_size;
61             }
62              
63 14715     14715 0 94259 sub bucket_size { $_[0]->{bucket_size} }
64              
65             sub clear_bucket {
66 73     73 0 95 my ($self) = @_;
67              
68 73         97 for my $sample (keys %{ $self->entries }) {
  73         122  
69 9988         17148 my $entry = $self->entries->{$sample};
70 9988 100       22285 unless ($entry->survive_in_bucket($self->current_bucket)) {
71 5632         10300 delete $self->entries->{$sample};
72             }
73             }
74 73         990 ++$self->{current_bucket};
75 73         394 $self->{num_samples_in_current_bucket} = 0;
76             }
77              
78 24703     24703 0 105919 sub current_bucket { $_[0]->{current_bucket} }
79              
80 36281     36281 0 121588 sub entries { $_[0]->{entries} }
81              
82             sub frequencies {
83 1     1 1 9 my ($self, %params) = @_;
84              
85 1   50     11 my $support = delete $params{support} // 0;
86 1 50       5 if (%params) {
87 0         0 Carp::croak(
88             'Unknown parameter(s): ',
89 0         0 join ', ', map { qq/"$_"/ } sort keys %params,
90             )
91             }
92              
93 1         5 my $threshold = ($support - $self->max_error_ratio) * $self->num_samples;
94 120         216 my %frequencies = map {
95 1         4 my $frequency = $self->entries->{$_}->frequency;
96 120 50       391 $frequency < $threshold ? () : ($_ => $frequency);
97 1         3 } keys %{ $self->entries };
98 1         21 return \%frequencies;
99             }
100              
101 3     3 1 16209 sub max_error_ratio { $_[0]->{max_error_ratio} }
102              
103 1     1 1 5 sub num_samples { $_[0]->{num_samples} }
104              
105 14715     14715 0 42325 sub num_samples_in_current_bucket { $_[0]->{num_samples_in_current_bucket} }
106              
107             1;
108              
109             __END__