line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Algorithm::LossyCount; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# ABSTRACT: Memory-efficient approximate frequency count. |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
1005
|
use v5.10; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
52
|
|
6
|
1
|
|
|
1
|
|
597
|
use Algorithm::LossyCount::Entry; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
7
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
68
|
|
8
|
1
|
|
|
1
|
|
909
|
use POSIX qw//; |
|
1
|
|
|
|
|
8123
|
|
|
1
|
|
|
|
|
721
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = 0.03; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub new { |
13
|
2
|
|
|
2
|
1
|
232893
|
my ($class, %params) = @_; |
14
|
|
|
|
|
|
|
|
15
|
2
|
|
66
|
|
|
286
|
my $max_error_ratio = delete $params{max_error_ratio} |
16
|
|
|
|
|
|
|
// Carp::croak('Missing mandatory parameter: "max_error_ratio"'); |
17
|
1
|
50
|
|
|
|
5
|
if (%params) { |
18
|
0
|
|
|
|
|
0
|
Carp::croak( |
19
|
|
|
|
|
|
|
'Unknown parameter(s): ', |
20
|
0
|
|
|
|
|
0
|
join ', ', map { qq/"$_"/ } sort keys %params, |
21
|
|
|
|
|
|
|
) |
22
|
|
|
|
|
|
|
} |
23
|
|
|
|
|
|
|
|
24
|
1
|
50
|
|
|
|
7
|
Carp::croak('max_error_ratio must be positive.') if $max_error_ratio <= 0; |
25
|
|
|
|
|
|
|
|
26
|
1
|
|
|
|
|
32
|
my $self = bless +{ |
27
|
|
|
|
|
|
|
bucket_size => POSIX::ceil(1 / $max_error_ratio), |
28
|
|
|
|
|
|
|
current_bucket => 1, |
29
|
|
|
|
|
|
|
entries => +{}, |
30
|
|
|
|
|
|
|
max_error_ratio => $max_error_ratio, |
31
|
|
|
|
|
|
|
num_samples => 0, |
32
|
|
|
|
|
|
|
num_samples_in_current_bucket => 0, |
33
|
|
|
|
|
|
|
} => $class; |
34
|
|
|
|
|
|
|
|
35
|
1
|
|
|
|
|
5
|
return $self; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub add_sample { |
39
|
14715
|
|
|
14715
|
1
|
91927
|
my ($self, $sample) = @_; |
40
|
|
|
|
|
|
|
|
41
|
14715
|
50
|
|
|
|
32143
|
Carp::croak('add_sample() requires 1 parameter.') unless defined $sample; |
42
|
|
|
|
|
|
|
|
43
|
14715
|
100
|
|
|
|
26269
|
if (defined (my $entry = $self->entries->{$sample})) { |
44
|
8963
|
|
|
|
|
26555
|
$entry->increment_frequency; |
45
|
8963
|
|
|
|
|
15395
|
$entry->num_allowed_errors($self->current_bucket - 1); |
46
|
|
|
|
|
|
|
} else { |
47
|
5752
|
|
|
|
|
10020
|
$self->entries->{$sample} = Algorithm::LossyCount::Entry->new( |
48
|
|
|
|
|
|
|
num_allowed_errors => $self->current_bucket - 1, |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
14715
|
|
|
|
|
22126
|
++$self->{num_samples}; |
53
|
14715
|
|
|
|
|
31671
|
++$self->{num_samples_in_current_bucket}; |
54
|
14715
|
100
|
|
|
|
30869
|
$self->clear_bucket if $self->bucket_is_full; |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub bucket_is_full { |
58
|
14715
|
|
|
14715
|
0
|
30240
|
my ($self) = @_; |
59
|
|
|
|
|
|
|
|
60
|
14715
|
|
|
|
|
24503
|
$self->num_samples_in_current_bucket >= $self->bucket_size; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
14715
|
|
|
14715
|
0
|
94259
|
sub bucket_size { $_[0]->{bucket_size} } |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub clear_bucket { |
66
|
73
|
|
|
73
|
0
|
95
|
my ($self) = @_; |
67
|
|
|
|
|
|
|
|
68
|
73
|
|
|
|
|
97
|
for my $sample (keys %{ $self->entries }) { |
|
73
|
|
|
|
|
122
|
|
69
|
9988
|
|
|
|
|
17148
|
my $entry = $self->entries->{$sample}; |
70
|
9988
|
100
|
|
|
|
22285
|
unless ($entry->survive_in_bucket($self->current_bucket)) { |
71
|
5632
|
|
|
|
|
10300
|
delete $self->entries->{$sample}; |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
} |
74
|
73
|
|
|
|
|
990
|
++$self->{current_bucket}; |
75
|
73
|
|
|
|
|
394
|
$self->{num_samples_in_current_bucket} = 0; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
24703
|
|
|
24703
|
0
|
105919
|
sub current_bucket { $_[0]->{current_bucket} } |
79
|
|
|
|
|
|
|
|
80
|
36281
|
|
|
36281
|
0
|
121588
|
sub entries { $_[0]->{entries} } |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub frequencies { |
83
|
1
|
|
|
1
|
1
|
9
|
my ($self, %params) = @_; |
84
|
|
|
|
|
|
|
|
85
|
1
|
|
50
|
|
|
11
|
my $support = delete $params{support} // 0; |
86
|
1
|
50
|
|
|
|
5
|
if (%params) { |
87
|
0
|
|
|
|
|
0
|
Carp::croak( |
88
|
|
|
|
|
|
|
'Unknown parameter(s): ', |
89
|
0
|
|
|
|
|
0
|
join ', ', map { qq/"$_"/ } sort keys %params, |
90
|
|
|
|
|
|
|
) |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
1
|
|
|
|
|
5
|
my $threshold = ($support - $self->max_error_ratio) * $self->num_samples; |
94
|
120
|
|
|
|
|
216
|
my %frequencies = map { |
95
|
1
|
|
|
|
|
4
|
my $frequency = $self->entries->{$_}->frequency; |
96
|
120
|
50
|
|
|
|
391
|
$frequency < $threshold ? () : ($_ => $frequency); |
97
|
1
|
|
|
|
|
3
|
} keys %{ $self->entries }; |
98
|
1
|
|
|
|
|
21
|
return \%frequencies; |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
|
101
|
3
|
|
|
3
|
1
|
16209
|
sub max_error_ratio { $_[0]->{max_error_ratio} } |
102
|
|
|
|
|
|
|
|
103
|
1
|
|
|
1
|
1
|
5
|
sub num_samples { $_[0]->{num_samples} } |
104
|
|
|
|
|
|
|
|
105
|
14715
|
|
|
14715
|
0
|
42325
|
sub num_samples_in_current_bucket { $_[0]->{num_samples_in_current_bucket} } |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
1; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
__END__ |