File Coverage

blib/lib/Clusterize.pm

Criterion	Covered	Total	%
statement	10	46	21.7
branch	0	10	0.0
condition	0	2	0.0
subroutine	4	13	30.7
pod	4	10	40.0
total	18	81	22.2

line	stmt	bran	cond	sub	pod	time	code
1							package Clusterize;
2	3			3		49060	use warnings;
	3					6
	3					88
3	3			3		13	use strict;
	3					4
	3					125
4	3			3		1383	use Clusterize::Pattern;
	3					7
	3					1650
5							our $VERSION = '0.02';
6
7							sub pair {
8	0			0	0	0	my ($self, $key, $val) = @_;
9	0	0				0	$self->{pairs}{$key} = $val if defined $val;
10	0					0	$self->{pairs}{$key};
11							}
12
13							sub delete_pair {
14	0			0	0	0	my ($self, $key) = @_;
15	0					0	delete $self->{pairs}{$key};
16							}
17
18	0			0	0	0	sub cluster_list { keys %{shift->{clusters}} }
	0					0
19							sub remove_cluster_pair {
20	0			0	0	0	my ($self, $digest, $key) = @_;
21	0					0	delete $self->{clusters}{$digest}{$key};
22	0					0	delete $self->{clusters}{$digest}
23	0	0				0	unless %{$self->{clusters}{$digest}};
24							}
25
26							sub add_cluster_pair {
27	0			0	0	0	my ($self, $digest, $pair) = @_;
28	0					0	$self->{clusters}{$digest}{$pair->{key}} = $pair->{val};
29							}
30
31							sub cluster_pairs {
32	0			0	0	0	my ($self, $digest) = @_;
33	0					0	return $self->{clusters}{$digest};
34							}
35
36	1			1	1	710	sub new { return bless {}, shift }
37							sub add_pair {
38	0			0	1		my ($self, $key, $digest) = @_;
39	0	0					return if $self->pair($key);
40	0	0					$digest = Clusterize::Pattern->text2digest($digest)
41							if ref $digest eq 'ARRAY';
42	0						$self->pair($key, $digest);
43	0						for (keys %{$digest}) {
	0
44	0						$self->add_cluster_pair($_, {key => $key, val => $digest->{$_}});
45							}
46							}
47
48							sub remove_pair {
49	0			0	1		my ($self, $key) = @_;
50	0		0				my $cluster_pair = $self->pair($key) \|\| return;
51	0						for (keys %{$cluster_pair}) {$self->remove_cluster_pair($_, $key)}
	0
	0
52	0						$self->delete_pair($key);
53							}
54
55							sub list {
56	0			0	1		my ($self, $opt) = @_;
57	0						my (%md5, @clusters);
58	0						for (map { Clusterize::Pattern->new($self->cluster_pairs($_)) }
	0
59							$self->cluster_list($opt)) {
60	0	0					next if $md5{$_->digest};
61	0						$md5{$_->digest} = 1;
62	0						push @clusters, $_
63							}
64	0						@clusters;
65							}
66
67							1;
68
69							=head1 NAME
70
71							Clusterize - clustering text documents.
72
73							=head1 VERSION
74
75							Version 0.02
76
77							=head1 SYNOPSIS
78
79							use Clusterize;
80
81							my %pairs = (
82							key1 => [ string1, string2, ...stringN ],
83							key2 => [ string5, string6, ...stringM ],
84							...
85							keyN => [ ... ],
86							);
87
88							my $clusterize = Clusterize->new();
89							while (my @pair = each %files) { $clusterize->add_pair(@pair) }
90
91							foreach my $c ( $clusterize->list ) {
92							printf "# /%s/ (digest=%s) (accuracy=%.3f) (size=%d)",
93							$c->pattern, $digest, $c->accuracy, $c->size;
94							my $pairs = $c->pairs;
95							for ( keys %{$pairs} ) { print $_." ".$pairs->{$_} }
96							}
97
98
99							=head1 DESCRIPTION
100
101							B module implements specific algorithm for clustering text documents.
102
103							=head1 PUBLIC METHODS
104
105							=head2 new
106
107							This is the constructor. No parameter is required.
108
109							=head2 add_pair
110
111							This method is used to add new document into cluster set:
112
113							$clusterize->add_pair($key, [$string1, $string2, ...]);
114
115							$key - is uniq name of the document (e.g. filename),
116							[$string1, $string2, ...] - text of the document.
117
118							=head2 remove_pair
119
120							This method is used to remove document from cluster set:
121
122							$clusterize->remove_pair($key);
123
124							$key - is name of the document (e.g. filename).
125
126							=head2 list
127
128							This method is used to get list of built clusters:
129
130							my @clusters = $clusterize->list();
131
132							Returns list of B objects with the following attributes:
133
134							$c->pattern - regexp that matches all strings in the given cluster;
135
136							$c->accuracy - this value reflects how similar strings in the cluster (value from 0 to 1);
137
138							$c->size - how many documents in the cluster;
139
140							$c->digest - MD5 digest of the cluster to identify duplicate clusters;
141
142							$c->pairs - list of { key => $key1, val => $val1 } hash pairs, where:
143							key - is name of document, val - is string from 'key' document;
144
145							=head1 AUTHOR
146
147							Slava Moiseev,
148