blib/lib/Set/Partitions/Similarity.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 91 | 104 | 87.5 |
branch | 12 | 28 | 42.8 |
condition | 0 | 3 | 0.0 |
subroutine | 12 | 14 | 85.7 |
pod | 5 | 9 | 55.5 |
total | 120 | 158 | 75.9 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Set::Partitions::Similarity; | ||||||
2 | |||||||
3 | 1 | 1 | 28173 | use strict; | |||
1 | 2 | ||||||
1 | 37 | ||||||
4 | 1 | 1 | 5 | use warnings; | |||
1 | 2 | ||||||
1 | 40 | ||||||
5 | #use Data::Dump qw(dump); | ||||||
6 | |||||||
7 | BEGIN | ||||||
8 | { | ||||||
9 | 1 | 1 | 5 | use Exporter (); | |||
1 | 5 | ||||||
1 | 25 | ||||||
10 | 1 | 1 | 5 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |||
1 | 9 | ||||||
1 | 123 | ||||||
11 | 1 | 1 | 1 | $VERSION = '0.54'; | |||
12 | 1 | 15 | @ISA = qw(Exporter); | ||||
13 | 1 | 2 | @EXPORT = qw(); | ||||
14 | 1 | 3 | @EXPORT_OK = qw(getAccuracy getAccuracyAndPrecision getDistance getPrecision areSubsetsDisjoint); | ||||
15 | 1 | 1283 | %EXPORT_TAGS = (); | ||||
16 | } | ||||||
17 | |||||||
18 | #Routines to measure similarity of partitions. | ||||||
19 | #012345678901234567890123456789012345678901234 | ||||||
20 | |||||||
21 | =head1 NAME | ||||||
22 | |||||||
23 | C |
||||||
24 | |||||||
25 | =head1 SYNOPSIS | ||||||
26 | |||||||
27 | use Set::Partitions::Similarity qw(getAccuracyAndPrecision); | ||||||
28 | use Data::Dump qw(dump); | ||||||
29 | |||||||
30 | # set elements are Perl strings, sets are array references | ||||||
31 | # partitions are nested arrays. | ||||||
32 | dump getAccuracyAndPrecision ([[qw(a b)],[1,2]], [[qw(a b 1)],[2]]); | ||||||
33 | # dumps: | ||||||
34 | # ("0.5", "0.25") | ||||||
35 | |||||||
36 | # a partition is equivalent to itself, even the empty partition. | ||||||
37 | dump getAccuracyAndPrecision ([[1,2], [3,4]], [[2,1], [4,3]]); | ||||||
38 | dump getAccuracyAndPrecision ([], []); | ||||||
39 | # dumps: | ||||||
40 | # (1, 1) | ||||||
41 | # (1, 1) | ||||||
42 | |||||||
43 | # accuracy and precision are symmetric functions. | ||||||
44 | my ($p, $q) = ([[1,2,3], [4]], [[1], [2,3,4]]); | ||||||
45 | dump getAccuracyAndPrecision ($p, $q); | ||||||
46 | dump getAccuracyAndPrecision ($q, $p); | ||||||
47 | # dumps: | ||||||
48 | # ("0.333333333333333", "0.2") | ||||||
49 | # ("0.333333333333333", "0.2") | ||||||
50 | |||||||
51 | # checks partitions and throws an exception. | ||||||
52 | eval { getAccuracyAndPrecision ([[1]], [[1,2]], 1); }; | ||||||
53 | warn $@ if $@; | ||||||
54 | # dumps: | ||||||
55 | # partitions are invalid, they have different set elements. | ||||||
56 | |||||||
57 | =head1 DESCRIPTION | ||||||
58 | |||||||
59 | A partition of a set is a collection of mutually disjoint subsets of the set | ||||||
60 | whose union is the set. C |
||||||
61 | that measure the I |
||||||
62 | assess the performance of a binary clustering algorithm by comparing | ||||||
63 | the clusters the algorithm creates against the correct clusters of test data. | ||||||
64 | |||||||
65 | =head2 Accuracy and Precision | ||||||
66 | |||||||
67 | Let C be a partition of C |
||||||
68 | be the set of all sets of two distinct elements of C |
||||||
69 | The partition C uniquely defines a partitioning of C |
||||||
70 | C |
||||||
71 | occur in the same set in C , and define C |
||||||
72 | |||||||
73 | Given two partitions C and C |
||||||
74 | C<(|C(P) ^ C(Q)| + |D(P) ^ D(Q)|) / (n*(n-1)/2)>, where | | gives the size of a set and | ||||||
75 | ^ represents the intersection operator. The I |
||||||
76 | C<|C(P) ^ C(Q)| / (|C(P) ^ C(Q)| + |C(P) ^ D(Q)| + |D(P) ^ C(Q)|)>. The I |
||||||
77 | I |
||||||
78 | The I |
||||||
79 | The I |
||||||
80 | |||||||
81 | All the methods implemented that compute the I |
||||||
82 | number of elements of the set partitioned. | ||||||
83 | |||||||
84 | =head1 ROUTINES | ||||||
85 | |||||||
86 | =head2 C |
||||||
87 | |||||||
88 | The routine C |
||||||
89 | false otherwise. It can be used to check the validity of a partition. | ||||||
90 | |||||||
91 | =over | ||||||
92 | |||||||
93 | =item C<$Partition> | ||||||
94 | |||||||
95 | The partition is stored as a nested array reference of the form | ||||||
96 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
97 | C<{a,b,1,2}> should be stored as the nested array reference | ||||||
98 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
99 | of a set are represented as Perl strings. | ||||||
100 | |||||||
101 | =back | ||||||
102 | |||||||
103 | An example: | ||||||
104 | |||||||
105 | use Set::Partitions::Similarity qw(areSubsetsDisjoint); | ||||||
106 | use Data::Dump qw(dump); | ||||||
107 | dump areSubsetsDisjoint ([[1,2,3], [4]]); | ||||||
108 | dump areSubsetsDisjoint ([[1,2,3], [4,1]]); | ||||||
109 | # dumps: | ||||||
110 | # "1" | ||||||
111 | # "0" | ||||||
112 | |||||||
113 | =cut | ||||||
114 | |||||||
115 | # a valid partition has all the subsets mutually disjoint. this routine | ||||||
116 | # returns 0 if it finds two distinct subsets have an element in common. | ||||||
117 | # this is done in time linear in the number of elements by computing the | ||||||
118 | # prefix union of the sets in the partition using a hash. | ||||||
119 | sub areSubsetsDisjoint | ||||||
120 | { | ||||||
121 | # the hash %prefixUnionOfSubsets holds the union of elements in each subset | ||||||
122 | # as they are checked for elements that occur in more than one subset. | ||||||
123 | 60 | 60 | 1 | 127 | my %prefixUnionOfSubsets; | ||
124 | 60 | 105 | foreach my $subset (@{$_[0]}) | ||||
60 | 304 | ||||||
125 | { | ||||||
126 | # since it is possible that a subset could contain a repeating element, | ||||||
127 | # first each element is checked without adding it to the hash. | ||||||
128 | 28468 | 49086 | foreach my $element (@$subset) | ||||
129 | { | ||||||
130 | 943332 | 50 | 2169148 | if (exists $prefixUnionOfSubsets{$element}) | |||
131 | { | ||||||
132 | # if the second parameter is true, throw and exception, otherwise return false. | ||||||
133 | 0 | 0 | 0 | 0 | if (defined ($_[1]) && $_[1]) | ||
134 | { | ||||||
135 | 0 | 0 | die "element '$element' occurs in two of the subsets.\n"; | ||||
136 | } | ||||||
137 | else | ||||||
138 | { | ||||||
139 | 0 | 0 | return 0; | ||||
140 | } | ||||||
141 | } | ||||||
142 | } | ||||||
143 | |||||||
144 | # now we can add all the elements to the hash. | ||||||
145 | 28468 | 44660 | foreach my $element (@$subset) | ||||
146 | { | ||||||
147 | 943332 | 1829294 | $prefixUnionOfSubsets{$element} = 1; | ||||
148 | } | ||||||
149 | } | ||||||
150 | |||||||
151 | # the subsets are disjoint, return true. | ||||||
152 | 60 | 427242 | return 1; | ||||
153 | } | ||||||
154 | |||||||
155 | =head2 C |
||||||
156 | |||||||
157 | The routine C |
||||||
158 | two partitions. | ||||||
159 | |||||||
160 | =over | ||||||
161 | |||||||
162 | =item C<$PartitionP, $PartitionQ> | ||||||
163 | |||||||
164 | The partitions are stored as nested array references of the form | ||||||
165 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
166 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
167 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
168 | of a set are represented as Perl strings. | ||||||
169 | |||||||
170 | =item C<$CheckValidity> | ||||||
171 | |||||||
172 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
173 | ensure both partitions are valid and an exception is thrown if they | ||||||
174 | are not. The default is false. | ||||||
175 | |||||||
176 | =back | ||||||
177 | |||||||
178 | An example: | ||||||
179 | |||||||
180 | use Set::Partitions::Similarity qw(getAccuracy); | ||||||
181 | use Data::Dump qw(dump); | ||||||
182 | dump getAccuracy ([[qw(a b)], [qw(c d)]], [[qw(a b c d)]]); | ||||||
183 | dump getAccuracy ([[qw(a b c d)]], [[qw(a b)], [qw(c d)]]); | ||||||
184 | # dumps: | ||||||
185 | # "0.333333333333333" | ||||||
186 | # "0.333333333333333" | ||||||
187 | |||||||
188 | =cut | ||||||
189 | |||||||
190 | sub getAccuracy | ||||||
191 | { | ||||||
192 | 30 | 30 | 1 | 237977 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
193 | |||||||
194 | # get both similarities. | ||||||
195 | 30 | 225 | my ($accuracy, $precision) = getAccuracyAndPrecision ($ReferencePartition, $ModelPartition, $CheckValidity); | ||||
196 | |||||||
197 | # return just the accuracy. | ||||||
198 | 30 | 242 | return $accuracy; | ||||
199 | } | ||||||
200 | |||||||
201 | |||||||
202 | =head2 C |
||||||
203 | |||||||
204 | The routine C |
||||||
205 | two partitions as an array C<(accuracy, precision)>. | ||||||
206 | |||||||
207 | =over | ||||||
208 | |||||||
209 | =item C<$PartitionP, $PartitionQ> | ||||||
210 | |||||||
211 | The partitions are stored as nested array references of the form | ||||||
212 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
213 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
214 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
215 | of a set are represented as Perl strings. | ||||||
216 | |||||||
217 | =item C<$CheckValidity> | ||||||
218 | |||||||
219 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
220 | ensure both partitions are valid and an exception is thrown if they | ||||||
221 | are not. The default is false. | ||||||
222 | |||||||
223 | =back | ||||||
224 | |||||||
225 | An example: | ||||||
226 | |||||||
227 | use Set::Partitions::Similarity qw(getAccuracyAndPrecision); | ||||||
228 | use Data::Dump qw(dump); | ||||||
229 | dump getAccuracyAndPrecision ([[1,2], [3,4]], [[1], [2], [3], [4]]); | ||||||
230 | dump getAccuracyAndPrecision ([[1], [2], [3], [4]], [[1,2], [3,4]]); | ||||||
231 | # dumps: | ||||||
232 | # ("0.666666666666667", 0) | ||||||
233 | # ("0.666666666666667", 0) | ||||||
234 | |||||||
235 | =cut | ||||||
236 | |||||||
237 | sub getAccuracyAndPrecision | ||||||
238 | { | ||||||
239 | 30 | 30 | 1 | 75 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
240 | |||||||
241 | # get the base count of edge types. | ||||||
242 | 30 | 101 | my ($sameRefSameModel, $sameRefDiffModel, $diffRefSameModel, $diffRefDiffModel) = getBaseEdgeCounts ($ReferencePartition, $ModelPartition, $CheckValidity); | ||||
243 | |||||||
244 | # get the total number of bases edges. | ||||||
245 | 30 | 124 | my $baseEdges = $sameRefSameModel + $sameRefDiffModel + $diffRefSameModel; | ||||
246 | |||||||
247 | # if there are no base edges, the precision is one. | ||||||
248 | 30 | 62 | my $precision = 1; | ||||
249 | |||||||
250 | # get the precision. | ||||||
251 | 30 | 50 | 200 | $precision = $sameRefSameModel / $baseEdges if $baseEdges; | |||
252 | |||||||
253 | # get the total number of edges. | ||||||
254 | 30 | 83 | my $totalEdges = $sameRefSameModel + $sameRefDiffModel + $diffRefSameModel + $diffRefDiffModel; | ||||
255 | |||||||
256 | # if there are no edges, the accuracy is one. | ||||||
257 | 30 | 65 | my $accuracy = 1; | ||||
258 | |||||||
259 | # get the accuracy. | ||||||
260 | 30 | 50 | 141 | $accuracy = ($sameRefSameModel + $diffRefDiffModel) / $totalEdges if $totalEdges; | |||
261 | |||||||
262 | 30 | 157 | return ($accuracy, $precision); | ||||
263 | } | ||||||
264 | |||||||
265 | |||||||
266 | =head2 C |
||||||
267 | |||||||
268 | The routine C |
||||||
269 | two partitions, or C<1-getAccuracy($PartitionP, $PartitionQ, $CheckValidity)>. | ||||||
270 | |||||||
271 | =over | ||||||
272 | |||||||
273 | =item C<$PartitionP, $PartitionQ> | ||||||
274 | |||||||
275 | The partitions are stored as nested array references of the form | ||||||
276 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
277 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
278 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
279 | of a set are represented as Perl strings. | ||||||
280 | |||||||
281 | =item C<$CheckValidity> | ||||||
282 | |||||||
283 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
284 | ensure both partitions are valid and an exception is thrown if they | ||||||
285 | are not. The default is false. | ||||||
286 | |||||||
287 | =back | ||||||
288 | |||||||
289 | An example: | ||||||
290 | |||||||
291 | use Set::Partitions::Similarity qw(getDistance); | ||||||
292 | use Data::Dump qw(dump); | ||||||
293 | dump getDistance ([[1,2,3], [4]], [[1], [2,3,4]]); | ||||||
294 | # dumps: | ||||||
295 | # "0.666666666666667" | ||||||
296 | |||||||
297 | =cut | ||||||
298 | |||||||
299 | sub getDistance | ||||||
300 | { | ||||||
301 | 0 | 0 | 1 | 0 | my $accuracy = getAccuracy (@_); | ||
302 | 0 | 0 | 0 | return 1 - $accuracy if defined $accuracy; | |||
303 | 0 | 0 | return undef; | ||||
304 | } | ||||||
305 | |||||||
306 | |||||||
307 | =head2 C |
||||||
308 | |||||||
309 | The routine C |
||||||
310 | two partitions. | ||||||
311 | |||||||
312 | =over | ||||||
313 | |||||||
314 | =item C<$PartitionP, $PartitionQ> | ||||||
315 | |||||||
316 | The partitions are stored as nested array references of the form | ||||||
317 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
318 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
319 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
320 | of a set are represented as Perl strings. | ||||||
321 | |||||||
322 | =item C<$CheckValidity> | ||||||
323 | |||||||
324 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
325 | ensure both partitions are valid and an exception is thrown if they | ||||||
326 | are not. The default is false. | ||||||
327 | |||||||
328 | =back | ||||||
329 | |||||||
330 | An example: | ||||||
331 | |||||||
332 | use Set::Partitions::Similarity qw(getPrecision); | ||||||
333 | use Data::Dump qw(dump); | ||||||
334 | dump getPrecision ([[1,2,3], [4]], [[1], [2,3,4]]); | ||||||
335 | # dumps: | ||||||
336 | # "0.2" | ||||||
337 | |||||||
338 | =cut | ||||||
339 | |||||||
340 | sub getPrecision | ||||||
341 | { | ||||||
342 | 0 | 0 | 1 | 0 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
343 | |||||||
344 | # get both Similarity. | ||||||
345 | 0 | 0 | my ($accuracy, $precision) = getAccuracyAndPrecision ($ReferencePartition, $ModelPartition, $CheckValidity); | ||||
346 | |||||||
347 | # return just the precision. | ||||||
348 | 0 | 0 | return $precision; | ||||
349 | } | ||||||
350 | |||||||
351 | |||||||
352 | sub getBaseEdgeCounts | ||||||
353 | { | ||||||
354 | 30 | 30 | 0 | 63 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
355 | |||||||
356 | # validates the partitions or throws an exception. | ||||||
357 | 30 | 50 | 201 | validatePartitionsOrDie ($ReferencePartition, $ModelPartition) if $CheckValidity; | |||
358 | |||||||
359 | 30 | 199 | my ($sameRefSameModel, $sameRefDiffModel) = getPartitionsEdgeCounts ($ReferencePartition, $ModelPartition); | ||||
360 | 30 | 257 | my ($sameModelSameRef, $diffRefSameModel) = getPartitionsEdgeCounts ($ModelPartition, $ReferencePartition); | ||||
361 | |||||||
362 | # make sure the number of edges for | ||||||
363 | 30 | 50 | 261 | if ($sameRefSameModel != $sameModelSameRef) | |||
364 | { | ||||||
365 | 0 | 0 | die "programming error: computed different values for number of edges in same partitions.\n"; | ||||
366 | } | ||||||
367 | |||||||
368 | # get the number of elements in the universe of the sets. | ||||||
369 | 30 | 66 | my $totalElements; | ||||
370 | { | ||||||
371 | 30 | 104 | my %universe = map { ($_, 1) } map { @$_ } @$ReferencePartition; | ||||
30 | 222 | ||||||
471666 | 1547952 | ||||||
13946 | 327477 | ||||||
372 | 30 | 554477 | $totalElements = scalar keys %universe; | ||||
373 | } | ||||||
374 | |||||||
375 | # get the total edges. | ||||||
376 | 30 | 254 | my $totalEdges = ($totalElements * ($totalElements - 1)) / 2; | ||||
377 | |||||||
378 | 30 | 354 | return ($sameRefSameModel, $sameRefDiffModel, $diffRefSameModel, $totalEdges - $sameRefSameModel - $sameRefDiffModel - $diffRefSameModel); | ||||
379 | } | ||||||
380 | |||||||
381 | |||||||
382 | sub getPartitionsEdgeCounts | ||||||
383 | { | ||||||
384 | 60 | 60 | 0 | 162 | my ($ReferencePartition, $ModelPartition) = @_; | ||
385 | |||||||
386 | 60 | 120 | my %modelId; | ||||
387 | 60 | 427 | for (my $id = 0; $id < @$ModelPartition; $id++) | ||||
388 | { | ||||||
389 | 28468 | 41099 | my $subset = $ModelPartition->[$id]; | ||||
390 | 28468 | 43905 | foreach my $element (@$subset) | ||||
391 | { | ||||||
392 | 943332 | 2213372 | $modelId{$element} = $id; | ||||
393 | } | ||||||
394 | } | ||||||
395 | |||||||
396 | # count the number of edges in the same partitions and the number in the | ||||||
397 | # same reference partitions but difference model partitions. | ||||||
398 | 60 | 169 | my $samePartitions = 0; | ||||
399 | 60 | 119 | my $sameRefDiffModel = 0; | ||||
400 | 60 | 144 | foreach my $subset (@$ReferencePartition) | ||||
401 | { | ||||||
402 | 28468 | 31039 | my %subsetModelPartitionCounts; | ||||
403 | |||||||
404 | # need to ensure the subset elements are unique. | ||||||
405 | { | ||||||
406 | 28468 | 28206 | my %elements; | ||||
28468 | 28361 | ||||||
407 | 28468 | 69247 | for (my $i = 0; $i < @$subset; $i++) | ||||
408 | { | ||||||
409 | 943332 | 50 | 2232888 | unless (exists ($elements{$subset->[$i]})) | |||
410 | { | ||||||
411 | 943332 | 1740494 | $elements{$subset->[$i]} = 1; | ||||
412 | 943332 | 3693942 | ++$subsetModelPartitionCounts{$modelId{$subset->[$i]}}; | ||||
413 | } | ||||||
414 | } | ||||||
415 | } | ||||||
416 | |||||||
417 | # get the sizes of the model partitions of the subset. | ||||||
418 | 28468 | 63875 | my @subsetPartitionSizes = values %subsetModelPartitionCounts; | ||||
419 | |||||||
420 | # count the number of edges having nodes in the same partitions. | ||||||
421 | 28468 | 41403 | foreach my $size (@subsetPartitionSizes) | ||||
422 | { | ||||||
423 | 29074 | 76346 | $samePartitions += ($size * ($size - 1)) / 2; | ||||
424 | } | ||||||
425 | |||||||
426 | # count the number of edges having nodes in the same reference partitions | ||||||
427 | # but different model partitions. | ||||||
428 | 28468 | 36292 | my $prefixSumOfSizes; | ||||
429 | 28468 | 50 | 59053 | $prefixSumOfSizes = $subsetPartitionSizes[0] if @subsetPartitionSizes; | |||
430 | 28468 | 118266 | for (my $i = 1; $i < @subsetPartitionSizes; $i++) | ||||
431 | { | ||||||
432 | 606 | 708 | $sameRefDiffModel += $prefixSumOfSizes * $subsetPartitionSizes[$i]; | ||||
433 | 606 | 1291 | $prefixSumOfSizes += $subsetPartitionSizes[$i]; | ||||
434 | } | ||||||
435 | } | ||||||
436 | |||||||
437 | 60 | 730766 | return ($samePartitions, $sameRefDiffModel); | ||||
438 | } | ||||||
439 | |||||||
440 | |||||||
441 | # for the set partitions to be valid, the union of sets of each partition | ||||||
442 | # must be equal. the routine returns true of they are, false if not. | ||||||
443 | sub doPartitionsHaveSameUnion | ||||||
444 | { | ||||||
445 | 30 | 30 | 0 | 109 | my ($ReferencePartition, $ModelPartition) = @_; | ||
446 | |||||||
447 | # add all the reference elements to the hash. | ||||||
448 | 30 | 249 | my %universe = map { ($_, 1) } map { @$_ } @$ReferencePartition; | ||||
471666 | 1347251 | ||||||
13946 | 253827 | ||||||
449 | |||||||
450 | # now check each subset of the model partition. | ||||||
451 | 30 | 211991 | foreach my $subset (@$ModelPartition) | ||||
452 | { | ||||||
453 | # return 0 if an element from the subset is missing. | ||||||
454 | 14522 | 23685 | foreach my $element (@$subset) | ||||
455 | { | ||||||
456 | 471666 | 50 | 1232215 | return 0 unless exists $universe{$element}; | |||
457 | } | ||||||
458 | |||||||
459 | # delete all the elements in the hash from the subset. | ||||||
460 | 14522 | 22088 | foreach my $element (@$subset) | ||||
461 | { | ||||||
462 | 471666 | 774656 | delete $universe{$element}; | ||||
463 | } | ||||||
464 | } | ||||||
465 | |||||||
466 | # if there are any elements remaining return 0. | ||||||
467 | 30 | 50 | 145 | return 0 if %universe; | |||
468 | |||||||
469 | 30 | 274 | return 1; | ||||
470 | } | ||||||
471 | |||||||
472 | |||||||
473 | # this routine checks that the two partitions have the same union and | ||||||
474 | # each partition is composed for sets that a mutually disjoint. the | ||||||
475 | # routine throws and exception is the partitions are invalid. | ||||||
476 | sub validatePartitionsOrDie | ||||||
477 | { | ||||||
478 | 30 | 30 | 0 | 102 | my ($ReferencePartition, $ModelPartition) = @_; | ||
479 | |||||||
480 | # make sure the reference partition is valid. | ||||||
481 | 30 | 50 | 99 | unless (areSubsetsDisjoint ($ReferencePartition)) | |||
482 | { | ||||||
483 | 0 | 0 | die "first partition is an invalid partition, an element occurs in two or more subsets.\n"; | ||||
484 | } | ||||||
485 | |||||||
486 | # make sure the model partition is valid. | ||||||
487 | 30 | 50 | 228 | unless (areSubsetsDisjoint ($ModelPartition)) | |||
488 | { | ||||||
489 | 0 | 0 | die "second partition is an invalid partition, an element occurs in two or more subsets.\n"; | ||||
490 | } | ||||||
491 | |||||||
492 | # make sure the partitions have the same universe. | ||||||
493 | 30 | 50 | 372 | unless (doPartitionsHaveSameUnion ($ReferencePartition, $ModelPartition)) | |||
494 | { | ||||||
495 | 0 | 0 | die "partitions are invalid, they have different set elements.\n"; | ||||
496 | } | ||||||
497 | |||||||
498 | 30 | 83 | return 1; | ||||
499 | } | ||||||
500 | |||||||
501 | =head1 EXAMPLE | ||||||
502 | |||||||
503 | The code following measures the I |
||||||
504 | equally into subsets of size C<$s> to the entire set. | ||||||
505 | |||||||
506 | use Set::Partitions::Similarity qw(getDistance); | ||||||
507 | my @p = ([0..511]); | ||||||
508 | for (my $s = 1; $s <= 512; $s += $s) | ||||||
509 | { | ||||||
510 | my @q = map { [$s*$_..($s*$_+$s-1)] } (0..(512/$s-1)); | ||||||
511 | print join (', ', $s, getDistance (\@p, \@q, 1)) . "\n"; | ||||||
512 | } | ||||||
513 | # dumps: | ||||||
514 | # 1, 1 | ||||||
515 | # 2, 0.998043052837573 | ||||||
516 | # 4, 0.99412915851272 | ||||||
517 | # 8, 0.986301369863014 | ||||||
518 | # 16, 0.970645792563601 | ||||||
519 | # 32, 0.939334637964775 | ||||||
520 | # 64, 0.876712328767123 | ||||||
521 | # 128, 0.75146771037182 | ||||||
522 | # 256, 0.500978473581213 | ||||||
523 | # 512, 0 | ||||||
524 | |||||||
525 | =head1 INSTALLATION | ||||||
526 | |||||||
527 | To install the module run the following commands: | ||||||
528 | |||||||
529 | perl Makefile.PL | ||||||
530 | make | ||||||
531 | make test | ||||||
532 | make install | ||||||
533 | |||||||
534 | If you are on a windows box you should use 'nmake' rather than 'make'. | ||||||
535 | |||||||
536 | =head1 BUGS | ||||||
537 | |||||||
538 | Please email bugs reports or feature requests to C |
||||||
539 | the web interface at L |
||||||
540 | will be notified and you can be automatically notified of progress on the bug fix or feature request. | ||||||
541 | |||||||
542 | =head1 AUTHOR | ||||||
543 | |||||||
544 | Jeff Kubina |
||||||
545 | |||||||
546 | =head1 COPYRIGHT | ||||||
547 | |||||||
548 | Copyright (c) 2009 Jeff Kubina. All rights reserved. | ||||||
549 | This program is free software; you can redistribute | ||||||
550 | it and/or modify it under the same terms as Perl itself. | ||||||
551 | |||||||
552 | The full text of the license can be found in the | ||||||
553 | LICENSE file included with this module. | ||||||
554 | |||||||
555 | =head1 KEYWORDS | ||||||
556 | |||||||
557 | accuracy, clustering, measure, metric, partitions, precision, set, similarity | ||||||
558 | |||||||
559 | =head1 SEE ALSO | ||||||
560 | |||||||
561 | =begin html | ||||||
562 | |||||||
563 | Concise explainations of many cluster validity measures (including set partition measures) are available on |
||||||
564 | the Cluster validity algorithms page | ||||||
565 | of the Machaon Clustering and Validation Environment web site | ||||||
566 | by Nadia Bolshakova. | ||||||
567 | |||||||
568 | The Wikipedia article Accuracy and precision has a good explaination |
||||||
569 | of the accuracy and precision measures when applied to | ||||||
570 | binary classifications. | ||||||
571 | |||||||
572 | The report Objective Criteria for the Evaluation of Clustering Methods (1971) |
||||||
573 | by W.M. Rand in the Journal of the American Statistical Association provides an excellent analysis of the accuracy | ||||||
574 | measure of partitions. | ||||||
575 | |||||||
576 | =end html | ||||||
577 | |||||||
578 | =cut | ||||||
579 | |||||||
580 | 1; | ||||||
581 | # The preceding line will help the module return a true value | ||||||
582 |