package Data::Mining::AssociationRules;

use strict;
use warnings;

BEGIN {
  use Exporter ();
  use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  $AUTHOR = 'Dan Frankowski <dfrankow@winternet.com>';
  @EXPORT = @EXPORT_OK = qw(generate_frequent_sets
                            generate_rules
                            read_frequent_sets
                            read_transaction_file
                            set_debug);
  %EXPORT_TAGS = ();
  @ISA = qw(Exporter);
  $VERSION = 0.1;
}

my $debug = 0;

=head1 NAME

Data::Mining::AssociationRules - Mine association rules and frequent
sets from data.

=head1 SYNOPSIS

  use Data::Mining::AssociationRules;

  my %transaction_map;
  my $transaction_file = "foo.txt";

  read_transaction_file(\%transaction_map, $transaction_file);

  generate_frequent_sets(\%transaction_map, $output_file_prefix,
                         $support_threshold, $max_n);

  generate_rules($output_file_prefix, $support_threshold,
                 $confidence_threshold, $max_n);

  read_frequent_sets($set_map_ref, $file_prefix, $support_threshold, $max_n);

  set_debug(1);

  perl arm.pl -transaction-file foo.txt -support 2 -confidence-threshold 0.01 -max-set-size 6

See also FUNCTIONS, DESCRIPTION, and EXAMPLES below.

=head1 INSTALLATION

The typical:

=over

=item 0 perl Makefile.PL

=item 0 make test

=item 0 make install

=back

=head1 FUNCTIONS

=cut

=pod

=head2 read_transaction_file($transaction_map_ref, $transaction_file)

Read in a transaction map from a file which has lines of two
whitespace-separated columns:

=over

transaction-id item-id

=back
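
For example, given the two lines "234 Orange" and "234 Pear" (see
EXAMPLES below), the resulting map is keyed by item and then by
transaction id (a sketch of the structure, not a separate API):

  read_transaction_file(\%transaction_map, 'foo.txt');
  # $transaction_map{'Orange'}{234} == 1
  # $transaction_map{'Pear'}{234}   == 1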

=cut

sub read_transaction_file {
  my $transaction_map_ref = shift;
  my $transaction_file = shift;

  open(BFILE, $transaction_file) or die "Couldn't open $transaction_file: $!\n";
  while (<BFILE>) {
    my @data = split;
    die "Expected 2 columns, found ", int(@data), "\n" if int(@data) != 2;
    my ($tid, $item) = @data;
    # Count this item in this transaction: item => { transaction-id => count }
    $$transaction_map_ref{$item}{$tid}++;
  }
  close(BFILE);
}

=pod

=head2 generate_frequent_sets($transaction_map_ref, $file_prefix, $support_threshold, $max_n)

Given

=over

=item 0 a map of transactions

=item 0 a file prefix

=item 0 a support threshold

=item 0 a maximum frequent set size to look for (optional)

=back

generate the frequent sets in some files, one file per size of the set.
That is, all 1-sets are in a file, all 2-sets in another, etc.

The files are lines of the form:

=over

support-count item-set

=back

where

=over

=item 0 support-count is the number of transactions in which the item-set appears

=item 0 item-set is one or more space-separated items

=back
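
For example, with file prefix 'foo' and support threshold 1, the 2-sets
land in the file foo-support-1-2set.txt (see nset_filename below), with
lines such as

  1 Banana Orange

taken from the EXAMPLES section.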

=cut

sub generate_frequent_sets {
  my $transaction_map_ref = shift;
  my $file_prefix = shift;
  my $support_threshold = shift;
  my $max_n = shift;

  # Generate 1-sets
  my $n = 1;
  my $out_nset = nset_filename($n, $file_prefix, $support_threshold);
  open(OUT, ">$out_nset") or die "Couldn't open $out_nset for writing: $!\n";
  while (my ($item, $item_map) = each %{$transaction_map_ref}) {
    my $support = int(keys(%$item_map));
    if ($support >= $support_threshold) {
      print OUT "$support $item\n";
    }
  }
  my $num_nsets = int(keys(%{$transaction_map_ref}));
  print STDERR "$num_nsets $n-sets\n" if $debug;
  close(OUT);

  # Generate n-sets
  my $done = 0;
  while ($num_nsets > 0) {
    $n++;
    $num_nsets = 0;

    last if defined($max_n) && ($n > $max_n);

    # Go through (n-1)-sets, pruning as you go
    my $prior_nset = nset_filename($n-1, $file_prefix, $support_threshold);
    open(PRIOR, $prior_nset) or die "Couldn't open $prior_nset: $!\n";
    $out_nset = nset_filename($n, $file_prefix, $support_threshold);
    open(OUT, ">$out_nset") or die "Couldn't open $out_nset: $!\n";
    while (<PRIOR>) {
      my ($count, @set) = split;

      # Create %userset, the intersection of the transaction sets of the items in @set
      my %userset = %{$$transaction_map_ref{$set[0]}};
      foreach my $item ( @set[1 .. $#set] ) {
        while (my ($user, $dummy) = each %userset) {
          if (!exists($$transaction_map_ref{$item}{$user})) {
            delete($userset{$user});
          }
        }
      }

      # For each 1-set, intersect further, and spit out if >= support_threshold
      while (my ($item, $user_set) = each %{$transaction_map_ref}) {
        # Only spit out sets of increasing elements
        # This keeps out duplicates
        my $dup_set = 0;
        foreach my $set_item ( @set ) {
          if ($set_item ge $item) {
            $dup_set = 1;
            last;
          }
        }
        if (!$dup_set) {
          my %newset = %userset;
          while (my ($user, $dummy) = each %newset) {
            if (!exists($$user_set{$user})) {
              delete($newset{$user});
            }
          }
          #print "newset is now " . map_str(\%newset) . "\n";
          my $num_users = int(keys(%newset));
          #print "item $item set @set numusers is $num_users\n";
          if ($num_users >= $support_threshold) {
            print OUT "$num_users @set $item\n";
            $num_nsets++;
          }
        }
      }
    }
    close(PRIOR);
    close(OUT);
    print STDERR "$num_nsets $n-sets\n" if ($num_nsets > 0) && $debug;
    unlink($out_nset) if 0 == $num_nsets;
  }
}

=pod

=head2 read_frequent_sets($set_map_ref, $file_prefix, $support_threshold, $max_n)

Given

=over

=item 0 a set map

=item 0 a file prefix

=item 0 a support threshold

=item 0 a max frequent set size (optional)

=back

read all the frequent sets into a single map, which has as its key the
frequent set (joined by single spaces) and as its value the support.
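
For example, reading the support-1 sets from the EXAMPLES section (a
sketch; 'foo' is an arbitrary file prefix):

  my %set_map;
  read_frequent_sets(\%set_map, 'foo', 1);
  # $set_map{'Orange'}        == 3
  # $set_map{'Banana Orange'} == 1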

=cut

sub read_frequent_sets {
  my $set_map_ref = shift;
  my $file_prefix = shift;
  my $support_threshold = shift;
  my $max_n = shift;

  opendir(DIR, '.') || die "can't opendir '.': $!";
  my @files = grep { /^$file_prefix/ && -f "./$_" } readdir(DIR);
  closedir DIR;

  foreach my $file (@files) {
    # print STDERR "Read file $file ..\n";
    if ( $file =~ /${file_prefix}\-support\-(\d+)\-(\d+)set/ ) {
      my $support = $1;
      my $n = $2;
      next if ($support != $support_threshold)
        || (defined($max_n) && ($n > $max_n));

      open(SETS, $file) or die "Couldn't open $file: $!\n";
      while (<SETS>) {
        my ($count, @set) = split;
        $$set_map_ref{join(' ', @set)} = $count;
      }
      close(SETS);
    }
  }
}

# =pod

# =head2 nset_filename($n, $file_prefix, $support_threshold)

# Given

# =over

# =item 0 a set size

# =item 0 a file prefix

# =item 0 a support threshold

# =back

# return the name of the file that contains the specified frequent sets.

# =cut

sub nset_filename {
  my $n = shift;
  my $file_prefix = shift;
  my $support_threshold = shift;

  return $file_prefix . "-support-" . $support_threshold . "-" . $n . "set.txt";
}

=pod

=head2 generate_rules($file_prefix, $support_threshold, $confidence_threshold, $max_n)

Given

=over

=item 0 a file prefix

=item 0 a support threshold (optional)

=item 0 a confidence threshold (optional)

=item 0 a maximum frequent set size to look for (optional)

=back

create a file with all association rules in it. The output file is of
the form:

  support-count confidence left-hand-set-size right-hand-set-size frequent-set-size left-hand-set => right-hand-set
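
For example, the line

  1 1.000 2 1 3 Banana Orange => Pear

(from the EXAMPLES section) reads: support 1, confidence 1.000, a
2-item left-hand set, a 1-item right-hand set, a 3-item frequent set,
and the rule "Banana Orange => Pear".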

=cut

sub generate_rules {
  my $file_prefix = shift;
  my $support_threshold = shift;
  my $confidence_threshold = shift;
  my $max_n = shift;

  $support_threshold = 1 if !defined($support_threshold);
  $confidence_threshold = 0 if !defined($confidence_threshold);

  my $num_rules = 0;

  # Read in frequent set supports
  my %frequent_set;
  read_frequent_sets(\%frequent_set, $file_prefix, $support_threshold, $max_n);

  die "Found no frequent sets from file prefix $file_prefix support $support_threshold " if (0 == int(keys(%frequent_set)));

  # Go through the sets computing stats
  my $rulefile = $file_prefix . '-support-' . $support_threshold . '-conf-' .
    $confidence_threshold . '-rules.txt';
  open(RULES, ">$rulefile") or die "Couldn't open $rulefile: $!\n";
  while (my ($set, $count) = each %frequent_set) {
    # Traverse all subsets (save full and empty)
    my $support = $frequent_set{$set};
    die "Couldn't find frequent set '$set'" if !defined($support);
    my @set = split('\s+', $set);

    # Each $lhs_selector is a bit mask choosing a non-empty, proper
    # subset of @set as the left-hand side of a rule
    for my $lhs_selector (1..(1 << int(@set)) - 2) {
      my @lhs_set = @set[grep $lhs_selector & 1 << $_, 0..$#set];
      my $all_ones = (1 << int(@set)) - 1;
      my $rhs_selector = $all_ones ^ $lhs_selector;
      my @rhs_set = @set[grep $rhs_selector & 1 << $_, 0..$#set];
      # print "lhs_selector $lhs_selector rhs_selector $rhs_selector\n";

      # print "lhs_set @lhs_set ";
      # print "rhs_set @rhs_set\n";

      my $lhs_set = join(' ', @lhs_set);
      my $rhs_set = join(' ', @rhs_set);

      # Spit out rule
      my $lhs_support = $frequent_set{$lhs_set};
      #my $rhs_support = $frequent_set{$rhs_set};
      die "Couldn't find frequent set '$lhs_set'" if !defined($lhs_support);
      #die "Couldn't find frequent set '$rhs_set'" if !defined($rhs_support);

      # For rule A => B, support = T(AB), conf = T(AB) / T(A)
      my $conf = $support / $lhs_support;

      if ($conf >= $confidence_threshold) {
        $num_rules++;
        print RULES "$support ", sprintf("%.3f ", $conf),
          int(@lhs_set), ' ', int(@rhs_set), ' ', int(@set), ' ',
          "$lhs_set => $rhs_set\n";
      }
    }
  }
  close(RULES);
  print STDERR "$num_rules rules\n" if $debug;
}

sub set_debug {
  $debug = $_[0];
}

1;

=head1 DESCRIPTION

This module contains some functions to do association rule mining from
text files. This sounds obscure, but it really measures beautifully
simple things by counting.

=head2 FREQUENT SETS

Frequent sets answer the question, "Which events occur together more
than N times?"

=head3 The detail

The 'transaction file' contains items in transactions. A set of items
has 'support' s if all the items occur together in at least s
transactions. (In many papers, support is a number between 0 and 1
representing the fraction of total transactions. I found the absolute
number itself more interesting, so I use that instead. Sorry for the
confusion.) For an itemset "A B C", the support is sometimes notated
"T(A B C)" (the number of 'T'ransactions).

A set of items is called a 'frequent set' if it has support at least
the given support threshold. Generating frequent sets produces all
frequent sets, and some information about each set (e.g., its
support).

=head2 RULES

Association rules answer the (related) question, "When these events
occur, how often do those events also occur?"

=head3 The detail

A rule has a left-hand set of items and a right-hand set
of items. A rule "LHS => RHS" with support s and 'confidence' c means
that the underlying frequent set (LHS + RHS) occurred together in at
least s transactions, and of all the transactions LHS occurred in,
RHS also occurred in at least the fraction c (a number from 0 to 1).

Generating rules produces all rules with support at least the given
support threshold, and confidence at least the given confidence
threshold. The confidence is sometimes notated "conf(LHS => RHS) =
T(LHS + RHS) / T(LHS)". There is also related data with each rule
(e.g., the size of its LHS and RHS, the support, the confidence,
etc.).
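
As a worked example from the EXAMPLES section below: T(Orange) = 3 and
T(Orange Pear) = 1, so conf(Orange => Pear) = T(Orange Pear) / T(Orange)
= 1/3, which is the 0.333 in the rule line "1 0.333 1 1 2 Orange => Pear".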

=head3 FREQUENT SETS AND ASSOCIATION RULES ARE GENERALLY USEFUL

Although association rule mining is often described in commercial
terms like "market baskets" or "transactions" (collections of events)
and "items" (events), one can imagine events that make this sort of
counting useful across many domains. Events could be

=over

=item 0 stock market went down at time t

=item 0 patient had symptom X

=item 0 flower petal length was > 5mm

=back

For this reason, I believe counting frequent sets and looking at
association rules to be a fundamental tool of any data miner, anyone
who is looking for patterns in pre-existing data, whether commercial
or not.

=head1 EXAMPLES

Given the following input file:

  234 Orange
  463 Strawberry
  53 Apple
  234 Banana
  412 Peach
  467 Pear
  234 Pear
  147 Pear
  141 Orange
  375 Orange

Generating frequent sets at support threshold 1 (a.k.a. 'at support 1')
produces three files:

The 1-sets:

  1 Strawberry
  1 Banana
  1 Apple
  3 Orange
  1 Peach
  3 Pear

The 2-sets:

  1 Banana Orange
  1 Banana Pear
  1 Orange Pear

The 3-sets:

  1 Banana Orange Pear

Generating the rules at support 1 produces the following:

  1 0.333 1 1 2 Orange => Pear
  1 0.333 1 1 2 Pear => Orange
  1 1.000 1 2 3 Banana => Orange Pear
  1 0.333 1 2 3 Orange => Banana Pear
  1 1.000 2 1 3 Banana Orange => Pear
  1 0.333 1 2 3 Pear => Banana Orange
  1 1.000 2 1 3 Banana Pear => Orange
  1 1.000 2 1 3 Orange Pear => Banana
  1 1.000 1 1 2 Banana => Orange
  1 0.333 1 1 2 Orange => Banana
  1 1.000 1 1 2 Banana => Pear
  1 0.333 1 1 2 Pear => Banana

Generating frequent sets at support 2 produces one file:

  3 Orange
  3 Pear

Generating rules at support 2 produces nothing.

Generating rules at support 1 and confidence 0.5 produces:

  1 1.000 1 2 3 Banana => Orange Pear
  1 1.000 2 1 3 Banana Orange => Pear
  1 1.000 2 1 3 Banana Pear => Orange
  1 1.000 2 1 3 Orange Pear => Banana
  1 1.000 1 1 2 Banana => Orange
  1 1.000 1 1 2 Banana => Pear

Note all the lower confidence rules are gone.
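
The output above can be produced programmatically along these lines (a
sketch; 'foo' is an arbitrary file prefix, and foo.txt holds the input
file shown at the top of this section):

  my %transaction_map;
  read_transaction_file(\%transaction_map, 'foo.txt');
  generate_frequent_sets(\%transaction_map, 'foo', 1);
  generate_rules('foo', 1, 0.5);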

=head1 ALGORITHM

=head2 Generating frequent sets

Generating frequent sets is straight-up Apriori. See for example:

  http://www.almaden.ibm.com/software/quest/Publications/papers/vldb94_rj.pdf

I have not optimized it. It depends on having the transactions all in
memory. However, given that, it still might scale decently (millions
of transactions).
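
In outline (a sketch of what generate_frequent_sets above does, not
additional API):

  # level 1: count each item's transactions; keep items appearing in
  #          at least $support_threshold transactions
  # level n: for each frequent (n-1)-set, intersect its transaction
  #          set with each remaining item's transactions; keep the
  #          extended set if the intersection is large enough
  # stop   : when a level yields no sets, or $n exceeds $max_n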

=head2 Generating rules

Generating rules is a very vanilla implementation. It requires
reading all the frequent sets into memory, which does not scale at
all. Given that, since computers have lots of memory these days, you
might still be able to get away with millions of frequent sets.
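
In outline (again a sketch of the code above): for each frequent set,
every non-empty proper subset is tried as a left-hand side via a bit
mask, the complement becomes the right-hand side, and the rule is kept
when

  conf = T(frequent set) / T(LHS) >= confidence threshold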

=head1 BUGS

There is an existing tool (written in C) to mine frequent sets that I
kept running across:

  http://fuzzy.cs.uni-magdeburg.de/~borgelt/software.html#assoc

I should check it out to see if it is easy or desirable to be
file-level compatible with it.

One could imagine wrapping it in Perl, but the Perl-C/C++ barrier is
where I have encountered all my troubles in the past, so I wouldn't
personally pursue that.

=head1 VERSION

This document describes Data::Mining::AssociationRules version 0.1.

=head1 AUTHOR

  Dan Frankowski
  dfrankow@winternet.com
  http://www.winternet.com/~dfrankow

Hey, if you download this module, drop me an email! That's the fun
part of this whole open source thing.

=head1 LICENSE

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

The full text of the license can be found in the LICENSE file included
in the distribution and available in the CPAN listing for
Data::Mining::AssociationRules (see www.cpan.org or search.cpan.org).

=head1 DISCLAIMER

To the maximum extent permitted by applicable law, the author of this
module disclaims all warranties, either express or implied, including
but not limited to implied warranties of merchantability and fitness
for a particular purpose, with regard to the software and the
accompanying documentation.

=cut