line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package DBIx::DataAudit; |
2
|
3
|
|
|
3
|
|
4215
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
334
|
|
3
|
3
|
|
|
3
|
|
48
|
use Carp qw(croak carp); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
598
|
|
4
|
3
|
|
|
3
|
|
7472
|
use DBI; |
|
3
|
|
|
|
|
77995
|
|
|
3
|
|
|
|
|
397
|
|
5
|
3
|
|
|
3
|
|
4381
|
use parent 'Class::Accessor'; |
|
3
|
|
|
|
|
890
|
|
|
3
|
|
|
|
|
15
|
|
6
|
3
|
|
|
3
|
|
9842
|
use vars '$VERSION'; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
222
|
|
7
|
|
|
|
|
|
|
$VERSION = '0.13'; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 NAME |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
DBIx::DataAudit - summarize column data for a table |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 SYNOPSIS |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use DBIx::DataAudit; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
warn "Running audit for table $table"; |
18
|
|
|
|
|
|
|
my $audit = DBIx::DataAudit->audit( dsn => 'dbi:SQLite:dbname=test.sqlite', table => 'test' ); |
19
|
|
|
|
|
|
|
print $audit->as_text; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# or |
22
|
|
|
|
|
|
|
print $audit->as_html; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This module provides a summary about the data contained in a table. It provides |
25
|
|
|
|
|
|
|
the descriptive statistics for every column. It's surprising |
26
|
|
|
|
|
|
|
how much bad data you find by looking at the minimum and maximum |
27
|
|
|
|
|
|
|
values of a column alone. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
It tries to get the information in one table scan. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 HOW IT WORKS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
The module works by constructing an SQL statement that collects the information |
34
|
|
|
|
|
|
|
about the columns in a single full table scan. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 COLUMN TRAITS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
You can specify which information is collected about every column by specifying the traits. |
39
|
|
|
|
|
|
|
The hierarchy of traits is as follows: |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
any < ordered < numeric |
42
|
|
|
|
|
|
|
< string |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The following traits are collected for every column by default: |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over 4 |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item * C |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
Number of rows in the column |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=item * C |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Number of distinct values in the column |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=item * C |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
Number of C values for the column |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=back |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
For columns that are recognized as ordered, the following additional traits are collected: |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=over 4 |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=item * C |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Minimum value for the column |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=item * C |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
Maximum value for the column |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=back |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
For columns that are recognized as numeric, the following additional traits are collected: |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=over 4 |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item * C |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Average value for the column |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=back |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
For columns that are recognized as string, the following additional traits are collected: |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=over 4 |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item * C |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Number of values that consist only of blanks (C) |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item * C |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Number of values that consist only of the empty string (C<''>) |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=item * C |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Number of values that consist only of the empty string (C<''>), |
101
|
|
|
|
|
|
|
are blank (C) or are C |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=back |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head1 GLOBAL VARIABLES |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
To customize some default behaviour, the some global variables |
110
|
|
|
|
|
|
|
are defined. Read the source to find their names. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
113
|
|
|
|
|
|
|
|
114
|
3
|
|
|
3
|
|
13
|
use vars qw'@default_traits %trait_type %trait_hierarchy $trait_inapplicable %sql_type_map'; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
6181
|
|
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
@default_traits = qw[min max count values null avg blank empty missing ]; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
%trait_type = ( |
119
|
|
|
|
|
|
|
count => ['any','count(%s)'], |
120
|
|
|
|
|
|
|
values => ['any','count(distinct %s)'], |
121
|
|
|
|
|
|
|
null => ['any','sum(case when %s is null then 1 else 0 end)'], |
122
|
|
|
|
|
|
|
min => ['ordered','min(%s)'], |
123
|
|
|
|
|
|
|
max => ['ordered','max(%s)'], |
124
|
|
|
|
|
|
|
avg => ['numeric','avg(%s)'], |
125
|
|
|
|
|
|
|
#modus => ['any','sum(1)group by %s'], # find the element that occurs the most |
126
|
|
|
|
|
|
|
# Possibly with only a single table scan |
127
|
|
|
|
|
|
|
blank => ['string',"sum(case when trim(%s)='' then 1 else 0 end)"], |
128
|
|
|
|
|
|
|
empty => ['string',"sum(case when %s='' then 1 else 0 end)"], |
129
|
|
|
|
|
|
|
missing => ['string',"sum(case when trim(%s)='' then 1 when %s is null then 1 else 0 end)"], |
130
|
|
|
|
|
|
|
); |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
%trait_hierarchy = ( |
133
|
|
|
|
|
|
|
any => [], |
134
|
|
|
|
|
|
|
ordered => ['any'], |
135
|
|
|
|
|
|
|
numeric => ['ordered','any'], |
136
|
|
|
|
|
|
|
string => ['ordered','any'], |
137
|
|
|
|
|
|
|
); |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
$trait_inapplicable = 'NULL'; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
%sql_type_map = ( |
142
|
|
|
|
|
|
|
BIGINT => 'numeric', |
143
|
|
|
|
|
|
|
BOOLEAN => 'any', |
144
|
|
|
|
|
|
|
CHAR => 'string', |
145
|
|
|
|
|
|
|
'CHARACTER VARYING' => 'string', |
146
|
|
|
|
|
|
|
DATETIME => 'ordered', |
147
|
|
|
|
|
|
|
DATE => 'ordered', |
148
|
|
|
|
|
|
|
DECIMAL => 'numeric', |
149
|
|
|
|
|
|
|
ENUM => 'ordered', |
150
|
|
|
|
|
|
|
INET => 'any', |
151
|
|
|
|
|
|
|
INTEGER => 'numeric', |
152
|
|
|
|
|
|
|
INT => 'numeric', |
153
|
|
|
|
|
|
|
NUMERIC => 'numeric', |
154
|
|
|
|
|
|
|
SMALLINT => 'numeric', |
155
|
|
|
|
|
|
|
TEXT => 'string', |
156
|
|
|
|
|
|
|
TIME => 'ordered', |
157
|
|
|
|
|
|
|
'TIMESTAMP WITHOUT TIME ZONE' => 'ordered', |
158
|
|
|
|
|
|
|
TIMESTAMP => 'ordered', |
159
|
|
|
|
|
|
|
TINYINT => 'numeric', |
160
|
|
|
|
|
|
|
'UNSIGNED BIGINT' => 'numeric', |
161
|
|
|
|
|
|
|
VARCHAR => 'string', |
162
|
|
|
|
|
|
|
); |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 METHODS |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
The class implements the following methods: |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=cut |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
__PACKAGE__->mk_accessors(qw(table dbh dsn columns traits results where)); |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head2 C<< __PACKAGE__->audit ARGS >> |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Performs the data audit. Valid arguments are: |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=over 4 |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=item * C
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
Name of the table to audit. No default. |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=item * C |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Array reference to the traits. Default traits are |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
min max count null avg blank empty missing |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item * C |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Names of the columns to audit. Default are all columns of the table. |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=item * C |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
Database handle. If missing, hopefully you have specified the C. |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=item * C |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
DSN to use. Can be omitted if you pass in a valid C instead. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=item * C |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Column information, in the same format as the DBI returns it. |
203
|
|
|
|
|
|
|
By default, this will be read in via DBI. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=back |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub audit { |
210
|
0
|
|
|
0
|
1
|
0
|
my ($class, %args) = @_; |
211
|
|
|
|
|
|
|
|
212
|
0
|
|
0
|
|
|
0
|
$args{traits} ||= [ @default_traits ]; |
213
|
0
|
0
|
|
|
|
0
|
if (! @{$args{traits}}) { |
|
0
|
|
|
|
|
0
|
|
214
|
0
|
|
|
|
|
0
|
$args{traits} = [ @default_traits ]; |
215
|
|
|
|
|
|
|
}; |
216
|
0
|
|
0
|
|
|
0
|
$args{dbh} ||= DBI->connect( $args{dsn}, undef, undef, {RaiseError => 1}); |
217
|
|
|
|
|
|
|
|
218
|
0
|
|
|
|
|
0
|
my $self = \%args; |
219
|
0
|
|
|
|
|
0
|
bless $self => $class; |
220
|
0
|
|
0
|
|
|
0
|
$self->{columns} ||= [$self->get_columns]; |
221
|
0
|
0
|
|
|
|
0
|
if (! @{ $self->{columns}}) { |
|
0
|
|
|
|
|
0
|
|
222
|
0
|
|
|
|
|
0
|
croak "Couldn't retrieve column information for table '$args{table}'. Does your DBD implement ->column_info?"; |
223
|
|
|
|
|
|
|
}; |
224
|
0
|
|
0
|
|
|
0
|
$self->{column_info} ||= $self->collect_column_info; |
225
|
|
|
|
|
|
|
|
226
|
0
|
|
|
|
|
0
|
$self |
227
|
|
|
|
|
|
|
}; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head2 C<< $audit->as_text RESULTS >> |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Returns a table drawn as text with the results. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub as_text { |
236
|
0
|
|
|
0
|
1
|
0
|
my ($self,$results) = @_; |
237
|
|
|
|
|
|
|
|
238
|
0
|
|
|
|
|
0
|
require Text::Table; |
239
|
0
|
|
|
|
|
0
|
my $data = $self->template_data($results); |
240
|
0
|
|
|
|
|
0
|
my $table = Text::Table->new( @{$data->{headings}} ); |
|
0
|
|
|
|
|
0
|
|
241
|
0
|
|
|
|
|
0
|
$table->load( @{$data->{rows}} ); |
|
0
|
|
|
|
|
0
|
|
242
|
|
|
|
|
|
|
|
243
|
0
|
|
|
|
|
0
|
"Data analysis for $data->{table}:\n\n" . $table->table; |
244
|
|
|
|
|
|
|
}; |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=head2 C<< $audit->as_html RESULTS, TEMPLATE >> |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Returns a HTML page with the results. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
You can pass in a custom resultset or C if you want |
251
|
|
|
|
|
|
|
the module to determine the results. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
You can pass in a custom (L) template |
254
|
|
|
|
|
|
|
if you want fancier rendering. |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub as_html { |
259
|
0
|
|
|
0
|
1
|
0
|
my ($self,$results,$template) = @_; |
260
|
0
|
|
|
|
|
0
|
require Template; |
261
|
0
|
|
0
|
|
|
0
|
$template ||= <
|
262
|
|
|
|
|
|
|
Data audit of table '[% table %]' |
263
|
|
|
|
|
|
|
Data audit of table '[% table %]' |
264
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
| [% FOR h IN headings %][%h%] | [%END%]
267
|
|
|
|
|
|
|
| |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
[% FOR r IN rows %] |
270
|
|
|
|
|
|
|
| [% FOR v IN r %][%v FILTER html_entity%] | [%END%]
271
|
|
|
|
|
|
|
[% END %] |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
| |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
TEMPLATE |
276
|
|
|
|
|
|
|
|
277
|
0
|
|
|
|
|
0
|
my $t = Template->new(); |
278
|
0
|
|
|
|
|
0
|
my $data = $self->template_data($results); |
279
|
|
|
|
|
|
|
|
280
|
0
|
0
|
|
|
|
0
|
$t->process(\$template,$data,\my $result) |
281
|
|
|
|
|
|
|
|| croak $t->error; |
282
|
0
|
|
|
|
|
0
|
$result |
283
|
|
|
|
|
|
|
}; |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=head2 C<< $audit->template_data >> |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Returns a hash with the following three keys, suitable |
288
|
|
|
|
|
|
|
for using with whatever templating system you have: |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=over 4 |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=item * |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
C - the name of the table
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=item * |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
C - the headings of the columns |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=item * |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
C - the values of the traits of every column |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=back |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=cut |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
sub template_data { |
309
|
0
|
|
|
0
|
1
|
0
|
my ($self,$results) = @_; |
310
|
0
|
|
0
|
|
|
0
|
$results ||= $self->{results} || $self->run_audit; |
|
|
|
0
|
|
|
|
|
311
|
0
|
|
|
|
|
0
|
my @results = @{ $results->[0] }; |
|
0
|
|
|
|
|
0
|
|
312
|
|
|
|
|
|
|
|
313
|
0
|
|
|
|
|
0
|
my @headings = (@{ $self->traits }); |
|
0
|
|
|
|
|
0
|
|
314
|
0
|
|
|
|
|
0
|
my @rows; |
315
|
0
|
|
|
|
|
0
|
for my $column (@{ $self->columns }) { |
|
0
|
|
|
|
|
0
|
|
316
|
0
|
|
|
|
|
0
|
my @row = $column; |
317
|
0
|
|
|
|
|
0
|
for my $trait (@headings) { |
318
|
0
|
|
|
|
|
0
|
my $val = shift @results; |
319
|
0
|
0
|
|
|
|
0
|
if (defined $val) { |
320
|
0
|
0
|
|
|
|
0
|
if (length($val) > 20) { |
321
|
0
|
|
|
|
|
0
|
$val = substr($val,0,20); |
322
|
|
|
|
|
|
|
}; |
323
|
0
|
|
|
|
|
0
|
$val =~ s/[\x00-\x1f]/./g; |
324
|
|
|
|
|
|
|
}; |
325
|
0
|
0
|
|
|
|
0
|
push @row, defined $val ? $val : 'n/a'; |
326
|
|
|
|
|
|
|
}; |
327
|
0
|
|
|
|
|
0
|
push @rows, \@row; |
328
|
|
|
|
|
|
|
}; |
329
|
|
|
|
|
|
|
|
330
|
0
|
|
|
|
|
0
|
my $res = { |
331
|
|
|
|
|
|
|
table => $self->table, |
332
|
|
|
|
|
|
|
headings => ['column',@headings], |
333
|
|
|
|
|
|
|
rows => \@rows, |
334
|
|
|
|
|
|
|
}; |
335
|
|
|
|
|
|
|
}; |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head2 C<< $audit->run_audit >> |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
Actually runs the SQL in the database. |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=cut |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
sub run_audit { |
344
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
345
|
|
|
|
|
|
|
|
346
|
0
|
|
|
|
|
0
|
my $sql = $self->get_sql; |
347
|
0
|
|
|
|
|
0
|
$self->{results} = $self->dbh->selectall_arrayref($sql,{}); |
348
|
|
|
|
|
|
|
}; |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head2 C<< $audit->column_type COLUMN >> |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
Returns the type for the column. The four valid types are C, C, C and C. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=cut |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
sub column_type { |
357
|
0
|
|
|
0
|
1
|
0
|
my ($self,$column) = @_; |
358
|
0
|
0
|
|
|
|
0
|
if (! $self->{column_info}) { |
359
|
0
|
|
|
|
|
0
|
$self->{column_info} = $self->collect_column_info; |
360
|
|
|
|
|
|
|
}; |
361
|
0
|
|
|
|
|
0
|
my $info = $self->{column_info}; |
362
|
0
|
|
|
|
|
0
|
map { |
363
|
0
|
|
|
|
|
0
|
$_->{trait_type}; |
364
|
0
|
|
|
|
|
0
|
} grep { $_->{COLUMN_NAME} eq $column } @$info; |
365
|
|
|
|
|
|
|
}; |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=head2 C<< $audit->get_columns TABLE >> |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Returns the names of the columns for the table C.
370
|
|
|
|
|
|
|
By default, the value of C will be taken from the value
371
|
|
|
|
|
|
|
passed to the constructor C. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=cut |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
sub get_columns { |
376
|
0
|
|
|
0
|
1
|
0
|
my ($self,$table) = @_; |
377
|
0
|
|
0
|
|
|
0
|
$table ||= $self->table; |
378
|
0
|
0
|
|
|
|
0
|
if (! $self->{column_info}) { |
379
|
0
|
|
|
|
|
0
|
$self->{column_info} = $self->collect_column_info; |
380
|
|
|
|
|
|
|
}; |
381
|
0
|
|
|
|
|
0
|
my $info = $self->{column_info}; |
382
|
0
|
|
|
|
|
0
|
my @sorted = @$info; |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Order the columns in the "right" order, if possible |
385
|
0
|
0
|
0
|
|
|
0
|
if (exists $sorted[0]->{ORDINAL_POSITION} && defined $sorted[0]->{ORDINAL_POSITION}) { |
386
|
0
|
|
|
|
|
0
|
@sorted = sort { $a->{ORDINAL_POSITION} <=> $b->{ORDINAL_POSITION} } @sorted; |
|
0
|
|
|
|
|
0
|
|
387
|
|
|
|
|
|
|
}; |
388
|
0
|
|
|
|
|
0
|
map { |
389
|
0
|
|
|
|
|
0
|
$_->{COLUMN_NAME}; |
390
|
|
|
|
|
|
|
} @sorted; |
391
|
|
|
|
|
|
|
}; |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=head2 C<< $audit->collect_column_info TABLE >> |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
Collects the information about the columns for the table C
396
|
|
|
|
|
|
|
from the DBI. By default, C will be taken from the
397
|
|
|
|
|
|
|
value passed to the constructor C. |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
If your database driver does not implement the C<< ->column_info >> |
400
|
|
|
|
|
|
|
method you are out of luck. A fatal error is raised by this method |
401
|
|
|
|
|
|
|
if C<< ->column_info >> does not return anything. |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
This method will raise warnings if it encounters a data type that |
404
|
|
|
|
|
|
|
it doesn't know yet. You can either patch the |
405
|
|
|
|
|
|
|
global variable C<%sql_type_map> to add the type or submit a patch |
406
|
|
|
|
|
|
|
to me to add the type and its interpretation. |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=cut |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
sub collect_column_info { |
411
|
0
|
|
|
0
|
1
|
0
|
my ($self,$table) = @_; |
412
|
0
|
|
0
|
|
|
0
|
$table ||= $self->table; |
413
|
0
|
|
|
|
|
0
|
my $schema; |
414
|
0
|
0
|
|
|
|
0
|
if ($table =~ s/^(.*)\.//) { |
415
|
0
|
|
|
|
|
0
|
$schema = $1; |
416
|
|
|
|
|
|
|
}; |
417
|
0
|
|
|
|
|
0
|
my $sth = $self->dbh->column_info(undef,$schema,$table,undef); |
418
|
0
|
0
|
|
|
|
0
|
if (! $sth) { |
419
|
0
|
0
|
|
|
|
0
|
if( $schema ) { |
420
|
0
|
|
|
|
|
0
|
$schema= "$schema."; |
421
|
|
|
|
|
|
|
} else { |
422
|
0
|
|
|
|
|
0
|
$schema= ''; |
423
|
|
|
|
|
|
|
}; |
424
|
0
|
|
|
|
|
0
|
croak "Couldn't collect column information for table '$schema$table'. Does your DBD implement ->column_info?"; |
425
|
|
|
|
|
|
|
}; |
426
|
0
|
|
|
|
|
0
|
my $info = $sth->fetchall_arrayref({}); |
427
|
|
|
|
|
|
|
|
428
|
0
|
0
|
|
|
|
0
|
if( !@$info ) { |
429
|
0
|
|
|
|
|
0
|
croak "'$schema$table' seems to have no columns. Does your DBD implement ->column_info?"; |
430
|
|
|
|
|
|
|
}; |
431
|
|
|
|
|
|
|
|
432
|
0
|
|
|
|
|
0
|
for my $i (@$info) { |
433
|
0
|
|
|
|
|
0
|
my $sqltype = uc $i->{TYPE_NAME}; |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# Fix for Pg - convert enum types to "ENUM": |
436
|
0
|
0
|
0
|
|
|
0
|
if (exists $i->{pg_enum_values} && defined $i->{pg_enum_values}) { |
437
|
0
|
|
|
|
|
0
|
$sqltype = 'ENUM'; |
438
|
|
|
|
|
|
|
}; |
439
|
|
|
|
|
|
|
|
440
|
0
|
0
|
|
|
|
0
|
if (not exists $sql_type_map{ $sqltype }) { |
441
|
0
|
|
|
|
|
0
|
warn sprintf q{Unknown SQL data type '%s' for column "%s.%s"; some traits will be unavailable\n}, |
442
|
|
|
|
|
|
|
$sqltype, $table, $i->{COLUMN_NAME}; |
443
|
|
|
|
|
|
|
}; |
444
|
0
|
|
0
|
|
|
0
|
$i->{trait_type} = $sql_type_map{ $sqltype } || 'any'; |
445
|
|
|
|
|
|
|
}; |
446
|
|
|
|
|
|
|
|
447
|
0
|
|
|
|
|
0
|
$info |
448
|
|
|
|
|
|
|
}; |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head2 C<< $audit->get_sql TABLE >> |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
Creates the SQL statement to collect the information. |
453
|
|
|
|
|
|
|
The default value for C will be the table passed
454
|
|
|
|
|
|
|
to the constructor C. |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
If you encounter errors from your SQL engine, you may want |
457
|
|
|
|
|
|
|
to print the result of this method out. |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=cut |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
sub get_sql { |
462
|
0
|
|
|
0
|
1
|
0
|
my ($self,$table) = @_; |
463
|
0
|
|
0
|
|
|
0
|
$table ||= $self->table; |
464
|
0
|
|
|
|
|
0
|
my @columns = @{ $self->columns }; |
|
0
|
|
|
|
|
0
|
|
465
|
0
|
|
|
|
|
0
|
my @traits = @{$self->traits}; |
|
0
|
|
|
|
|
0
|
|
466
|
|
|
|
|
|
|
|
467
|
0
|
|
|
|
|
0
|
my @resultset; |
468
|
0
|
|
|
|
|
0
|
for my $column (@columns) { |
469
|
0
|
|
|
|
|
0
|
for my $trait (@traits) { |
470
|
0
|
|
|
|
|
0
|
my $name = "${column}_${trait}"; |
471
|
0
|
|
|
|
|
0
|
$name =~ s/"//g; # unquote quoted columns |
472
|
0
|
0
|
|
|
|
0
|
if ($self->trait_applies( $trait, $column )) { |
473
|
0
|
|
|
|
|
0
|
my $tmpl = $trait_type{$trait}->[1]; |
474
|
0
|
|
|
|
|
0
|
$tmpl =~ s/%s/$column/g; |
475
|
0
|
|
|
|
|
0
|
push @resultset, "$tmpl as $name"; |
476
|
|
|
|
|
|
|
} else { |
477
|
0
|
|
|
|
|
0
|
push @resultset, "NULL as $name"; |
478
|
|
|
|
|
|
|
}; |
479
|
|
|
|
|
|
|
}; |
480
|
|
|
|
|
|
|
}; |
481
|
0
|
0
|
|
|
|
0
|
my $where = $self->where ? "WHERE " . $self->where : ''; |
482
|
0
|
|
|
|
|
0
|
my $statement = sprintf "SELECT %s FROM %s\n%s", join("\n ,", @resultset), $table, $where; |
483
|
0
|
|
|
|
|
0
|
return $statement |
484
|
|
|
|
|
|
|
}; |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=head2 C<< $audit->trait_applies TRAIT, COLUMN >> |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Checks whether a trait applies to a column. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
A trait applies to a column if the trait type is C |
491
|
|
|
|
|
|
|
or if it is the same type as the column type as returned |
492
|
|
|
|
|
|
|
by C. |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
The method will raise an error if it is passed an unknown |
495
|
|
|
|
|
|
|
trait name. See the source code for how to add custom |
496
|
|
|
|
|
|
|
traits. |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=cut |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
sub trait_applies { |
501
|
36
|
|
|
36
|
1
|
9370
|
my ($self, $trait, $column) = @_; |
502
|
36
|
50
|
|
|
|
94
|
if (not exists $trait_type{$trait}) { |
503
|
0
|
|
|
|
|
0
|
carp "Unknown trait '$trait'"; |
504
|
|
|
|
|
|
|
}; |
505
|
36
|
|
50
|
|
|
114
|
my $trait_type = $trait_type{$trait}->[0] || ''; |
506
|
|
|
|
|
|
|
|
507
|
36
|
100
|
|
|
|
81
|
return 1 if ($trait_type eq 'any'); |
508
|
|
|
|
|
|
|
|
509
|
24
|
|
|
|
|
68
|
(my $type) = $self->column_type($column); |
510
|
24
|
|
|
|
|
69
|
my @subtypes = @{ $trait_hierarchy{ $type } }; |
|
24
|
|
|
|
|
56
|
|
511
|
|
|
|
|
|
|
|
512
|
24
|
|
|
|
|
34
|
return scalar grep { $trait_type eq $_ } ($type,@subtypes); |
|
54
|
|
|
|
|
132
|
|
513
|
|
|
|
|
|
|
}; |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=head1 COMMAND LINE USAGE |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
You can use this mail from the command line if you need a quick check of data: |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
perl -MDBIx::DataAudit=dbi:SQLite:dbname=some/db.sqlite my_table [traits] |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
This could also incredibly useful if you want a breakdown of a csv-file: |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
perl -MDBIx::DataAudit=dbi:AnyData:dbname=some/db.sqlite my_table [traits] |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Unfortunately, that does not work yet, as I haven't found a convenient |
526
|
|
|
|
|
|
|
oneliner way to make a CSV file appear as database. |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=cut |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
sub import { |
531
|
2
|
|
|
2
|
|
20
|
my ($class, $dsn) = @_; |
532
|
2
|
|
|
|
|
5
|
(my $target) = caller; |
533
|
2
|
50
|
33
|
|
|
53
|
if ($target eq 'main' and $dsn) { |
534
|
0
|
|
|
|
|
|
my ($table,@traits) = @ARGV; |
535
|
0
|
|
|
|
|
|
my @tables = split /,/,$table; |
536
|
0
|
0
|
|
|
|
|
if (! @traits) { |
537
|
0
|
|
|
|
|
|
@traits = @default_traits; |
538
|
|
|
|
|
|
|
}; |
539
|
0
|
|
|
|
|
|
for my $table (@tables) { |
540
|
0
|
|
|
|
|
|
my $self = $class->audit(dsn => $dsn, table => $table, traits => \@traits); |
541
|
0
|
|
|
|
|
|
print "Data audit for table '$table'\n\n"; |
542
|
0
|
|
|
|
|
|
print $self->as_text; |
543
|
|
|
|
|
|
|
}; |
544
|
|
|
|
|
|
|
}; |
545
|
|
|
|
|
|
|
}; |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
1; |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
__END__ |
| | | | | | |