File Coverage

blib/lib/Pg/Explain/StringAnonymizer.pm

Criterion	Covered	Total	%
statement	115	115	100.0
branch	16	20	80.0
condition			n/a
subroutine	21	21	100.0
pod	6	6	100.0
total	158	162	97.5

line	stmt	bran	sub	pod	time	code
1						package Pg::Explain::StringAnonymizer;
2
3						# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
4	80		80		121810	use v5.18;
	80				442
5	80		80		694	use strict;
	80				289
	80				2946
6	80		80		506	use warnings;
	80				190
	80				6021
7	80		80		535	use warnings qw( FATAL utf8 );
	80				158
	80				4893
8	80		80		1117	use utf8;
	80				527
	80				1670
9	80		80		4128	use open qw( :std :utf8 );
	80				1758
	80				651
10	80		80		17291	use Unicode::Normalize qw( NFC );
	80				5346
	80				5316
11	80		80		1461	use Unicode::Collate;
	80				14514
	80				2970
12	80		80		1144	use Encode qw( decode );
	80				20759
	80				13814
13
14						if ( grep /\P{ASCII}/ => @ARGV ) {
15						@ARGV = map { decode( 'UTF-8', $_ ) } @ARGV;
16						}
17
18						# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
19
20	80		80		4665	use Carp;
	80				161
	80				6371
21	80		80		50298	use Digest::SHA qw( sha1 );
	80				309833
	80				135217
22
23						=head1 NAME
24
25						Pg::Explain::StringAnonymizer - Class to anonymize sets of strings
26
27						=head1 VERSION
28
29						Version 2.9
30
31						=cut
32
33						our $VERSION = '2.9';
34
35						=head1 SYNOPSIS
36
37						This module provides a way to turn defined set of strings into anonymized version of it, that has 4 properties:
38
39						=over
40
41						=item * the same original string should give the same output string (within the same input set)
42
43						=item * strings shouldn't be very long
44
45						=item * it shouldn't be possible to reverse the operation
46
47						=item * generated strings should be easy to read, and easy to distinguish between themselves.
48
49						=back
50
51						Points first and third can be done easily with some hashing function (md5, sha), but generated hashes violate fourth point, and sometimes also second.
52
53						Example of usage:
54
55						my $anonymizer = Pg::Explain::StringAnonymizer->new();
56						$anonymizer->add( 'a', 'b', 'c');
57						$anonymizer->add( 'depesz' );
58						$anonymizer->add( [ "any strings, "are possible" ] );
59						$anonymizer->finalize();
60
61						print $anonymizer->anonymized( 'a' ), "\n";
62
63						my $full_dictionary = $anonymizer->anonymization_dictionary();
64
65						=head1 METHODS
66
67						=head2 new
68
69						Object constructor, doesn't take any arguments.
70
71						=cut
72
73						sub new {
74	18		18	1	152158	my $class = shift;
75	18				62	my $self = bless {}, $class;
76	18				119	$self->{ 'strings' } = {};
77	18				71	return $self;
78						}
79
80						=head2 add
81
82						Adds new string(s) to anonymization list.
83
84						Strings can be given either as list of ArrayRef.
85
86						It is important to note, that one cannot add() more elements to anonymized set after finalization (call to finalize() method).
87
88						If such call will be made (add() after finalize()) it will raise exception.
89
90						=cut
91
92						sub add {
93	682		682	1	3211	my $self = shift;
94	682	100			2097	croak( "Cannot run ->add() after finalization.\n" ) if $self->{ 'is_finalized' };
95
96	681				1997	my @input = @_;
97	681	100			1810	@input = @{ $input[ 0 ] } if 'ARRAY' eq ref( $input[ 0 ] );
	3				9
98	681				1651	for my $string ( @input ) {
99	705	100			2728	next if $self->{ 'strings' }->{ $string };
100	135				472	$self->{ 'strings' }->{ $string } = $self->_hash( $string );
101						}
102	681				2991	return;
103						}
104
105						=head2 finalize
106
107						Finalizes string set creation, and creates anonymized versions.
108
109						It has to be called after some number of add() calls, so that it will have something to work on.
110
111						After running finalize() one cannot add() more string.
112
113						Also, before finalize() you cannot run anonymized() or anonymization_dictionary() methods.
114
115						=cut
116
117						sub finalize {
118	18		18	1	13432	my $self = shift;
119	18	50			119	return if $self->{ 'is_finalized' };
120	18				66	$self->{ 'is_finalized' } = 1;
121
122						$self->_make_prefixes(
123	18				47	'keys' => [ keys %{ $self->{ 'strings' } } ],
	18				256
124						'level' => 0,
125						);
126
127	18				133	$self->_stringify();
128
129	18				109	my @keys_sorted = sort { length( $b ) <=> length( $a ) } keys %{ $self->{ 'strings' } };
	393				884
	18				191
130	18				81	$self->{ 'keys_re' } = join '\|', map { qr{\Q$_\E} } @keys_sorted;
	135				1492
131
132	18				234	return;
133						}
134
135						=head2 anonymized
136
137						Returns anonymized version of given string, or undef if the string wasn't previously added to anonymization set.
138
139						If it will be called before finalize() it will raise exception.
140
141						=cut
142
143						sub anonymized {
144	700		700	1	18017	my $self = shift;
145	700	100			2146	croak( "Cannot run ->anonymized() before finalization.\n" ) unless $self->{ 'is_finalized' };
146	699				1372	my $input = shift;
147	699				4312	return $self->{ 'strings' }->{ $input };
148						}
149
150						=head2 anonymize_text
151
152						Anonymize given text using loaded dictionary of substiturions.
153
154						=cut
155
156						sub anonymize_text {
157	3		3	1	9	my $self = shift;
158	3				8	my $text = shift;
159	3				12	my $re = $self->{ 'keys_re' };
160	3				399	$text =~ s{(\b\|\s)($re)(\b\|\s)}{ $1 . $self->{'strings'}->{$2} . $3 }mge;
	30				156
161	3				34	return $text;
162						}
163
164						=head2 anonymization_dictionary
165
166						Returns hash reference containing all input strings and their anonymized versions, like:
167
168						{
169						'original1' => 'anon1',
170						'original2' => 'anon2',
171						...
172						'originalN' => 'anonN',
173						}
174
175						If it will be called before finalize() it will raise exception.
176
177						=cut
178
179						sub anonymization_dictionary {
180	2		2	1	1055	my $self = shift;
181	2	100			13	croak( "Cannot run ->anonymization_dictionary() before finalization.\n" ) unless $self->{ 'is_finalized' };
182	1				5	return $self->{ 'strings' };
183						}
184
185						=head1 INTERNAL METHODS
186
187						=head2 _hash
188
189						Converts given string into array of 32 integers in range 0..31.
190
191						This is done by taking sha1 checksum of string, splitting it into 32 5-bit
192						long "segments", and transposing each segment into integer.
193
194						=cut
195
196						sub _hash {
197	135		135		228	my $self = shift;
198	135				581	my $input = shift;
199
200	135				1239	my $hash = sha1( $input );
201
202						# sha1() (20 bytes) to 32 integers (0..31) transformation thanks to
203						# mauke and LeoNerd on #perl on irc.freenode.net
204
205	135				624	my $binary_hash = unpack( "B*", $hash );
206	135				2315	my @segments = unpack "(a5)*", $binary_hash;
207	135				397	return [ map { oct "0b$_" } @segments ];
	4320				8935
208						}
209
210						=head2 _word
211
212						Returns n-th word from number-to-word translation dictionary.
213
214						=cut
215
216						sub _word {
217	207		207		405	my $self = shift;
218	207				335	my $n = shift;
219	207	50			432	$n = 0 unless defined $n;
220	207	50			463	$n = 0 if $n < 0;
221	207	50			426	$n = 31 if $n > 31;
222	207				1101	my @words = qw(
223						alpha bravo charlie delta
224						echo foxtrot golf hotel
225						india juliet kilo lima
226						mike november oscar papa
227						quebec romeo sierra tango
228						uniform victor whiskey xray
229						yankee zulu two three
230						four five six seven
231						);
232	207				811	return $words[ $n ];
233						}
234
235						=head2 _make_prefixes
236
237						Scan given keys, and changes their values (in ->{'strings'} hash) to
238						shortest unique prefix.
239
240						=cut
241
242						sub _make_prefixes {
243	48		48		93	my $self = shift;
244	48				215	my %args = @_;
245
246	48				108	my $S = $self->{ 'strings' };
247
248	48				153	my %unique_ints = ();
249
250	48				122	for my $key ( @{ $args{ 'keys' } } ) {
	48				118
251	207				430	my $KA = $S->{ $key };
252	207				469	my $interesting_int = $KA->[ $args{ 'level' } ];
253	207				673	$unique_ints{ $interesting_int }++;
254						}
255
256						# At this moment, we know how many times given int happened at given
257						# level, so we can make sensible decisions
258
259	48				95	my %to_redo = ();
260
261	48				91	for my $key ( @{ $args{ 'keys' } } ) {
	48				106
262	207				470	my $KA = $S->{ $key };
263	207				386	my $interesting_int = $KA->[ $args{ 'level' } ];
264	207	100			496	if ( 1 == $unique_ints{ $interesting_int } ) {
265	135				202	splice @{ $KA }, 1 + $args{ 'level' };
	135				423
266	135				291	next;
267						}
268	72				106	push @{ $to_redo{ $interesting_int } }, $key;
	72				198
269						}
270
271						# In to_redo, we have blocks of keys, that share prefix (up to given
272						# level), so they have to be further processed.
273
274	48				185	for my $key_group ( values %to_redo ) {
275						$self->_make_prefixes(
276						'keys' => $key_group,
277	30				124	'level' => $args{ 'level' } + 1,
278						);
279						}
280
281	48				192	return;
282						}
283
284						=head2 _stringify
285
286						Converts arrays of ints (prefixes for hashed words) into strings
287
288						=cut
289
290						sub _stringify {
291	18		18		47	my $self = shift;
292
293	18				38	for my $key ( keys %{ $self->{ 'strings' } } ) {
	18				111
294	135				309	my $ints = $self->{ 'strings' }->{ $key };
295	135				237	my @words = map { $self->_word( $_ ) } @{ $ints };
	207				456
	135				249
296	135				652	$self->{ 'strings' }->{ $key } = join( '_', @words );
297						}
298						}
299
300						=head1 AUTHOR
301
302						hubert depesz lubaczewski, C<< >>
303
304						=head1 BUGS
305
306						Please report any bugs or feature requests to C.
307
308						=head1 SUPPORT
309
310						You can find documentation for this module with the perldoc command.
311
312						perldoc Pg::Explain::StringAnonymizer
313
314						=head1 COPYRIGHT & LICENSE
315
316						Copyright 2008-2023 hubert depesz lubaczewski, all rights reserved.
317
318						This program is free software; you can redistribute it and/or modify it
319						under the same terms as Perl itself.
320
321
322						=cut
323
324						1; # End of Pg::Explain::StringAnonymizer