File Coverage

blib/lib/Text/Record/Deduper.pm
Criterion Covered Total %
statement 225 326 69.0
branch 64 114 56.1
condition 4 18 22.2
subroutine 22 27 81.4
pod 7 7 100.0
total 322 492 65.4


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::Record::Deduper - Separate complete, partial and near duplicate text records
4              
5             =head1 SYNOPSIS
6              
7             use Text::Record::Deduper;
8              
9             my $deduper = new Text::Record::Deduper;
10              
11             # Find and remove entire lines that are duplicated
12             $deduper->dedupe_file("orig.txt");
13              
14             # Dedupe comma separated records, duplicates defined by several fields
15             $deduper->field_separator(',');
16             $deduper->add_key(field_number => 1, ignore_case => 1 );
17             $deduper->add_key(field_number => 2, ignore_whitespace => 1);
18             # unique records go to file names_uniqs.csv, dupes to names_dupes.csv
19             $deduper->dedupe_file('names.csv');
20              
21             # Find 'near' dupes by allowing for given name aliases
22             my %nick_names = (Bob => 'Robert',Rob => 'Robert');
23             my $near_deduper = new Text::Record::Deduper();
24             $near_deduper->add_key(field_number => 2, alias => \%nick_names) or die;
25             $near_deduper->dedupe_file('names.txt');
26              
27             # Create a text report, names_report.txt to identify all duplicates
28             $near_deduper->report_file('names.txt',all_records => 1);
29              
30             # Find 'near' dupes in an array of records, returning references
31             # to a unique and a duplicate array
32             my ($uniqs,$dupes) = $near_deduper->dedupe_array(\@some_records);
33              
34             # Create a report on unique and duplicate records
35             $deduper->report_file("orig.txt",all_records => 0);
36              
37              
38             =head1 DESCRIPTION
39              
40             This module allows you to take a text file of records and split it into
41             a file of unique and a file of duplicate records. Deduping of arrays is
42             also possible.
43              
44             Records are defined as a set of fields. Fields may be separated by spaces,
45             commas, tabs or any other delimiter. Records are separated by a new line.
46              
47             If no options are specifed, a duplicate will be created only when all the
48             fields in a record (the entire line) are duplicated.
49              
50             By specifying options a duplicate record is defined by which fields or partial
51             fields must not occur more than once per record. There are also options to
52             ignore case sensitivity, leading and trailing white space.
53              
54             Additionally 'near' or 'fuzzy' duplicates can be defined. This is done by creating
55             aliases, such as Bob => Robert.
56              
57             This module is useful for finding duplicates that have been created by
58             multiple data entry, or merging of similar records
59              
60              
61             =head1 METHODS
62              
63             =head2 new
64              
65             The C method creates an instance of a deduping object. This must be
66             called before any of the following methods are invoked.
67              
68             =head2 field_separator
69              
70             Sets the token to use as the field delimiter. Accepts any character as well as
71             Perl escaped characters such as "\t" etc. If this method ins not called the
72             deduper assumes you have fixed width fields .
73              
74             $deduper->field_separator(',');
75              
76              
77             =head2 add_key
78              
79             Lets you add a field to the definition of a duplicate record. If no keys
80             have been added, the entire record will become the key, so that only records
81             duplicated in their entirity are removed.
82              
83             $deduper->add_key
84             (
85             field_number => 1,
86             key_length => 5,
87             ignore_case => 1,
88             ignore_whitespace => 1,
89             alias => \%nick_names
90             );
91              
92             =over 4
93              
94             =item field_number
95              
96             Specifies the number of the field in the record to add to the key (1,2 ...).
97             Note that this option only applies to character separated data. You will get a
98             warning if you try to specify a field_number for fixed width data.
99              
100             =item start_pos
101              
102             Specifies the position of the field in characters to add to the key. Note that
103             this option only applies to fixed width data. You will get a warning if you
104             try to specify a start_pos for character separated data. You must also specify
105             a key_length.
106              
107             Note that the first column is numbered 1, not 0.
108              
109              
110             =item key_length
111              
112             The length of a key field. This must be specifed if you are using fixed width
113             data (along with a start_pos). It is optional for character separated data.
114              
115             =item ignore_case
116              
117             When defining a duplicate, ignore the case of characters, so Robert and ROBERT
118             are equivalent.
119              
120             =item ignore_whitespace
121              
122             When defining a duplicate, ignore white space that leasd or trails a field's data.
123              
124             =item alias
125              
126             When defining a duplicate, allow for aliases substitution. For example
127              
128             my %nick_names = (Bob => 'Robert',Rob => 'Robert');
129             $near_deduper->add_key(field_number => 2, alias => \%nick_names) or die;
130              
131             Whenever field 2 contains 'Bob', it will be treated as a duplicate of a record
132             where field 2 contains 'Robert'.
133              
134             =back
135              
136              
137             =head2 dedupe_file
138              
139             This method takes a file name F as it's only argument. The file is
140             processed to detect duplicates, as defined by the methods above. Unique records
141             are place in a file named F and duplicates in a file named
142             F. Note that If either of this output files exist, they are
143             over written The orignal file is left intact.
144              
145             $deduper->dedupe_file("orig.txt");
146              
147              
148             =head2 dedupe_array
149              
150             This method takes an array reference as it's only argument. The array is
151             processed to detect duplicates, as defined by the methods above. Two array
152             references are retuned, the first to the set of unique records and the second
153             to the set of duplicates.
154              
155             Note that the memory constraints of your system may prevent you from processing
156             very large arrays.
157              
158             my ($unique_records,duplicate_records) = $deduper->dedupe_array(\@some_records);
159              
160              
161             =head2 report_file
162              
163             This method takes a file name F as it's initial argument.
164              
165             A text report is produced with the following columns
166              
167             record number : the line number of the record
168              
169             key : the key values that define record uniqueness
170              
171             type: the type of record
172             unique : record only occurs once
173             identical : record occurs more than once, first occurence has parent record number of 0
174             alias : record occurs more than once, after alias substitutions have been applied
175              
176             parent record number : the line number of the record that THIS record is a duplicate of.
177              
178             By default, the report file name is F.
179              
180             Various setup options may be defined in a hash that is passed as an optional argument to
181             the C method. Note that all the arguments are optional. They include
182              
183             =over 4
184              
185             =item all_records
186              
187             When this option is set to a positive value, all records will be included in
188             the report. If this value is not set, only the duplicate records will be included
189             in the report
190              
191             =back
192              
193              
194             $deduper->report_file("orig.txt",all_records => 0)
195              
196             =head2 report_array
197              
198             This method takes an array as it's initial argument. The behaviour is the same as
199             C above except that the report file is named F
200              
201             =head1 EXAMPLES
202              
203             =head2 Dedupe an array of single records
204              
205             Given an array of strings:
206              
207             my @emails =
208             (
209             'John.Smith@xyz.com',
210             'Bob.Smith@xyz.com',
211             'John.Brown@xyz.com.au,
212             'John.Smith@xyz.com'
213             );
214              
215             use Text::Record::Deduper;
216              
217             my $deduper = new Text::Record::Deduper();
218             my ($uniq,$dupe);
219             ($uniq,$dupe) = $deduper->dedupe_array(\@emails);
220              
221             The array reference $uniq now contains
222              
223             'John.Smith@xyz.com',
224             'Bob.Smith@xyz.com',
225             'John.Brown@xyz.com.au'
226              
227             The array reference $dupe now contains
228              
229             'John.Smith@xyz.com'
230              
231              
232             =head2 Dedupe a file of fixed width records
233              
234             Given a text file F with space separated values and duplicates defined
235             by the second and third columns:
236              
237             100 Bob Smith
238             101 Robert Smith
239             102 John Brown
240             103 Jack White
241             104 Bob Smythe
242             105 Robert Smith
243              
244              
245             use Text::Record::Deduper;
246              
247             my %nick_names = (Bob => 'Robert',Rob => 'Robert');
248             my $near_deduper = new Text::Record::Deduper();
249             $near_deduper->add_key(start_pos => 5, key_length => 9, ignore_whitespace => 1, alias => \%nick_names) or die;
250             $near_deduper->add_key(start_pos => 14, key_length => 9,) or die;
251             $near_deduper->dedupe_file("names.txt");
252             $near_deduper->report_file("names.txt");
253              
254              
255             Text::Record::Deduper will produce a file of unique records, F
256             in the same directory as F.
257              
258             101 Robert Smith
259             102 John Brown
260             103 Jack White
261             104 Bob Smythe
262            
263              
264             and a file of duplicates, F in the same directory as F
265              
266             100 Bob Smith
267             105 Robert Smith
268              
269             The original file, F is left intact.
270              
271             A report file F, is created in the same directory as F
272              
273             Number Key Type Parent Parent Key
274             --------------------------------------------------------------------------------
275             1 Bob_Smith alias 2 Robert_Smith
276             2 Robert_Smith identical 0
277             3 John_Brown unique 0
278             4 Jack_White unique 0
279             5 Bob_Smythe unique 0
280             6 Robert_Smith identical 2 Robert_Smith
281              
282              
283             =head1 TO DO
284              
285             Allow for multi line records
286             Add batch mode driven by config file or command line options
287             Allow option to warn user when over writing output files
288             Allow user to customise suffix for uniq and dupe output files
289              
290              
291             =head1 SEE ALSO
292              
293             sort(3), uniq(3), L, L, L
294              
295              
296             =head1 AUTHOR
297              
298             Text::Record::Deduper was written by Kim Ryan
299              
300              
301             =head1 COPYRIGHT AND LICENSE
302              
303             Copyright (C) 2011 Kim Ryan.
304              
305              
306             This library is free software; you can redistribute it and/or modify
307             it under the same terms as Perl itself.
308              
309              
310             =cut
311              
312             package Text::Record::Deduper;
313 1     1   25002 use FileHandle;
  1         13962  
  1         7  
314 1     1   439 use File::Basename;
  1         2  
  1         102  
315 1     1   966 use Text::ParseWords;
  1         1524  
  1         70  
316 1     1   1258 use Data::Dumper;
  1         11616  
  1         84  
317              
318              
319              
320 1     1   9 use strict;
  1         2  
  1         36  
321 1     1   6 use warnings;
  1         2  
  1         4009  
322              
323             require Exporter;
324              
325             our @ISA = qw(Exporter);
326             our $VERSION = '0.07';
327              
328              
329             #-------------------------------------------------------------------------------
330             # Create a new instance of a deduping object.
331              
332             sub new
333             {
334 3     3 1 2091 my $class = shift;
335 3         8 my %args = @_;
336              
337 3         9 my $deduper = {};
338 3         8 bless($deduper,$class);
339              
340             # Default to no separator, until we find otherwise
341 3         13 $deduper->{field_separator} = '';
342              
343 3         9 return ($deduper);
344             }
345             #-------------------------------------------------------------------------------
346             # Create a new instance of a deduping object.
347              
348             sub field_separator
349             {
350 1     1 1 7 my $deduper = shift;
351              
352 1         2 my ($field_separator) = @_;
353              
354             # Escape pipe symbol so it does get interpreted as alternation character
355             # when splitting fields in _get_key_fields
356 1 50       5 $field_separator eq '|' and $field_separator = '\|';
357              
358             # add more error checking here
359              
360 1         3 $deduper->{field_separator} = $field_separator;
361 1         2 return($deduper);
362             }
363             #-------------------------------------------------------------------------------
364             # Dewfine a key field in the record
365             sub add_key
366             {
367 4     4 1 25 my $deduper = shift;
368 4         18 my %args = @_;
369              
370              
371 4         9 $deduper->{key_counter}++;
372              
373 4 100       14 if ( $args{field_number} )
    50          
374             {
375 2 50       8 unless ( $deduper->{field_separator} )
376             {
377 0         0 warn "Cannot use field_number on fixed width lines";
378 0         0 return;
379             }
380             }
381             elsif ( $args{start_pos} )
382             {
383 2 50       9 if ( $deduper->{field_separator} )
384             {
385 0         0 warn "Cannot use start_pos on character separated records";
386 0         0 return;
387             }
388             else
389             {
390 2 50       9 unless ( $args{key_length} )
391             {
392 0         0 warn "No key_length defined for start_pos: $args{start_pos}";
393 0         0 return;
394             }
395             }
396             }
397              
398 4         13 foreach my $current_key (keys %args)
399             {
400 15 100       38 if ($current_key eq 'ignore_case' )
401             {
402 4         13 $deduper->{ignore_case}{$deduper->{key_counter}} = 1;
403             }
404 15 100       36 if ($current_key eq 'ignore_whitespace' )
405             {
406 3         11 $deduper->{ignore_whitespace}{$deduper->{key_counter}} = 1;
407             }
408 15 100       33 if ($current_key eq 'alias' )
409             {
410 2 50       6 if ( $args{ignore_case} )
411             {
412             # if ignore case, fold all of alias to upper case
413 2         4 my %current_alias = %{ $args{$current_key} };
  2         10  
414 2         5 my %corrected_alias;
415 2         7 foreach my $current_alias_key ( keys %current_alias )
416             {
417 6         30 $corrected_alias{uc($current_alias_key)} = uc($current_alias{$current_alias_key});
418            
419             }
420 2         10 $deduper->{alias}{$deduper->{key_counter}} = \%corrected_alias;
421             }
422             else
423             {
424 0         0 $deduper->{alias}{$deduper->{key_counter}} = $args{$current_key};
425             }
426             }
427 15 100       69 if ($current_key =~ /field_number|start_pos|key_length/ )
428             {
429 6         26 $deduper->{key}{$deduper->{key_counter}}{$current_key} = $args{$current_key};
430             }
431             }
432              
433 4         25 return ($deduper);
434             }
435             #-------------------------------------------------------------------------------
436             #
437             sub dedupe_file
438             {
439 0     0 1 0 my $deduper = shift;
440 0         0 my ($input_file_name) = @_;
441              
442 0 0 0     0 unless ( -T $input_file_name and -s $input_file_name )
443             {
444 0         0 warn("Could not find input file: $input_file_name");
445 0         0 return(0);
446             }
447              
448 0         0 my $input_fh = new FileHandle "<$input_file_name";
449 0 0       0 unless ($input_fh)
450             {
451 0         0 warn "Could not open input file: $input_file_name";
452 0         0 return(0);
453             }
454              
455 0         0 my ($file_name,$path,$suffix) = File::Basename::fileparse($input_file_name,qr{\..*});
456 0         0 my $file_name_unique_records = "$path$file_name\_uniqs$suffix";
457 0         0 my $file_name_duplicate_records = "$path$file_name\_dupes$suffix";
458              
459             # TO DO!!! test for overwriting of previous Deduper output
460 0         0 my $unique_fh = new FileHandle ">$file_name_unique_records";
461 0 0       0 unless($unique_fh)
462             {
463 0         0 warn "Could not open file: $file_name_unique_records: $!";
464 0         0 return(0);
465             }
466              
467 0         0 my $dupes_fh = new FileHandle ">$file_name_duplicate_records";
468 0 0       0 unless ( $dupes_fh )
469             {
470 0         0 warn "Could not open file: $file_name_duplicate_records: $!";
471 0         0 return(0);
472             }
473              
474 0         0 my ($master_record_index) = $deduper->_analyse('file',undef,$input_fh);
475 0         0 $deduper->_separate('file',$master_record_index,undef,$input_fh,$unique_fh,$dupes_fh);
476              
477 0         0 $input_fh->close;
478 0         0 $unique_fh->close;
479 0         0 $dupes_fh->close;
480             }
481             #-------------------------------------------------------------------------------
482              
483             sub _rewind_file
484             {
485 0     0   0 my ($input_fh) = @_;
486 0         0 $input_fh->seek(0,0); # rewind file
487             }
488             #-------------------------------------------------------------------------------
489             #
490             sub dedupe_array
491             {
492 3     3 1 17 my $deduper = shift;
493              
494 3         5 my ($input_array_ref) = @_;
495 3         10 my ($master_record_index) = $deduper->_analyse('array',$input_array_ref,undef);
496 3         11 my ($uniq_array_ref,$dupe_array_ref) = $deduper->_separate('array',$master_record_index,$input_array_ref,undef);
497              
498 3         21 return($uniq_array_ref,$dupe_array_ref);
499             }
500             #-------------------------------------------------------------------------------
501             # Produce a text report on deduping statistics
502             sub report_file
503             {
504              
505 0     0 1 0 my $deduper = shift;
506 0         0 my ($input_file_name,%report_options) = @_;
507              
508 0 0 0     0 unless ( -T $input_file_name and -s $input_file_name )
509             {
510 0         0 warn("Could not find input file: $input_file_name");
511 0         0 return(0);
512             }
513              
514 0         0 my $input_fh = new FileHandle "<$input_file_name";
515 0 0       0 unless ($input_fh)
516             {
517 0         0 warn "Could not open input file: $input_file_name";
518 0         0 return(0);
519             }
520              
521 0         0 my ($master_record_index_ref) = $deduper->_analyse('file',undef,$input_fh);
522              
523 0         0 my $report_file_name;
524 0 0       0 if ( $report_options{file_name} )
525             {
526             # user has specified name and path to report file
527 0         0 $report_file_name = $report_options{file_name};
528             }
529             else
530             {
531             # use base of input file, append _report.txt to report file name
532 0         0 my ($file_name,$path,$suffix) = File::Basename::fileparse($input_file_name,qr{\..*});
533 0         0 $report_file_name = "$path$file_name\_report\.txt";
534             }
535              
536              
537 0         0 $deduper->_report($master_record_index_ref,$report_file_name,%report_options);
538 0         0 $input_fh->close;
539              
540             }
541             #-------------------------------------------------------------------------------
542             #
543             sub report_array
544             {
545              
546 0     0 1 0 my $deduper = shift;
547 0         0 my ($input_array_ref,%report_options) = @_;
548              
549 0         0 my ($master_record_index_ref) = $deduper->_analyse('array',$input_array_ref,undef);
550              
551 0         0 my $report_file_name;
552 0 0       0 if ( $report_options{file_name} )
553             {
554             # user has specified name and path to report file
555 0         0 $report_file_name = $report_options{file_name};
556             }
557             else
558             {
559             # TO DO, make name more unique, eg add time stamp
560 0         0 $report_file_name = "./deduper_array_report.txt";
561             }
562              
563 0         0 $deduper->_report($master_record_index_ref,$report_file_name,%report_options);
564              
565             }
566             #-------------------------------------------------------------------------------
567             #
568             sub _report
569             {
570 0     0   0 my $deduper = shift;
571 0         0 my ($master_record_index_ref,$report_file_name,%report_options) = @_;
572              
573            
574 0         0 my $report_fh = new FileHandle ">$report_file_name";
575 0 0       0 unless($report_fh)
576             {
577 0         0 warn "Could not open report file: $report_file_name: $!";
578 0         0 return(0);
579             }
580              
581             # TO DO, report format, side be side or interleaved?
582             # options, all records or dupes (default), group all dupes even first
583             # full record dump, not just key
584              
585 0         0 my $current_line = sprintf("%6s %-30.30s %-10.10s %6s %-30.30s\n",
586             'Number', 'Key','Type','Parent','Parent Key');
587 0         0 $report_fh->print($current_line);
588 0         0 $report_fh->print('-' x 80,"\n");
589              
590 0         0 foreach my $record_num ( sort { $a <=> $b } keys %$master_record_index_ref )
  0         0  
591             {
592            
593 0 0 0     0 if ( $report_options{all_records} or
      0        
594             ($master_record_index_ref->{$record_num}->{type} ne 'unique' and
595             $master_record_index_ref->{$record_num}->{parent} > 0 ) )
596             {
597 0         0 my $parent_record_key = '';
598 0 0       0 if ( $master_record_index_ref->{$record_num}->{parent} )
599             {
600 0         0 $parent_record_key = $master_record_index_ref->{$master_record_index_ref->{$record_num}->{parent}}->{key};
601             }
602 0         0 my $current_line = sprintf("%6d %-30.30s %-10.10s %6d %-30.30s\n",
603             $record_num,
604             $master_record_index_ref->{$record_num}->{key},
605             $master_record_index_ref->{$record_num}->{type},
606             $master_record_index_ref->{$record_num}->{parent},
607             $parent_record_key);
608              
609 0         0 $report_fh->print($current_line);
610             }
611             }
612 0         0 $report_fh->close;
613             }
614             #-------------------------------------------------------------------------------
615             #
616             sub _analyse
617             {
618 3     3   5 my $deduper = shift;
619 3         13 my ($storage_type,$input_array_ref,$input_fh) = @_;
620              
621 3         5 my $current_record_number = 0;
622 3         4 my $current_line;
623 3         4 my $finished = 0;
624              
625              
626 3         4 my %alias_candidates;
627 3 50       11 if ( $deduper->{alias} )
628             {
629 3         12 my %all_alias_values = $deduper->_get_all_alias_values;
630 3         8 while ( not $finished )
631             {
632 16         39 ($current_line,$finished) = _read_one_record($storage_type,$current_record_number,$input_array_ref,$input_fh);
633 16         25 $current_record_number++;
634 16         43 my $alias_candidate_key = $deduper->_alias_candidate($current_line,%all_alias_values);
635 16 100 100     81 if ( $alias_candidate_key and not $alias_candidates{$alias_candidate_key} )
636             {
637 2         9 $alias_candidates{$alias_candidate_key} = $current_record_number;
638             }
639             }
640             }
641             # print(Dumper(\%alias_candidates));
642             # die;
643              
644 3         4 my %seen_exact_dupes;
645 3         5 my $unique_ref = [];
646 3         6 my $dupe_ref = [];
647 3         4 $current_record_number = 0;
648 3         4 my %master_record_index;
649              
650 3         4 $finished = 0;
651 3 50 33     12 if ( $storage_type eq 'file' and $deduper->{alias} )
652             {
653 0         0 _rewind_file($input_fh);
654             }
655              
656 3         8 while ( not $finished )
657             {
658 16         32 ($current_line,$finished) = _read_one_record($storage_type,$current_record_number,$input_array_ref,$input_fh);
659 16         20 $current_record_number++;
660              
661 16         17 my $dupe_type;
662 16         36 my %record_keys = $deduper->_get_key_fields($current_line);
663            
664              
665 16         47 %record_keys = $deduper->_transform_key_fields(%record_keys);
666 16         59 my $full_key = _assemble_full_key(%record_keys);
667              
668 16         23 my $parent_record_number;
669 16 100       44 if ( $parent_record_number = $deduper->_alias_dupe(\%alias_candidates,%record_keys) )
    100          
670             {
671 4         5 $dupe_type = 'alias';
672             }
673             # add soundex dupe
674             # add string approx dupe
675             elsif ( $parent_record_number = _exact_dupe($current_line,$full_key,%seen_exact_dupes) )
676             {
677 5         8 $dupe_type = 'identical';
678             }
679             else
680             {
681 7         10 $dupe_type = 'unique';
682             # retain the record number of dupe, useful for detailed reporting and grouping
683 7         15 $seen_exact_dupes{$full_key} = $current_record_number;
684 7         8 $parent_record_number = 0;
685             }
686              
687 16         41 _classify_record($dupe_type,$parent_record_number,$current_record_number,$full_key,\%master_record_index);
688             }
689 3         14 return(\%master_record_index);
690            
691             }
692              
693             #-------------------------------------------------------------------------------
694             sub _alias_candidate
695             {
696 16     16   19 my $deduper = shift;
697 16         31 my ($current_line,%all_alias_values) = @_;
698              
699 16         39 my %record_keys = $deduper->_get_key_fields($current_line);
700 16         54 %record_keys = $deduper->_transform_key_fields(%record_keys);
701            
702 16         31 my $alias_candidate_key = '';
703 16         41 foreach my $current_key ( sort keys %record_keys )
704             {
705              
706 20         32 my $current_key_data = $record_keys{$current_key};
707 20 100       49 if ( $deduper->{alias}{$current_key} )
708             {
709 16 50       38 not $all_alias_values{$current_key} and next;
710 16 100       19 if ( grep(/^$current_key_data$/,@{ $all_alias_values{$current_key} }) )
  16         204  
711             {
712 4         12 $alias_candidate_key .= $current_key_data . ':';
713             }
714             else
715             {
716 12         44 return(0);
717             }
718             }
719             else
720             {
721 4         13 $alias_candidate_key .= $current_key_data . ':';
722             }
723             }
724 4         16 return($alias_candidate_key);
725             }
726             #-------------------------------------------------------------------------------
727             #
728             sub _get_all_alias_values
729             {
730 3     3   4 my $deduper = shift;
731              
732 3         4 my %all_alias_values;
733              
734 3         5 foreach my $key_number ( sort keys %{$deduper->{alias}} )
  3         13  
735             {
736 3         4 my %current_alias = %{ $deduper->{alias}{$key_number} };
  3         35  
737 3         5 my (@current_alias_values,%seen_alias_values);
738 3         8 foreach my $current_alias_value ( values %current_alias )
739             {
740 9 100       26 unless ( $seen_alias_values{$current_alias_value} )
741             {
742 6         8 push(@current_alias_values,$current_alias_value);
743 6         18 $seen_alias_values{$current_alias_value}++;
744             }
745             }
746 3         20 $all_alias_values{$key_number}= [ @current_alias_values ];
747             }
748 3         12 return(%all_alias_values)
749             }
750              
751             #-------------------------------------------------------------------------------
752             #
753              
754             sub _get_key_fields
755             {
756              
757 32     32   36 my $deduper = shift;
758 32         45 my ($current_line) = @_;
759              
760 32         34 my %record_keys;
761              
762              
763 32 50       69 if ( $deduper->{key} )
764             {
765 32 100       89 if ( $deduper->{field_separator} )
766             {
767              
768             # The ParseWords module will not handle single quotes within fields,
769             # so add an escape sequence between any apostrophe bounded by a
770             # letter on each side. Note that this applies even if there are no
771             # quotes in your data, the module needs balanced quotes.
772 12 100       40 if ( $current_line =~ /\w'\w/ )
773             {
774             # check for names with apostrophes, like O'Reilly
775 4         32 $current_line =~ s/(\w)'(\w)/$1\\'$2/g;
776             }
777              
778             # Use ParseWords module to spearate delimited field.
779             # '0' option means don't return any quotes enclosing a field
780 12         45 my (@field_data) = &Text::ParseWords::parse_line($deduper->{field_separator},0,$current_line);
781              
782            
783 12         1427 foreach my $key_number ( sort keys %{$deduper->{key}} )
  12         49  
784             {
785 24         55 my $current_field_data = $field_data[$deduper->{key}->{$key_number}->{field_number} - 1];
786 24 50       52 unless ( $current_field_data )
787             {
788             # A record has less fields then we were expecting, so no
789             # point searching for anymore.
790 0         0 print("Short record\n");
791 0         0 print("Current line : $current_line\n");
792 0         0 print("All fields :", @field_data,"\n");
793 0         0 last;
794             # TO DO, add a warning if user specifies records that must have
795             # a full set of fields??
796             }
797              
798 24 50       56 if ( $deduper->{key}->{$key_number}->{key_length} )
799             {
800 0         0 $current_field_data = substr($current_field_data,0,$deduper->{key}->{$key_number}->{key_length});
801             }
802 24         81 $record_keys{$key_number} = $current_field_data;
803             }
804             }
805             else
806             {
807 20         25 foreach my $key_number ( sort keys %{$deduper->{key}} )
  20         63  
808             {
809 40         196 my $current_field_data = substr($current_line,$deduper->{key}->{$key_number}->{start_pos} - 1,
810             $deduper->{key}->{$key_number}->{key_length});
811 40 50       61 if ( $current_field_data )
812             {
813 40         107 $record_keys{$key_number} = $current_field_data;
814             }
815             else
816             {
817 0         0 print("Short record\n");
818 0         0 print("Current line : $current_line\n");
819 0         0 last;
820             # TO DO, add a warning if user specifies records must have
821             # a full set of fields??
822             }
823             }
824             }
825             }
826             else
827             {
828             # no key fileds defined, use whole line as key
829 0         0 $record_keys{1} = $current_line;
830             }
831 32         314 return(%record_keys);
832             }
833             #-------------------------------------------------------------------------------
834             #
835              
836             sub _transform_key_fields
837             {
838 32     32   43 my $deduper = shift;
839 32         64 my (%record_keys) = @_;
840              
841 32 50       84 if ( $deduper->{ignore_whitespace} )
842             {
843 32         35 foreach my $key_number ( keys %{$deduper->{ignore_whitespace}} )
  32         75  
844             {
845             # strip out leading and/or trailing whitespace
846 44         91 $record_keys{$key_number} =~ s/^\s+//;
847 44         162 $record_keys{$key_number} =~ s/\s+$//;
848             }
849             }
850              
851 32 50       96 if ( $deduper->{ignore_case} )
852             {
853             # Transform every field where ignore_case was specified
854              
855 32         34 foreach my $key_number ( keys %{$deduper->{ignore_case}} )
  32         77  
856             {
857             # If this key is case insensitive, fold data to upper case
858 64         144 $record_keys{$key_number} = uc($record_keys{$key_number});
859             }
860             }
861 32         167 return(%record_keys);
862             }
863             #-------------------------------------------------------------------------------
864             #
865              
866             sub _assemble_full_key
867             {
868 16     16   33 my (%record_keys) = @_;
869 16         16 my $full_key;
870             my @each_key;
871 16         37 foreach my $current_key ( sort keys %record_keys )
872             {
873 32         59 push(@each_key,$record_keys{$current_key});
874             }
875 16         38 $full_key = join('_',@each_key);
876 16         46 return($full_key);
877              
878             }
879             #-------------------------------------------------------------------------------
880             #
881              
882             sub _alias_dupe
883             {
884 16     16   17 my $deduper = shift;
885 16         34 my ($alias_candidates_ref,%record_keys) = @_;
886              
887              
888 16         19 my $alias_dupe = 0;
889 16 50       41 if ( $deduper->{alias} )
890             {
891 16         15 my $alias_was_substituted = 0;
892              
893 16         18 foreach my $key_number ( keys %{$deduper->{alias}} )
  16         38  
894             {
895 16         13 my %current_alias = %{ $deduper->{alias}{$key_number} };
  16         66  
896 16         35 foreach my $current_alias_key ( keys %current_alias )
897             {
898 39 100       123 if ( $record_keys{$key_number} eq $current_alias_key )
899             {
900 8         9 $alias_was_substituted = 1;
901 8         12 $record_keys{$key_number} = $current_alias{$current_alias_key};
902 8         27 last;
903             }
904             }
905             }
906 16 100       43 if ( $alias_was_substituted )
907             {
908 8         10 my $full_key;
909 8         18 foreach my $current_key ( sort keys %record_keys )
910             {
911 16         33 $full_key .= $record_keys{$current_key} . ':';
912             }
913 8 100       22 if ( $alias_candidates_ref->{$full_key} )
914             {
915 4         8 $alias_dupe = $alias_candidates_ref->{$full_key};
916             }
917             }
918             }
919             # returns the number of the orignal unique record for which this current record is an alias dupe of
920 16         63 return($alias_dupe);
921             }
922             #-------------------------------------------------------------------------------
923             #
924              
925             sub _exact_dupe
926             {
927 12     12   16 my $deduper = shift;
928 12         27 my ($full_key,%seen_exact_dupes) = @_;
929             # problem with unitialized value, set to undef??
930 12 100       23 if ( $seen_exact_dupes{$full_key} )
931             {
932 5         17 return($seen_exact_dupes{$full_key});
933             }
934             else
935             {
936 7         20 return(0);
937             }
938             }
939              
940             #-------------------------------------------------------------------------------
941             #
942              
943             sub _read_one_record
944             {
945 48     48   78 my ($storage_type,$current_record_number,$input_array_ref,$input_fh) = @_;
946              
947 48         51 my $finished = 0;
948 48         51 my $current_line;
949              
950 48 50       116 if ( $storage_type eq 'file' )
    50          
951             {
952 0 0       0 if ( $current_line = $input_fh->getline )
953             {
954 0         0 chomp($current_line);
955 0 0       0 if ( $input_fh->eof )
956             {
957 0         0 $finished = 1;
958             }
959             }
960             else
961             {
962 0         0 warn "Could not read line from input file";
963 0         0 $finished = 1;
964             }
965             }
966             elsif ( $storage_type eq 'array' )
967             {
968 48         73 $current_line = @$input_array_ref[$current_record_number];
969 48         70 my $last_element = @$input_array_ref - 1;
970 48 100       128 if ( $current_record_number == $last_element )
    50          
971             {
972 9         13 $finished = 1;
973             }
974             elsif ( $current_record_number > $last_element )
975             {
976 0         0 warn "You are trying to access beyond the input array boundaries";
977 0         0 $finished = 1;
978             }
979             }
980             else
981             {
982 0         0 warn "Illegal storage type";
983 0         0 $finished = 1;
984             }
985 48         125 return($current_line,$finished);
986             }
987              
988              
989             #-------------------------------------------------------------------------------
990             #
991              
992             sub _classify_record
993             {
994 16     16   27 my ($dupe_type,$parent_record_number,$current_record_number,$full_key,$master_record_index) = @_;
995 16         50 $master_record_index->{$current_record_number}{key} = $full_key;
996 16         29 $master_record_index->{$current_record_number}{parent} = $parent_record_number;
997 16         28 $master_record_index->{$current_record_number}{type} = $dupe_type;
998              
999             # If there is a parent, update it now, so that is marked as the first
1000             # dupe in a set (of current type alias, indentical etc). Note that
1001             # a record can be parent to several record types, eg alias and indentical
1002             # Currently only updating from last child, but may want to record all in future.
1003              
1004 16 100       50 if ( $parent_record_number )
1005             {
1006 9         17 $master_record_index->{$parent_record_number}{type} = $dupe_type;
1007 9         102 $master_record_index->{$parent_record_number}{parent} = 0;
1008             }
1009             }
1010              
1011             #-------------------------------------------------------------------------------
1012             #
1013              
1014             sub _separate
1015             {
1016 3     3   36 my $deduper = shift;
1017 3         7 my ($storage_type,$master_record_index_ref,$input_array_ref,$input_fh,$unique_fh,$dupes_fh) = @_;
1018              
1019 3 50       10 if ( $storage_type eq 'file' )
1020             {
1021 0         0 _rewind_file($input_fh);
1022             }
1023              
1024 3         5 my $unique_ref = [];
1025 3         4 my $dupe_ref = [];
1026              
1027              
1028 3         4 my $current_record_number = 0;
1029 3         4 my $current_line;
1030 3         4 my $finished = 0;
1031              
1032              
1033 3         7 while ( not $finished )
1034             {
1035 16         20 my $current_line;
1036 16         26 ($current_line,$finished) = _read_one_record($storage_type,$current_record_number,$input_array_ref,$input_fh);
1037 16         22 $current_record_number++;
1038 16         31 my $dupe_type = $master_record_index_ref->{$current_record_number}{type};
1039 16         26 my $parent_record_number = $master_record_index_ref->{$current_record_number}{parent};
1040              
1041             # The first duplicate in a set of 1 or more dupes (the parent), is treated as a unique record
1042             # TO DO!!! allow user to define this initial dupe as not unique, and group with it's childeren dupes
1043             # TO DO!!! separate out to alias, soundex dupes to their own file if needed
1044 16 100       37 if ( $parent_record_number == 0 )
1045             {
1046 7         11 $dupe_type = 'unique';
1047             }
1048              
1049 16         25 _write_one_record($storage_type,$dupe_type,$current_line,$unique_ref,$dupe_ref,$input_fh,$unique_fh,$dupes_fh);
1050             }
1051 3         10 return($unique_ref,$dupe_ref);
1052              
1053             }
1054             #-------------------------------------------------------------------------------
1055             #
1056              
1057             sub _write_one_record
1058             {
1059 16     16   31 my ($storage_type,$dupe_type,$current_line,$unique_ref,$dupe_ref,$input_fh,$unique_fh,$dupes_fh) = @_;
1060              
1061 16 50       45 if ( $storage_type eq 'file' )
    50          
1062             {
1063              
1064 0 0       0 if ( $dupe_type eq 'unique' )
1065             {
1066 0         0 $unique_fh->print("$current_line\n");
1067             }
1068             else
1069             {
1070 0         0 $dupes_fh->print("$current_line\n");
1071             }
1072             }
1073             elsif ( $storage_type eq 'array' )
1074             {
1075 16 100       28 if ( $dupe_type eq 'unique' )
1076             {
1077 7         25 push(@$unique_ref,$current_line);
1078             }
1079             else
1080             {
1081             # TO DO!!! separate out to alias, soundex dupes etc if needed
1082 9         31 push(@$dupe_ref,$current_line);
1083             }
1084             }
1085             }
1086              
1087              
1088             1;
1089