File Coverage

blib/lib/Tie/Hash/Abbrev/BibRefs.pm
Criterion Covered Total %
statement 37 164 22.5
branch 2 92 2.1
condition 0 12 0.0
subroutine 12 29 41.3
pod 10 10 100.0
total 61 307 19.8


line stmt bran cond sub pod time code
1             package Tie::Hash::Abbrev::BibRefs;
2              
3             =head1 NAME
4              
5             Tie::Hash::Abbrev::BibRefs - match bibliographic references to the original titles
6              
7             =head1 SYNOPSIS
8              
9             use Tie::Hash::Abbrev::BibRefs;
10              
11             tie my %hash, 'Tie::Hash::Abbrev::BibRefs',
12             preprocess => sub { s/\s+[[:upper:]]:.*// },
13             stopwords => [ qw( a and de del der des di
14             et for für i if in la las
15             of on part Part Pt. Sect.
16             the to und ) ],
17             exceptions => { jpn => 'japan',
18             natl => 'national' };
19              
20             $hash{'Physical Review B'} = '0163-1829';
21              
22             print $hash{'Phys. Rev. B: Condens. Matter Mater. Phys.'};
23             # will print '0163-1829'
24              
25             =head1 DESCRIPTION
26              
27             This module is an attempt to ease the mapping of often abbreviated
28             bibliographical references to the original titles.
29              
30             To achieve this, it simplyfies the title according to parameterizable rules and
31             stores it as a I.
32              
33             When accessing the hash, the key given is also L
34             and compared to the normalized version of the original title.
35             In addition, each word (words are separated by whitespace) may be abbreviated by
36             specifying only the first few letters.
37              
38             If more than one matching hash entry is found, the values of all matching
39             entries are compared; as long as they are all
40             Lual (or all L), the
41             lookup is still considered to be successful.
42              
43             =head1 KEY NORMALIZATION
44              
45             The process of normalization is implemented as follows:
46              
47             =over 4
48              
49             =item 1.
50              
51             execute any preprocessing code (see L), which is
52             expected to operate on C<$_>.
53             You can use subroutine references or strings here; strings will be
54             L.
55              
56             =item 2.
57              
58             split the key into parts (at whitespace).
59              
60             =item 3.
61              
62             remove any parts contained in the list of stopwords
63             (see L).
64              
65             =item 4.
66              
67             replace any parts contained in the list of exceptions
68             by their corresponding value.
69             If the value is L, the entire part will be removed.
70             (In the L, "Jpn" would be replaced by "japan".)
71             This lookup is done case-insensitively.
72              
73             =item 5.
74              
75             remove any non-word characters at the end of each part or followed by a dash
76              
77             =back
78              
79             =cut
80              
81 1     1   27468 use strict;
  1         3  
  1         46  
82 1     1   6 use vars '$VERSION';
  1         2  
  1         61  
83              
84 1     1   7 use Carp 'croak';
  1         7  
  1         93  
85              
86             $VERSION = 0.02;
87              
88 1     1   7 use constant DATA => 0;
  1         2  
  1         59  
89 1     1   7 use constant I => 1;
  1         2  
  1         59  
90 1     1   6 use constant PREPROCESS => 2;
  1         1  
  1         51  
91 1     1   7 use constant STOPWORDS => 3;
  1         1  
  1         84  
92 1     1   7 use constant EXCEPTIONS => 4;
  1         2  
  1         60  
93 1     1   6 use constant DEBUG => 5;
  1         2  
  1         2689  
94              
95             sub TIEHASH {
96 1 50   1   17 croak 'Odd number of arguments.' unless @_ & 1;
97 1         3 my $package = shift;
98 1 50       5 $package = ref $package if length ref $package;
99 1         4 my $self = bless [], $package;
100 1         7 $self->[DATA] = [];
101 1         4 while (@_) {
102 0         0 my ( $option, $argument ) = splice @_, 0, 2;
103 0 0       0 if ( $option eq 'debug' ) { $self->debug($argument) }
  0 0       0  
    0          
    0          
104 0         0 elsif ( $option =~ /^exceptions?\z/ ) { $self->exceptions($argument) }
105 0         0 elsif ( $option eq 'preprocess' ) { $self->preprocess($argument) }
106             elsif ( $option =~ /^stopwords?\z/ ) {
107 0 0       0 $self->stopwords( ref $argument ? @$argument : $argument );
108             }
109 0         0 else { croak qq(Unknown TIEHASH option "$option"!) }
110             }
111 1         5 $self;
112             }
113              
114             sub FETCH {
115 0     0   0 my ( $self, $key ) = @_;
116 0 0       0 if ( defined( my $found = $self->find($key) ) ) { $self->[DATA][$found] }
  0         0  
117 0         0 else { undef }
118             }
119              
120             sub STORE {
121 0     0   0 my ( $self, $key, $value ) = @_;
122 0 0       0 if (
123             defined $self->exact(
124             $key, my $pos = $self->pos( my $normkey = $self->normalize($key) )
125             )
126             )
127             {
128 0         0 $self->[DATA][ $pos + 1 ] = $value;
129             }
130 0         0 else { splice @{ $self->[DATA] }, $pos, 0, $normkey, $value, $key }
  0         0  
131             }
132              
133             sub EXISTS {
134 0     0   0 my ( $self, $key ) = @_;
135 0 0       0 if ( defined $self->find($key) ) { 1 }
  0         0  
136 0         0 else { '' }
137             }
138              
139             sub DELETE {
140 0     0   0 my ( $self, $key ) = @_;
141 0         0 my $pos = $self->pos( my $normkey = $self->normalize($key) );
142 0 0       0 if ( defined $self->exact( $key, $pos ) ) {
143 0         0 ( undef, my $value ) = splice @{ $self->[DATA] }, $pos, 3;
  0         0  
144 0         0 $self->startover;
145 0         0 $value;
146             }
147 0         0 else { undef }
148             }
149              
150             sub CLEAR {
151 0     0   0 my ($self) = @_;
152 0         0 $self->startover;
153 0         0 @{ $self->[DATA] } = ();
  0         0  
154             }
155              
156             sub FIRSTKEY {
157 0     0   0 my ($self) = @_;
158 0 0       0 return undef unless @{ $self->[DATA] };
  0         0  
159 0         0 $self->[ $self->[I] = 2 ];
160             }
161              
162             sub NEXTKEY {
163 0     0   0 my ( $self, $lastkey ) = @_;
164 0 0       0 if ( ( my $i = $self->[I] += 3 ) <= $#{ $self->[DATA] } ) {
  0         0  
165 0         0 $self->[DATA][$i];
166             }
167             else {
168 0         0 $self->startover;
169 0         0 undef;
170             }
171             }
172              
173 0     0   0 sub UNTIE { }
174              
175 1     1   853 sub DESTROY { shift->startover }
176              
177             =head1 ADDITIONAL METHODS
178              
179             =head2 debug
180              
181             turn debug mode on (when given a true value as argument) or off
182             (when given a false value).
183             Returns the (possibly new) value.
184              
185             In debug mode, the L method will print debug messages to STDERR.
186              
187             =cut
188              
189             sub debug {
190 0     0 1 0 my $self = shift;
191 0 0       0 $self->[DEBUG] = shift if @_;
192 0         0 $self->[DEBUG];
193             }
194              
195             =head2 delete_abbrev
196              
197             my @deleted = tied(%hash)->delete_abbrev('foo','bar');
198              
199             Will delete all elements on the basis of all unambiguous abbreviations given as
200             arguments and return a (possibly empty) list of all deleted values.
201              
202             =cut
203              
204             sub delete_abbrev {
205 0     0 1 0 my $self = shift;
206 0         0 my @deleted;
207 0         0 for (@_) {
208             next
209             unless
210 0 0       0 defined( my $pos1 = $self->valid( $_, my $pos = $self->pos($_) ) );
211 0         0 my $i = 0;
212 0         0 push @deleted, grep $i++ & 1, splice @{ $self->[DATA] }, $pos,
  0         0  
213             3 + $pos1 - $pos;
214             }
215 0 0       0 $self->startover if @deleted;
216 0         0 @deleted;
217             }
218              
219             =head2 exceptions
220              
221             get or set the exceptions table for the hash.
222             Expects hash references or L, which clears the table.
223             Returns a reference to the new exception table.
224              
225             =cut
226              
227             sub exceptions {
228 0     0 1 0 my $self = shift;
229 0         0 for (@_) {
230 0 0       0 if (defined) {
231 0         0 while ( my ( $k, $v ) = each %$_ ) {
232 0         0 $self->[EXCEPTIONS]{ lc $k } = lc $v;
233             }
234             }
235 0         0 else { $self->[EXCEPTIONS] = {} }
236             }
237 0 0       0 $self->[EXCEPTIONS] || {};
238             }
239              
240             =head2 preprocess
241              
242             set up the preprocessing code chain for the hash.
243             Any code references or strings will be added to the chain,
244             an L will clear the chain.
245              
246             =cut
247              
248             sub preprocess {
249 0     0 1 0 my $self = shift;
250 0         0 for (@_) {
251 0 0       0 if (defined) { push @{ $self->[PREPROCESS] }, $_ }
  0         0  
  0         0  
252 0         0 else { @{ $self->[PREPROCESS] } = [] }
  0         0  
253             }
254 0 0       0 @{ $self->[PREPROCESS] || [] };
  0         0  
255             }
256              
257             =head2 stopwords
258              
259             get or set the /stopwords for the hash.
260             Any arguments given will be added to the list of stopwords.
261             An L> as argument will clear the list of stopwords.
262             The method returns the new list of stopwords (in an unsorted manner).
263              
264             =cut
265              
266             sub stopwords {
267 0     0 1 0 my $self = shift;
268 0         0 for (@_) {
269 0 0       0 if (defined) { $self->[STOPWORDS]{$_} = undef }
  0         0  
270 0         0 else { $self->[STOPWORDS] = {} }
271             }
272 0 0       0 keys %{ $self->[STOPWORDS] || {} };
  0         0  
273             }
274              
275             =head1 INTERNAL METHODS
276              
277             The following methods should usually not be called "from the outside";
278             the main intention of ducumenting them is that the author still wants to
279             understand his own module in case changes will be neccessary later. :o)
280              
281             =head2 exact
282              
283             expects a key as first and a L as second argument.
284             Returns the position if the given key equals (case-insensitively) the real key
285             stored at that position or undef if not.
286              
287             =cut
288              
289             sub exact {
290 0     0 1 0 my ( $self, $key, $pos ) = @_;
291 0 0 0     0 if ( $pos < $#{ $self->[DATA] } && lc $self->[DATA][ $pos + 2 ] eq lc $key )
  0         0  
292             {
293 0         0 $pos;
294             }
295 0         0 else { undef }
296             }
297              
298             =head2 find
299              
300             This is the central method for lookups, used by L and
301             C.
302              
303             It expects a key as its only argument.
304              
305             Upon success, the method returns an array index at which the corresponding value
306             can be found, or undef otherwise.
307              
308             =cut
309              
310             sub find {
311 0     0 1 0 my ( $self, $key ) = @_;
312 0         0 my $debug = $self->debug;
313 0         0 my ( $prefix, $pattern, $normkey ) = $self->normalize($key);
314 0 0       0 print STDERR <<_ if $debug;
315             --------------------------------------------------------------------------------
316             Key: <$key>
317             Prefix: <$prefix>
318             Pattern: <$pattern>
319             NormKey: <$normkey>
320             _
321 0 0       0 defined( my $pos = $self->pos($prefix) ) or return undef;
322 0         0 my $data = $self->[DATA];
323 0 0       0 print STDERR 'Starting search at entry #'
    0          
324             . ( $pos / 3 )
325             . (
326             $pos ? qq(; the key before that would be: "$data->[$pos-3]"\n) : ".\n" )
327             if $debug;
328 0         0 my $found;
329 0   0     0 do {
330 0 0       0 print STDERR 'Examining entry #'
331             . ( $pos / 3 )
332             . qq(: "$data->[$pos]"... )
333             if $debug;
334 0 0       0 if ( $data->[$pos] =~ $pattern ) {
335 0 0       0 if ( lc $data->[ $pos + 2 ] eq lc $key ) {
336 0 0       0 print STDERR "exact match.\n" if $debug;
337 0         0 return $pos + 1;
338             }
339 0 0 0     0 unless ( defined $found ) {
    0          
    0          
340 0         0 $found = $pos + 1;
341 0 0       0 print STDERR qq( matches, value: "$data->[$found]"\n)
342             if $debug;
343             }
344             elsif (
345             defined $data->[$found]
346             ? !defined $data->[ $pos + 1 ]
347             || $data->[ $pos + 1 ] ne $data->[$found]
348             : defined $data->[ $pos + 1 ]
349             )
350             {
351 0 0       0 print STDERR
352             qq( also matches, but has a different value: "$data->[$pos+1]"\n)
353             if $debug;
354 0         0 return;
355             }
356             }
357 0 0       0 else { print STDERR "does not match.\n" if $debug }
358             } while ( $pos += 3 ) < $#$data
359             && $prefix eq substr $data->[$pos], 0, length $prefix;
360 0 0       0 print STDERR $pos > $#$data ? "Last element reached.\n"
    0          
    0          
361             : qq("$data->[$pos]" has a different prefix.\n),
362             defined $found ? "Search was successful.\n"
363             : "Search was NOT successful.\n"
364             if $debug;
365 0         0 $found;
366             }
367              
368             =head2 normalize
369              
370             Given a key as the its only argument,
371             this method will return the normalized key in scalar
372             and a three element list in array context, consisting of
373              
374             =over 4
375              
376             =item 0.
377              
378             the L
379              
380             =item 1.
381              
382             the L and
383              
384             =item 2.
385              
386             the L.
387              
388             =back
389              
390             =cut
391              
392             sub normalize {
393 0     0 1 0 my ( $self, $key ) = @_;
394 0         0 my ( $exceptions, $stopwords ) = @{$self}[ EXCEPTIONS, STOPWORDS ];
  0         0  
395 0         0 local $_ = $key;
396 0         0 for my $pp ( $self->preprocess ) {
397 0 0       0 if ( ref $pp ) { &$pp }
  0         0  
398 0         0 else { eval $pp }
399             }
400             (
401 0 0       0 my $normkey =
    0          
402             join ' ',
403             map exists $exceptions->{ +lc }
404             ? defined $exceptions->{ +lc } ? $exceptions->{ +lc } : ()
405             : lc,
406             grep !exists $stopwords->{$_},
407             split /\s+|-/
408             ) =~ s/\W+(?=\s|-|$)//g;
409 0 0       0 return $normkey unless wantarray;
410 0         0 my ($prefix) = $normkey =~ /^([^\s-]*)/;
411 0         0 my $pattern = '^'
412             . join ( ' ', map quotemeta() . '\S*', split /\s+|-/, $normkey ) . '$';
413 0 0       0 $prefix, $] < 5.006 ? $pattern : eval 'qr/$pattern/', $normkey;
414             }
415              
416             =head2 pos
417              
418             expects an (usually L) key as (its only) argument
419             and returns the position at which this key is stored (if it exists)
420             or should be sorted (if it does not already exist).
421              
422             =cut
423              
424             sub pos {
425 0     0 1 0 my ( $self, $key ) = @_;
426 0         0 my $data = $self->[DATA];
427 0         0 my $a = 0;
428 0         0 my $b = @$data;
429 0   0     0 while ( $a < $b && $a < $#$data ) { # perform a binary search
430 0 0       0 if ( $data->[ my $c = 3 * int +( $a + $b >> 1 ) / 3 ] lt $key ) {
431 0         0 $a = $c + 3;
432             }
433 0         0 else { $b = $c }
434             }
435 0         0 $a;
436             }
437              
438             =head2 startover
439              
440             expects no arguments and simply resets the iterator for the hash,
441             so that the next call to L will return the first key/value
442             pair again.
443              
444             =cut
445              
446             sub startover {
447 1     1 1 3 my ($self) = @_;
448 1         99 $self->[I] = undef;
449             }
450              
451             =head1 BUGS
452              
453             None known so far.
454              
455             =head1 AUTHOR
456              
457             Martin H. Sluka
458             mailto:martin@sluka.de
459             http://martin.sluka.de/
460              
461             =head1 THANKS TO
462              
463             Dr. Hermann Schier from the Max Planck Institute for Solid State Research
464             in Stuttgart/Germany for initiating and underwriting the development of this
465             module and for contribution a lot of ideas.
466              
467             =head1 COPYRIGHT
468              
469             This program is free software; you can redistribute
470             it and/or modify it under the same terms as Perl itself.
471              
472             The full text of the license can be found in the
473             LICENSE file included with this module.
474              
475             =head1 SEE ALSO
476              
477             L
478              
479             =cut
480              
481             1