File Coverage

blib/lib/Tie/Hash/Abbrev/BibRefs.pm

Criterion	Covered	Total	%
statement	37	164	22.5
branch	2	92	2.1
condition	0	12	0.0
subroutine	12	29	41.3
pod	10	10	100.0
total	61	307	19.8

line	stmt	bran	cond	sub	pod	time	code
1							package Tie::Hash::Abbrev::BibRefs;
2
3							=head1 NAME
4
5							Tie::Hash::Abbrev::BibRefs - match bibliographic references to the original titles
6
7							=head1 SYNOPSIS
8
9							use Tie::Hash::Abbrev::BibRefs;
10
11							tie my %hash, 'Tie::Hash::Abbrev::BibRefs',
12							preprocess => sub { s/\s+[[:upper:]]:.*// },
13							stopwords => [ qw( a and de del der des di
14							et for f�r i if in la las
15							of on part Part Pt. Sect.
16							the to und ) ],
17							exceptions => { jpn => 'japan',
18							natl => 'national' };
19
20							$hash{'Physical Review B'} = '0163-1829';
21
22							print $hash{'Phys. Rev. B: Condens. Matter Mater. Phys.'};
23							# will print '0163-1829'
24
25							=head1 DESCRIPTION
26
27							This module is an attempt to ease the mapping of often abbreviated
28							bibliographical references to the original titles.
29
30							To achieve this, it simplyfies the title according to parameterizable rules and
31							stores it as a I.
32
33							When accessing the hash, the key given is also L
34							and compared to the normalized version of the original title.
35							In addition, each word (words are separated by whitespace) may be abbreviated by
36							specifying only the first few letters.
37
38							If more than one matching hash entry is found, the values of all matching
39							entries are compared; as long as they are all
40							Lual (or all L), the
41							lookup is still considered to be successful.
42
43							=head1 KEY NORMALIZATION
44
45							The process of normalization is implemented as follows:
46
47							=over 4
48
49							=item 1.
50
51							execute any preprocessing code (see L), which is
52							expected to operate on C<$_>.
53							You can use subroutine references or strings here; strings will be
54							L.
55
56							=item 2.
57
58							split the key into parts (at whitespace).
59
60							=item 3.
61
62							remove any parts contained in the list of stopwords
63							(see L).
64
65							=item 4.
66
67							replace any parts contained in the list of exceptions
68							by their corresponding value.
69							If the value is L, the entire part will be removed.
70							(In the L, "Jpn" would be replaced by "japan".)
71							This lookup is done case-insensitively.
72
73							=item 5.
74
75							remove any non-word characters at the end of each part or followed by a dash
76
77							=back
78
79							=cut
80
81	1			1		27468	use strict;
	1					3
	1					46
82	1			1		6	use vars '$VERSION';
	1					2
	1					61
83
84	1			1		7	use Carp 'croak';
	1					7
	1					93
85
86							$VERSION = 0.02;
87
88	1			1		7	use constant DATA => 0;
	1					2
	1					59
89	1			1		7	use constant I => 1;
	1					2
	1					59
90	1			1		6	use constant PREPROCESS => 2;
	1					1
	1					51
91	1			1		7	use constant STOPWORDS => 3;
	1					1
	1					84
92	1			1		7	use constant EXCEPTIONS => 4;
	1					2
	1					60
93	1			1		6	use constant DEBUG => 5;
	1					2
	1					2689
94
95							sub TIEHASH {
96	1	50		1		17	croak 'Odd number of arguments.' unless @_ & 1;
97	1					3	my $package = shift;
98	1	50				5	$package = ref $package if length ref $package;
99	1					4	my $self = bless [], $package;
100	1					7	$self->[DATA] = [];
101	1					4	while (@_) {
102	0					0	my ( $option, $argument ) = splice @_, 0, 2;
103	0	0				0	if ( $option eq 'debug' ) { $self->debug($argument) }
	0	0				0
		0
		0
104	0					0	elsif ( $option =~ /^exceptions?\z/ ) { $self->exceptions($argument) }
105	0					0	elsif ( $option eq 'preprocess' ) { $self->preprocess($argument) }
106							elsif ( $option =~ /^stopwords?\z/ ) {
107	0	0				0	$self->stopwords( ref $argument ? @$argument : $argument );
108							}
109	0					0	else { croak qq(Unknown TIEHASH option "$option"!) }
110							}
111	1					5	$self;
112							}
113
114							sub FETCH {
115	0			0		0	my ( $self, $key ) = @_;
116	0	0				0	if ( defined( my $found = $self->find($key) ) ) { $self->[DATA][$found] }
	0					0
117	0					0	else { undef }
118							}
119
120							sub STORE {
121	0			0		0	my ( $self, $key, $value ) = @_;
122	0	0				0	if (
123							defined $self->exact(
124							$key, my $pos = $self->pos( my $normkey = $self->normalize($key) )
125							)
126							)
127							{
128	0					0	$self->[DATA][ $pos + 1 ] = $value;
129							}
130	0					0	else { splice @{ $self->[DATA] }, $pos, 0, $normkey, $value, $key }
	0					0
131							}
132
133							sub EXISTS {
134	0			0		0	my ( $self, $key ) = @_;
135	0	0				0	if ( defined $self->find($key) ) { 1 }
	0					0
136	0					0	else { '' }
137							}
138
139							sub DELETE {
140	0			0		0	my ( $self, $key ) = @_;
141	0					0	my $pos = $self->pos( my $normkey = $self->normalize($key) );
142	0	0				0	if ( defined $self->exact( $key, $pos ) ) {
143	0					0	( undef, my $value ) = splice @{ $self->[DATA] }, $pos, 3;
	0					0
144	0					0	$self->startover;
145	0					0	$value;
146							}
147	0					0	else { undef }
148							}
149
150							sub CLEAR {
151	0			0		0	my ($self) = @_;
152	0					0	$self->startover;
153	0					0	@{ $self->[DATA] } = ();
	0					0
154							}
155
156							sub FIRSTKEY {
157	0			0		0	my ($self) = @_;
158	0	0				0	return undef unless @{ $self->[DATA] };
	0					0
159	0					0	$self->[ $self->[I] = 2 ];
160							}
161
162							sub NEXTKEY {
163	0			0		0	my ( $self, $lastkey ) = @_;
164	0	0				0	if ( ( my $i = $self->[I] += 3 ) <= $#{ $self->[DATA] } ) {
	0					0
165	0					0	$self->[DATA][$i];
166							}
167							else {
168	0					0	$self->startover;
169	0					0	undef;
170							}
171							}
172
173	0			0		0	sub UNTIE { }
174
175	1			1		853	sub DESTROY { shift->startover }
176
177							=head1 ADDITIONAL METHODS
178
179							=head2 debug
180
181							turn debug mode on (when given a true value as argument) or off
182							(when given a false value).
183							Returns the (possibly new) value.
184
185							In debug mode, the L method will print debug messages to STDERR.
186
187							=cut
188
189							sub debug {
190	0			0	1	0	my $self = shift;
191	0	0				0	$self->[DEBUG] = shift if @_;
192	0					0	$self->[DEBUG];
193							}
194
195							=head2 delete_abbrev
196
197							my @deleted = tied(%hash)->delete_abbrev('foo','bar');
198
199							Will delete all elements on the basis of all unambiguous abbreviations given as
200							arguments and return a (possibly empty) list of all deleted values.
201
202							=cut
203
204							sub delete_abbrev {
205	0			0	1	0	my $self = shift;
206	0					0	my @deleted;
207	0					0	for (@_) {
208							next
209							unless
210	0	0				0	defined( my $pos1 = $self->valid( $_, my $pos = $self->pos($_) ) );
211	0					0	my $i = 0;
212	0					0	push @deleted, grep $i++ & 1, splice @{ $self->[DATA] }, $pos,
	0					0
213							3 + $pos1 - $pos;
214							}
215	0	0				0	$self->startover if @deleted;
216	0					0	@deleted;
217							}
218
219							=head2 exceptions
220
221							get or set the exceptions table for the hash.
222							Expects hash references or L, which clears the table.
223							Returns a reference to the new exception table.
224
225							=cut
226
227							sub exceptions {
228	0			0	1	0	my $self = shift;
229	0					0	for (@_) {
230	0	0				0	if (defined) {
231	0					0	while ( my ( $k, $v ) = each %$_ ) {
232	0					0	$self->[EXCEPTIONS]{ lc $k } = lc $v;
233							}
234							}
235	0					0	else { $self->[EXCEPTIONS] = {} }
236							}
237	0	0				0	$self->[EXCEPTIONS] \|\| {};
238							}
239
240							=head2 preprocess
241
242							set up the preprocessing code chain for the hash.
243							Any code references or strings will be added to the chain,
244							an L will clear the chain.
245
246							=cut
247
248							sub preprocess {
249	0			0	1	0	my $self = shift;
250	0					0	for (@_) {
251	0	0				0	if (defined) { push @{ $self->[PREPROCESS] }, $_ }
	0					0
	0					0
252	0					0	else { @{ $self->[PREPROCESS] } = [] }
	0					0
253							}
254	0	0				0	@{ $self->[PREPROCESS] \|\| [] };
	0					0
255							}
256
257							=head2 stopwords
258
259							get or set the /stopwords for the hash.
260							Any arguments given will be added to the list of stopwords.
261							An L> as argument will clear the list of stopwords.
262							The method returns the new list of stopwords (in an unsorted manner).
263
264							=cut
265
266							sub stopwords {
267	0			0	1	0	my $self = shift;
268	0					0	for (@_) {
269	0	0				0	if (defined) { $self->[STOPWORDS]{$_} = undef }
	0					0
270	0					0	else { $self->[STOPWORDS] = {} }
271							}
272	0	0				0	keys %{ $self->[STOPWORDS] \|\| {} };
	0					0
273							}
274
275							=head1 INTERNAL METHODS
276
277							The following methods should usually not be called "from the outside";
278							the main intention of ducumenting them is that the author still wants to
279							understand his own module in case changes will be neccessary later. :o)
280
281							=head2 exact
282
283							expects a key as first and a L as second argument.
284							Returns the position if the given key equals (case-insensitively) the real key
285							stored at that position or undef if not.
286
287							=cut
288
289							sub exact {
290	0			0	1	0	my ( $self, $key, $pos ) = @_;
291	0	0	0			0	if ( $pos < $#{ $self->[DATA] } && lc $self->[DATA][ $pos + 2 ] eq lc $key )
	0					0
292							{
293	0					0	$pos;
294							}
295	0					0	else { undef }
296							}
297
298							=head2 find
299
300							This is the central method for lookups, used by L and
301							C.
302
303							It expects a key as its only argument.
304
305							Upon success, the method returns an array index at which the corresponding value
306							can be found, or undef otherwise.
307
308							=cut
309
310							sub find {
311	0			0	1	0	my ( $self, $key ) = @_;
312	0					0	my $debug = $self->debug;
313	0					0	my ( $prefix, $pattern, $normkey ) = $self->normalize($key);
314	0	0				0	print STDERR <<_ if $debug;
315							--------------------------------------------------------------------------------
316							Key: <$key>
317							Prefix: <$prefix>
318							Pattern: <$pattern>
319							NormKey: <$normkey>
320							_
321	0	0				0	defined( my $pos = $self->pos($prefix) ) or return undef;
322	0					0	my $data = $self->[DATA];
323	0	0				0	print STDERR 'Starting search at entry #'
		0
324							. ( $pos / 3 )
325							. (
326							$pos ? qq(; the key before that would be: "$data->[$pos-3]"\n) : ".\n" )
327							if $debug;
328	0					0	my $found;
329	0		0			0	do {
330	0	0				0	print STDERR 'Examining entry #'
331							. ( $pos / 3 )
332							. qq(: "$data->[$pos]"... )
333							if $debug;
334	0	0				0	if ( $data->[$pos] =~ $pattern ) {
335	0	0				0	if ( lc $data->[ $pos + 2 ] eq lc $key ) {
336	0	0				0	print STDERR "exact match.\n" if $debug;
337	0					0	return $pos + 1;
338							}
339	0	0	0			0	unless ( defined $found ) {
		0
		0
340	0					0	$found = $pos + 1;
341	0	0				0	print STDERR qq( matches, value: "$data->[$found]"\n)
342							if $debug;
343							}
344							elsif (
345							defined $data->[$found]
346							? !defined $data->[ $pos + 1 ]
347							\|\| $data->[ $pos + 1 ] ne $data->[$found]
348							: defined $data->[ $pos + 1 ]
349							)
350							{
351	0	0				0	print STDERR
352							qq( also matches, but has a different value: "$data->[$pos+1]"\n)
353							if $debug;
354	0					0	return;
355							}
356							}
357	0	0				0	else { print STDERR "does not match.\n" if $debug }
358							} while ( $pos += 3 ) < $#$data
359							&& $prefix eq substr $data->[$pos], 0, length $prefix;
360	0	0				0	print STDERR $pos > $#$data ? "Last element reached.\n"
		0
		0
361							: qq("$data->[$pos]" has a different prefix.\n),
362							defined $found ? "Search was successful.\n"
363							: "Search was NOT successful.\n"
364							if $debug;
365	0					0	$found;
366							}
367
368							=head2 normalize
369
370							Given a key as the its only argument,
371							this method will return the normalized key in scalar
372							and a three element list in array context, consisting of
373
374							=over 4
375
376							=item 0.
377
378							the L
379
380							=item 1.
381
382							the L and
383
384							=item 2.
385
386							the L.
387
388							=back
389
390							=cut
391
392							sub normalize {
393	0			0	1	0	my ( $self, $key ) = @_;
394	0					0	my ( $exceptions, $stopwords ) = @{$self}[ EXCEPTIONS, STOPWORDS ];
	0					0
395	0					0	local $_ = $key;
396	0					0	for my $pp ( $self->preprocess ) {
397	0	0				0	if ( ref $pp ) { &$pp }
	0					0
398	0					0	else { eval $pp }
399							}
400							(
401	0	0				0	my $normkey =
		0
402							join ' ',
403							map exists $exceptions->{ +lc }
404							? defined $exceptions->{ +lc } ? $exceptions->{ +lc } : ()
405							: lc,
406							grep !exists $stopwords->{$_},
407							split /\s+\|-/
408							) =~ s/\W+(?=\s\|-\|$)//g;
409	0	0				0	return $normkey unless wantarray;
410	0					0	my ($prefix) = $normkey =~ /^([^\s-]*)/;
411	0					0	my $pattern = '^'
412							. join ( ' ', map quotemeta() . '\S*', split /\s+\|-/, $normkey ) . '$';
413	0	0				0	$prefix, $] < 5.006 ? $pattern : eval 'qr/$pattern/', $normkey;
414							}
415
416							=head2 pos
417
418							expects an (usually L) key as (its only) argument
419							and returns the position at which this key is stored (if it exists)
420							or should be sorted (if it does not already exist).
421
422							=cut
423
424							sub pos {
425	0			0	1	0	my ( $self, $key ) = @_;
426	0					0	my $data = $self->[DATA];
427	0					0	my $a = 0;
428	0					0	my $b = @$data;
429	0		0			0	while ( $a < $b && $a < $#$data ) { # perform a binary search
430	0	0				0	if ( $data->[ my $c = 3 * int +( $a + $b >> 1 ) / 3 ] lt $key ) {
431	0					0	$a = $c + 3;
432							}
433	0					0	else { $b = $c }
434							}
435	0					0	$a;
436							}
437
438							=head2 startover
439
440							expects no arguments and simply resets the iterator for the hash,
441							so that the next call to L will return the first key/value
442							pair again.
443
444							=cut
445
446							sub startover {
447	1			1	1	3	my ($self) = @_;
448	1					99	$self->[I] = undef;
449							}
450
451							=head1 BUGS
452
453							None known so far.
454
455							=head1 AUTHOR
456
457							Martin H. Sluka
458							mailto:martin@sluka.de
459							http://martin.sluka.de/
460
461							=head1 THANKS TO
462
463							Dr. Hermann Schier from the Max Planck Institute for Solid State Research
464							in Stuttgart/Germany for initiating and underwriting the development of this
465							module and for contribution a lot of ideas.
466
467							=head1 COPYRIGHT
468
469							This program is free software; you can redistribute
470							it and/or modify it under the same terms as Perl itself.
471
472							The full text of the license can be found in the
473							LICENSE file included with this module.
474
475							=head1 SEE ALSO
476
477							L
478
479							=cut
480
481							1