File Coverage

blib/lib/Text/Normalize/NACO.pm

Criterion	Covered	Total	%
statement	41	41	100.0
branch	8	8	100.0
condition	4	4	100.0
subroutine	8	8	100.0
pod	4	4	100.0
total	65	65	100.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Normalize::NACO;
2
3							=head1 NAME
4
5							Text::Normalize::NACO - Normalize text based on the NACO rules
6
7							=head1 SYNOPSIS
8
9							# exported method
10							use Text::Normalize::NACO qw( naco_normalize );
11
12							$normalized = naco_normalize( $original );
13
14							# as an object
15							$naco = Text::Normalize::NACO->new;
16							$normalized = $naco->normalize( $original );
17
18							# normalize to lowercase
19							$naco->case( 'lower' );
20							$normalized = $naco->normalize( $original );
21
22							=head1 DESCRIPTION
23
24							In general, normalization is defined as:
25
26							To make (a text or language) regular and consistent, especially with respect to spelling or style.
27
28							It is commonly used for comparative purposes. These particular normalization rules have been set out by the
29							Name Authority Cooperative. The rules are described in detail at: http://www.loc.gov/catdir/pcc/naco/normrule.html
30
31							=head1 INSTALLATION
32
33							perl Makefile.PL
34							make
35							make test
36							make install
37
38							=cut
39
40	3			3		141066	use base qw( Exporter );
	3					7
	3					360
41
42	3			3		16	use strict;
	3					7
	3					108
43	3			3		17	use warnings;
	3					12
	3					99
44
45	3			3		3027	use Text::Unidecode;
	3					13552
	3					1660
46
47							our $VERSION = '0.13';
48
49							our @EXPORT_OK = qw( naco_normalize );
50
51							=head1 METHODS
52
53							=head2 new( %options )
54
55							Creates a new Text::Normalize::NACO object. You explicitly request
56							strings to be normalized in upper or lower-case by setting
57							the "case" option (defaults to "upper").
58
59							my $naco = Text::Normalize::NACO->new( case => 'lower' );
60
61							=cut
62
63							sub new {
64	3			3	1	88	my $class = shift;
65	3					11	my %options = @_;
66	3					12	my $self = bless {}, $class;
67
68	3		100			33	$self->case( $options{ case } \|\| 'upper' );
69
70	3					10	return $self;
71							}
72
73							=head2 case( $case )
74
75							Accessor/Mutator for the case in which the string should be returned.
76
77							# lower-case
78							$naco->case( 'lower' );
79
80							# upper-case
81							$naco->case( 'upper' );
82
83							=cut
84
85							sub case {
86	107			107	1	160	my $self = shift;
87	107					137	my ( $case ) = @_;
88
89	107	100				265	$self->{ _CASE } = $case if @_;
90
91	107					363	return $self->{ _CASE };
92							}
93
94							=head2 naco_normalize( $text, { %options } )
95
96							Exported version of C. You can specify any extra
97							options by passing a hashref after the string to be normalized.
98
99							my $normalized = naco_normalize( $original, { case => 'lower' } );
100
101							=cut
102
103							sub naco_normalize {
104	2			2	1	513	my $text = shift;
105	2					3	my $options = shift;
106	2		100			13	my $case = $options->{ case } \|\| 'upper';
107
108	2					6	my $normalized = normalize( undef, $text );
109
110	2	100				6	if ( $case eq 'lower' ) {
111	1					3	$normalized =~ tr/A-Z/a-z/;
112							}
113							else {
114	1					3	$normalized =~ tr/a-z/A-Z/;
115							}
116
117	2					12	return $normalized;
118							}
119
120							=head2 normalize( $text )
121
122							Normalizes $text and returns the new string.
123
124							my $normalized = $naco->normalize( $original );
125
126							=cut
127
128							sub normalize {
129	105			105	1	59459	my $self = shift;
130	105					168	my $data = shift;
131
132							# Rules taken from NACO Normalization
133							# http://lcweb.loc.gov/catdir/pcc/naco/normrule.html
134
135							# Remove diacritical marks and convert special chars
136	105					273	unidecode( $data );
137
138							# Convert special chars to spaces
139	105					4356	$data =~ s/[\Q!(){}<>-;:.?,\/\\@*%=\$^_~\E]/ /g;
140
141							# Delete special chars
142	105					195	$data =~ s/[\Q'[]\|\E]//g;
143
144							# Convert lowercase to uppercase or vice-versa.
145	105	100				304	if ( $self ) {
146	103	100				213	if ( $self->case eq 'lower' ) {
147	102					239	$data =~ tr/A-Z/a-z/;
148							}
149							else {
150	1					3	$data =~ tr/a-z/A-Z/;
151							}
152							}
153
154							# Remove leading and trailing spaces
155	105					1002	$data =~ s/^\s+\|\s+$//g;
156
157							# Condense multiple spaces
158	105					607	$data =~ s/\s+/ /g;
159
160	105					538	return $data;
161							}
162
163							=head1 SEE ALSO
164
165							=over 4
166
167							=item * http://www.loc.gov/catdir/pcc/naco/normrule.html
168
169							=back
170
171							=head1 AUTHOR
172
173							Brian Cassidy Ebricas@cpan.orgE
174
175							=head1 COPYRIGHT AND LICENSE
176
177							Copyright 2004-2009 by Brian Cassidy
178
179							This library is free software; you can redistribute it and/or modify
180							it under the same terms as Perl itself.
181
182							=cut
183
184							1;