File Coverage

blib/lib/Text/BibTeX/Name.pm

Criterion	Covered	Total	%
statement	38	38	100.0
branch	17	18	94.4
condition	4	6	66.6
subroutine	9	9	100.0
pod	4	4	100.0
total	72	75	96.0

line	stmt	bran	cond	sub	pod	time	code
1							# ----------------------------------------------------------------------
2							# NAME : BibTeX/Name.pm
3							# CLASSES : Text::BibTeX::Name
4							# RELATIONS :
5							# DESCRIPTION: Provides an object-oriented interface to the BibTeX-
6							# style author names (parsing them, that is; formatting
7							# them is done by the Text::BibTeX::NameFormat class).
8							# CREATED : Nov 1997, Greg Ward
9							# MODIFIED :
10							# VERSION : $Id$
11							# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
12							# reserved.
13							#
14							# This file is part of the Text::BibTeX library. This
15							# library is free software; you may redistribute it and/or
16							# modify it under the same terms as Perl itself.
17							# ----------------------------------------------------------------------
18
19							package Text::BibTeX::Name;
20
21							require 5.004;
22
23	13			13		218	use strict;
	13					74
	13					381
24	13			13		67	use Carp;
	13					24
	13					744
25	13			13		73	use vars qw'$VERSION';
	13					38
	13					729
26							$VERSION = 0.88;
27
28	13			13		82	use Text::BibTeX;
	13					24
	13					7581
29
30							=encoding UTF-8
31
32							=head1 NAME
33
34							Text::BibTeX::Name - interface to BibTeX-style author names
35
36							=head1 SYNOPSIS
37
38							use Text::BibTeX::Name;
39
40							$name = Text::BibTeX::Name->new();
41							$name->split('J. Random Hacker');
42							# or:
43							$name = Text::BibTeX::Name->new('J. Random Hacker');
44
45							@firstname_tokens = $name->part ('first');
46							$lastname = join (' ', $name->part ('last'));
47
48							$format = Text::BibTeX::NameFormat->new();
49							# ...customize $format...
50							$formatted = $name->format ($format);
51
52							=head1 DESCRIPTION
53
54							C<Text::BibTeX::Name> provides an abstraction for BibTeX-style names and
55							some basic operations on them. A name, in the BibTeX world, consists of
56							a list of I<tokens> which are divided amongst four I<parts>: `first',
57							`von', `last', and `jr'.
58
59							Tokens are separated by whitespace or commas at brace-level zero. Thus
60							the name
61
62							van der Graaf, Horace Q.
63
64							has five tokens, whereas the name
65
66							{Foo, Bar, and Sons}
67
68							consists of a single token. Skip down to L<"EXAMPLES"> for more examples, or
69							read on if you want to know the exact details of how names are split into
70							tokens and parts.
71
72							How tokens are divided into parts depends on the form of the name. If
73							the name has no commas at brace-level zero (as in the second example),
74							then it is assumed to be in either "first last" or "first von last"
75							form. If there are no tokens that start with a lower-case letter, then
76							"first last" form is assumed: the final token is the last name, and all
77							other tokens form the first name. Otherwise, the earliest contiguous
78							sequence of tokens with initial lower-case letters is taken as the `von'
79							part; if this sequence includes the final token, then a warning is
80							printed and the final token is forced to be the `last' part.
81
82							If a name has a single comma, then it is assumed to be in "von last,
83							first" form. A leading sequence of tokens with initial lower-case
84							letters, if any, forms the `von' part; tokens between the `von' and the
85							comma form the `last' part; tokens following the comma form the `first'
86							part. Again, if there are no tokens following a leading sequence of
87							lowercase tokens, a warning is printed and the token immediately
88							preceding the comma is taken to be the `last' part.
89
90							If a name has more than two commas, a warning is printed and the name is
91							treated as though only the first two commas were present.
92
93							Finally, if a name has two commas, it is assumed to be in "von last, jr,
94							first" form. (This is the only way to represent a name with a `jr'
95							part.) The parsing of the name is the same as for a one-comma name,
96							except that tokens between the two commas are taken to be the `jr' part.
97
98							=head1 CAVEAT
99
100							The C code that does the actual work of splitting up names takes a shortcut
101							and makes a few assumptions about whitespace. In particular, there must be
102							no leading whitespace, no trailing whitespace, no consecutive whitespace
103							characters in the string, and no whitespace characters other than space.
104							In other words, all whitespace must consist of lone internal spaces.
105
106							=head1 EXAMPLES
107
108							The strings C<"John Smith"> and C<"Smith, John"> are different
109							representations of the same name, so split into parts and tokens the
110							same way, namely as:
111
112							first => ('John')
113							von => ()
114							last => ('Smith')
115							jr => ()
116
117							Note that every part is a list of tokens, even if there is only one
118							token in that part; empty parts get empty token lists. Every token is
119							just a string. Writing this example in actual code is simple:
120
121							$name = Text::BibTeX::Name->new("John Smith"); # or "Smith, John"
122							$name->part ('first'); # returns list ("John")
123							$name->part ('last'); # returns list ("Smith")
124							$name->part ('von'); # returns list ()
125							$name->part ('jr'); # returns list ()
126
127							(We'll omit the empty parts in the rest of the examples: just assume
128							that any unmentioned part is an empty list.) If more than two tokens
129							are included and there's no comma, they'll go to the first name: thus
130							C<"John Q. Smith"> splits into
131
132							first => ("John", "Q.")
133							last => ("Smith")
134
135							and C<"J. R. R. Tolkien"> into
136
137							first => ("J.", "R.", "R.")
138							last => ("Tolkien")
139
140							The ambiguous name C<"Kevin Philips Bong"> splits into
141
142							first => ("Kevin", "Philips")
143							last => ("Bong")
144
145							which may or may not be the right thing, depending on the particular
146							person. There's no way to know though, so if this fellow's last name is
147							"Philips Bong" and not "Bong", the string representation of his name
148							must disambiguate. One possibility is C<"Philips Bong, Kevin"> which
149							splits into
150
151							first => ("Kevin")
152							last => ("Philips", "Bong")
153
154							Alternately, C<"Kevin {Philips Bong}"> takes advantage of the fact that
155							tokens are only split on whitespace I<at brace-level zero>, and becomes
156
157							first => ("Kevin")
158							last => ("{Philips Bong}")
159
160							which is fine if your names are destined to be processed by TeX, but
161							might be problematic in other contexts. Similarly, C<"St John-Mollusc,
162							Oliver"> becomes
163
164							first => ("Oliver")
165							last => ("St", "John-Mollusc")
166
167							which can also be written as C<"Oliver {St John-Mollusc}">:
168
169							first => ("Oliver")
170							last => ("{St John-Mollusc}")
171
172							Since tokens are separated purely by whitespace, hyphenated names will
173							work either way: both C<"Nigel Incubator-Jones"> and C<"Incubator-Jones,
174							Nigel"> come out as
175
176							first => ("Nigel")
177							last => ("Incubator-Jones")
178
179							Multi-token last names with lowercase components -- the "von part" --
180							work fine: both C<"Ludwig van Beethoven"> and C<"van Beethoven, Ludwig">
181							parse (correctly) into
182
183							first => ("Ludwig")
184							von => ("van")
185							last => ("Beethoven")
186
187							This allows these European aristocratic names to sort properly,
188							i.e. I<van Beethoven> under I<B> rather than I<v>. Speaking of
189							aristocratic European names, C<"Charles Louis Xavier Joseph de la
190							Vall{\'e}e Poussin"> is handled just fine, and splits into
191
192							first => ("Charles", "Louis", "Xavier", "Joseph")
193							von => ("de", "la")
194							last => ("Vall{\'e}e", "Poussin")
195
196							so could be sorted under I<V> rather than I<d>. (Note that the sorting
197							algorithm in L<Text::BibTeX::BibSort> is a slavish imitation of BibTeX
198							0.99, and therefore does the wrong thing with these names: the sort key
199							starts with the "von" part.)
200
201							However, capitalized "von parts" don't work so well: C<"R. J. Van de
202							Graaff"> splits into
203
204							first => ("R.", "J.", "Van")
205							von => ("de")
206							last => ("Graaff")
207
208							which is clearly wrong. This name should be represented as C<"Van de
209							Graaff, R. J.">
210
211							first => ("R.", "J.")
212							last => ("Van", "de", "Graaff")
213
214							which is probably right. (This particular Van de Graaff was an
215							American, so he probably belongs under I<V> -- which is where my
216							(British) dictionary puts him. Other Van de Graaff's mileages may
217							vary.)
218
219							Finally, many names include a suffix: "Jr.", "III", "fils", and so
220							forth. These are handled, but with some limitations. If there's a
221							comma before the suffix (the usual U.S. convention for "Jr."), then the
222							name should be in I<von last, jr, first> form, e.g. C<"Doe, Jr., John">
223							comes out (correctly) as
224
225							first => ("John")
226							last => ("Doe")
227							jr => ("Jr.")
228
229							but C<"John Doe, Jr."> is ambiguous and is parsed as
230
231							first => ("Jr.")
232							last => ("John", "Doe")
233
234							(so don't do it that way). If there's no comma before the suffix -- the
235							usual for Roman numerals, and occasionally seen with "Jr." -- then
236							you're stuck and have to make the suffix part of the last name. Thus,
237							C<"Gates III, William H."> comes out
238
239							first => ("William", "H.")
240							last => ("Gates", "III")
241
242							but C<"William H. Gates III"> is ambiguous, and becomes
243
244							first => ("William", "H.", "Gates")
245							last => ("III")
246
247							-- not what you want. Again, the curly-brace trick comes in handy, so
248							C<"William H. {Gates III}"> splits into
249
250							first => ("William", "H.")
251							last => ("{Gates III}")
252
253							There is no way to make a comma-less suffix the C<jr> part. (This is an
254							unfortunate consequence of slavishly imitating BibTeX 0.99.)
255
256							Finally, names that aren't really names of people but rather are
257							organization or company names should be forced into a single token by
258							wrapping them in curly braces. For example, "Foo, Bar and Sons" should
259							be written C<"{Foo, Bar and Sons}">, which will split as
260
261							last => ("{Foo, Bar and Sons}")
262
263							Of course, if this is one name in a BibTeX C<author> or C<editor>
264							list, this name has to be wrapped in braces anyways (because of the C<"
265							and ">), but that's another story.
266
267							=head1 FORMATTING NAMES
268
269							Putting a split-up name back together again in a flexible, customizable
270							way is the job of another module: see L<Text::BibTeX::NameFormat>.
271
272							=head1 METHODS
273
274							=over 4
275
276							=item new([ [OPTS,] NAME [, FILENAME, LINE, NAME_NUM]])
277
278							Creates a new C<Text::BibTeX::Name> object. If NAME is supplied, it
279							must be a string containing a single name, and it will be passed to
280							the C<split> method for further processing. FILENAME, LINE, and
281							NAME_NUM, if present, are all also passed to C<split> to allow better
282							error messages.
283
284							If the first argument is a hash reference, it is used to define
285							configuration values. At the moment the available values are:
286
287							=over 4
288
289							=item BINMODE
290
291							Set the way Text::BibTeX deals with strings. By default it manages
292							strings as bytes. You can set BINMODE to 'utf-8' to get NFC normalized
293							UTF-8 strings and you can customise the normalization with the NORMALIZATION option.
294
295							Text::BibTeX::Name->new(
296							{ binmode => 'utf-8', normalization => 'NFD' },
297							"Alberto Simões");
298
299							=back
300
301							=cut
302
303							sub new {
304	52			52	1	37794	my $class = shift;
305	52	100				137	my $opts = ref $_[0] eq 'HASH' ? shift : {};
306
307	52					242	$opts->{ lc $_ } = $opts->{$_} for ( keys %$opts );
308
309	52					124	my ( $name, $filename, $line, $name_num ) = @_;
310
311	52		33			193	$class = ref($class) \|\| $class;
312	52					136	my $self = bless { }, $class;
313
314	52					126	$self->{binmode} = 'bytes';
315	52					76	$self->{normalization} = 'NFC';
316							$self->{binmode} = 'utf-8'
317	52	100	100			251	if exists $opts->{binmode} && $opts->{binmode} =~ /utf-?8/i;
318	52	100				126	$self->{normalization} = $opts->{normalization} if exists $opts->{normalization};
319
320	52	100				179	$self->split( Text::BibTeX->_process_argument($name, $self->{binmode}, $self->{normalization}),
321							$filename, $line, $name_num, 1 )
322							if ( defined $name );
323	52					202	$self;
324							}
325
326
327							sub DESTROY
328							{
329	52			52		3227	my $self = shift;
330	52					533	$self->free; # free the C structure kept by `split'
331							}
332
333
334							=item split (NAME [, FILENAME, LINE, NAME_NUM])
335
336							Splits NAME (a string containing a single name) into tokens and
337							subsequently into the four parts of a BibTeX-style name (first, von,
338							last, and jr). (Each part is a list of tokens, and tokens are separated
339							by whitespace or commas at brace-depth zero. See above for full details
340							on how a name is split into its component parts.)
341
342							The token-lists that make up each part of the name are then stored in
343							the C<Text::BibTeX::Name> object for later retrieval or formatting with
344							the C<part> and C<format> methods.
345
346							=cut
347
348							sub split
349							{
350	54			54	1	846	my ($self, $name, $filename, $line, $name_num) = @_;
351
352							# Call the XSUB with default values if necessary
353	54	100				132	$self->_split (Text::BibTeX->_process_argument($name, $self->{binmode}, $self->{normalization}), $filename,
		100
354							defined $line ? $line : -1,
355							defined $name_num ? $name_num : -1,
356							1);
357							}
358
359
360							=item part (PARTNAME)
361
362							Returns the list of tokens in part PARTNAME of a name previously split with
363							C<split>. For example, suppose a C<Text::BibTeX::Name> object is created and
364							initialized like this:
365
366							$name = Text::BibTeX::Name->new();
367							$name->split ('Charles Louis Xavier Joseph de la Vall{\'e}e Poussin');
368
369							Then this code:
370
371							$name->part ('von');
372
373							would return the list C<('de','la')>.
374
375							=cut
376
377							sub part {
378	33			33	1	1476	my ( $self, $partname ) = @_;
379
380	33	50				141	croak "unknown name part"
381							unless $partname =~ /^(first\|von\|last\|jr)$/;
382
383	33	100				79	if ( exists $self->{$partname} ) {
384	33					187	my @x = map { Text::BibTeX->_process_result($_, $self->{binmode}, $self->{normalization}) }
385	24					34	@{ $self->{$partname} };
	24					52
386	24	100				435	return @x > 1 ? @x : $x[0];
387							}
388	9					21	return undef;
389							}
390
391
392							=item format (FORMAT)
393
394							Formats a name according to the specifications encoded in FORMAT, which
395							should be a C<Text::BibTeX::NameFormat> (or descendant) object. (In short,
396							it must supply a method C<apply> which takes a C<Text::BibTeX::Name>
397							object as its only argument.) Returns the formatted name as a string.
398
399							See L<Text::BibTeX::NameFormat> for full details on formatting names.
400
401							=cut
402
403							sub format
404							{
405	23			23	1	44	my ($self, $format) = @_;
406
407	23					53	$format->apply ($self);
408							}
409
410							1;
411
412							=back
413
414							=head1 SEE ALSO
415
416							L<Text::BibTeX::Entry>, L<Text::BibTeX::NameFormat>, L<bt_split_names>.
417
418							=head1 AUTHOR
419
420							Greg Ward
421
422							=head1 COPYRIGHT
423
424							Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
425							is part of the Text::BibTeX library. This library is free software; you
426							may redistribute it and/or modify it under the same terms as Perl itself.