File Coverage

blib/lib/Text/BibTeX/Name.pm
Criterion Covered Total %
statement 38 38 100.0
branch 17 18 94.4
condition 4 6 66.6
subroutine 9 9 100.0
pod 4 4 100.0
total 72 75 96.0


line stmt bran cond sub pod time code
1             # ----------------------------------------------------------------------
2             # NAME : BibTeX/Name.pm
3             # CLASSES : Text::BibTeX::Name
4             # RELATIONS :
5             # DESCRIPTION: Provides an object-oriented interface to the BibTeX-
6             # style author names (parsing them, that is; formatting
7             # them is done by the Text::BibTeX::NameFormat class).
8             # CREATED : Nov 1997, Greg Ward
9             # MODIFIED :
10             # VERSION : $Id$
11             # COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
12             # reserved.
13             #
14             # This file is part of the Text::BibTeX library. This
15             # library is free software; you may redistribute it and/or
16             # modify it under the same terms as Perl itself.
17             # ----------------------------------------------------------------------
18              
19             package Text::BibTeX::Name;
20              
21             require 5.004;
22              
23 13     13   219 use strict;
  13         29  
  13         401  
24 13     13   69 use Carp;
  13         23  
  13         815  
25 13     13   73 use vars qw'$VERSION';
  13         24  
  13         695  
26             $VERSION = 0.88;
27              
28 13     13   73 use Text::BibTeX;
  13         36  
  13         7315  
29              
30             =encoding UTF-8
31              
32             =head1 NAME
33              
34             Text::BibTeX::Name - interface to BibTeX-style author names
35              
36             =head1 SYNOPSIS
37              
38             use Text::BibTeX::Name;
39              
40             $name = Text::BibTeX::Name->new();
41             $name->split('J. Random Hacker');
42             # or:
43             $name = Text::BibTeX::Name->new('J. Random Hacker');
44              
45             @firstname_tokens = $name->part ('first');
46             $lastname = join (' ', $name->part ('last'));
47              
48             $format = Text::BibTeX::NameFormat->new();
49             # ...customize $format...
50             $formatted = $name->format ($format);
51              
52             =head1 DESCRIPTION
53              
54             C<Text::BibTeX::Name> provides an abstraction for BibTeX-style names and
55             some basic operations on them. A name, in the BibTeX world, consists of
56             a list of I<tokens> which are divided amongst four I<parts>: `first',
57             `von', `last', and `jr'.
58              
59             Tokens are separated by whitespace or commas at brace-level zero. Thus
60             the name
61              
62             van der Graaf, Horace Q.
63              
64             has five tokens, whereas the name
65              
66             {Foo, Bar, and Sons}
67              
68             consists of a single token. Skip down to L<"EXAMPLES"> for more examples, or
69             read on if you want to know the exact details of how names are split into
70             tokens and parts.
71              
72             How tokens are divided into parts depends on the form of the name. If
73             the name has no commas at brace-level zero (as in the second example),
74             then it is assumed to be in either "first last" or "first von last"
75             form. If there are no tokens that start with a lower-case letter, then
76             "first last" form is assumed: the final token is the last name, and all
77             other tokens form the first name. Otherwise, the earliest contiguous
78             sequence of tokens with initial lower-case letters is taken as the `von'
79             part; if this sequence includes the final token, then a warning is
80             printed and the final token is forced to be the `last' part.
81              
82             If a name has a single comma, then it is assumed to be in "von last,
83             first" form. A leading sequence of tokens with initial lower-case
84             letters, if any, forms the `von' part; tokens between the `von' and the
85             comma form the `last' part; tokens following the comma form the `first'
86             part. Again, if there are no tokens following a leading sequence of
87             lowercase tokens, a warning is printed and the token immediately
88             preceding the comma is taken to be the `last' part.
89              
90             If a name has more than two commas, a warning is printed and the name is
91             treated as though only the first two commas were present.
92              
93             Finally, if a name has two commas, it is assumed to be in "von last, jr,
94             first" form. (This is the only way to represent a name with a `jr'
95             part.) The parsing of the name is the same as for a one-comma name,
96             except that tokens between the two commas are taken to be the `jr' part.
97              
98             =head1 CAVEAT
99              
100             The C code that does the actual work of splitting up names takes a shortcut
101             and makes a few assumptions about whitespace. In particular, there must be
102             no leading whitespace, no trailing whitespace, no consecutive whitespace
103             characters in the string, and no whitespace characters other than space.
104             In other words, all whitespace must consist of lone internal spaces.
105              
106             =head1 EXAMPLES
107              
108             The strings C<"John Smith"> and C<"Smith, John"> are different
109             representations of the same name, so split into parts and tokens the
110             same way, namely as:
111              
112             first => ('John')
113             von => ()
114             last => ('Smith')
115             jr => ()
116              
117             Note that every part is a list of tokens, even if there is only one
118             token in that part; empty parts get empty token lists. Every token is
119             just a string. Writing this example in actual code is simple:
120              
121             $name = Text::BibTeX::Name->new("John Smith"); # or "Smith, John"
122             $name->part ('first'); # returns list ("John")
123             $name->part ('last'); # returns list ("Smith")
124             $name->part ('von'); # returns list ()
125             $name->part ('jr'); # returns list ()
126              
127             (We'll omit the empty parts in the rest of the examples: just assume
128             that any unmentioned part is an empty list.) If more than two tokens
129             are included and there's no comma, they'll go to the first name: thus
130             C<"John Q. Smith"> splits into
131              
132             first => ("John", "Q.")
133             last => ("Smith")
134              
135             and C<"J. R. R. Tolkien"> into
136              
137             first => ("J.", "R.", "R.")
138             last => ("Tolkien")
139              
140             The ambiguous name C<"Kevin Philips Bong"> splits into
141              
142             first => ("Kevin", "Philips")
143             last => ("Bong")
144              
145             which may or may not be the right thing, depending on the particular
146             person. There's no way to know though, so if this fellow's last name is
147             "Philips Bong" and not "Bong", the string representation of his name
148             must disambiguate. One possibility is C<"Philips Bong, Kevin"> which
149             splits into
150              
151             first => ("Kevin")
152             last => ("Philips", "Bong")
153              
154             Alternately, C<"Kevin {Philips Bong}"> takes advantage of the fact that
155             tokens are only split on whitespace I<at brace-level zero>, and becomes
156              
157             first => ("Kevin")
158             last => ("{Philips Bong}")
159              
160             which is fine if your names are destined to be processed by TeX, but
161             might be problematic in other contexts. Similarly, C<"St John-Mollusc,
162             Oliver"> becomes
163              
164             first => ("Oliver")
165             last => ("St", "John-Mollusc")
166              
167             which can also be written as C<"Oliver {St John-Mollusc}">:
168              
169             first => ("Oliver")
170             last => ("{St John-Mollusc}")
171              
172             Since tokens are separated purely by whitespace, hyphenated names will
173             work either way: both C<"Nigel Incubator-Jones"> and C<"Incubator-Jones,
174             Nigel"> come out as
175              
176             first => ("Nigel")
177             last => ("Incubator-Jones")
178              
179             Multi-token last names with lowercase components -- the "von part" --
180             work fine: both C<"Ludwig van Beethoven"> and C<"van Beethoven, Ludwig">
181             parse (correctly) into
182              
183             first => ("Ludwig")
184             von => ("van")
185             last => ("Beethoven")
186              
187             This allows these European aristocratic names to sort properly,
188             i.e. I<van Beethoven> under I<B> rather than I<v>. Speaking of
189             aristocratic European names, C<"Charles Louis Xavier Joseph de la
190             Vall{\'e}e Poussin"> is handled just fine, and splits into
191              
192             first => ("Charles", "Louis", "Xavier", "Joseph")
193             von => ("de", "la")
194             last => ("Vall{\'e}e", "Poussin")
195              
196             so could be sorted under I<V> rather than I<d>. (Note that the sorting
197             algorithm in L<Text::BibTeX::BibSort> is a slavish imitation of BibTeX
198             0.99, and therefore does the wrong thing with these names: the sort key
199             starts with the "von" part.)
200              
201             However, capitalized "von parts" don't work so well: C<"R. J. Van de
202             Graaff"> splits into
203              
204             first => ("R.", "J.", "Van")
205             von => ("de")
206             last => ("Graaff")
207              
208             which is clearly wrong. This name should be represented as C<"Van de
209             Graaff, R. J.">
210              
211             first => ("R.", "J.")
212             last => ("Van", "de", "Graaff")
213              
214             which is probably right. (This particular Van de Graaff was an
215             American, so he probably belongs under I<V> -- which is where my
216             (British) dictionary puts him. Other Van de Graaff's mileages may
217             vary.)
218              
219             Finally, many names include a suffix: "Jr.", "III", "fils", and so
220             forth. These are handled, but with some limitations. If there's a
221             comma before the suffix (the usual U.S. convention for "Jr."), then the
222             name should be in I<last, jr, first> form, e.g. C<"Doe, Jr., John">
223             comes out (correctly) as
224              
225             first => ("John")
226             last => ("Doe")
227             jr => ("Jr.")
228              
229             but C<"John Doe, Jr."> is ambiguous and is parsed as
230              
231             first => ("Jr.")
232             last => ("John", "Doe")
233              
234             (so don't do it that way). If there's no comma before the suffix -- the
235             usual convention for Roman numerals, and occasionally seen with "Jr." -- then
236             you're stuck and have to make the suffix part of the last name. Thus,
237             C<"Gates III, William H."> comes out
238              
239             first => ("William", "H.")
240             last => ("Gates", "III")
241              
242             but C<"William H. Gates III"> is ambiguous, and becomes
243              
244             first => ("William", "H.", "Gates")
245             last => ("III")
246              
247             -- not what you want. Again, the curly-brace trick comes in handy, so
248             C<"William H. {Gates III}"> splits into
249              
250             first => ("William", "H.")
251             last => ("{Gates III}")
252              
253             There is no way to make a comma-less suffix the C<jr> part. (This is an
254             unfortunate consequence of slavishly imitating BibTeX 0.99.)
255              
256             Finally, names that aren't really names of people but rather are
257             organization or company names should be forced into a single token by
258             wrapping them in curly braces. For example, "Foo, Bar and Sons" should
259             be written C<"{Foo, Bar and Sons}">, which will split as
260              
261             last => ("{Foo, Bar and Sons}")
262              
263             Of course, if this is one name in a BibTeX C<author> or C<editor>
264             list, this name has to be wrapped in braces anyways (because of the C<"
265             and ">), but that's another story.
266              
267             =head1 FORMATTING NAMES
268              
269             Putting a split-up name back together again in a flexible, customizable
270             way is the job of another module: see L<Text::BibTeX::NameFormat>.
271              
272             =head1 METHODS
273              
274             =over 4
275              
276             =item new([ [OPTS,] NAME [, FILENAME, LINE, NAME_NUM]])
277              
278             Creates a new C<Text::BibTeX::Name> object. If NAME is supplied, it
279             must be a string containing a single name, and it will be passed to
280             the C<split> method for further processing. FILENAME, LINE, and
281             NAME_NUM, if present, are all also passed to C<split> to allow better
282             error messages.
283              
284             If the first argument is a hash reference, it is used to define
285             configuration values. At the moment the available values are:
286              
287             =over 4
288              
289             =item BINMODE
290              
291             Set the way Text::BibTeX deals with strings. By default it manages
292             strings as bytes. You can set BINMODE to 'utf-8' to get NFC normalized
293             UTF-8 strings and you can customise the normalization with the NORMALIZATION option.
294              
295             Text::BibTeX::Name->new(
296             { binmode => 'utf-8', normalization => 'NFD' },
297             "Alberto Simões");
298              
299             =back
300              
301             =cut
302              
303             sub new {
304 52     52 1 34872 my $class = shift;
305 52 100       114 my $opts = ref $_[0] eq 'HASH' ? shift : {};
306              
307 52         208 $opts->{ lc $_ } = $opts->{$_} for ( keys %$opts );
308              
309 52         99 my ( $name, $filename, $line, $name_num ) = @_;
310              
311 52   33     188 $class = ref($class) || $class;
312 52         85 my $self = bless { }, $class;
313              
314 52         108 $self->{binmode} = 'bytes';
315 52         69 $self->{normalization} = 'NFC';
316             $self->{binmode} = 'utf-8'
317 52 100 100     200 if exists $opts->{binmode} && $opts->{binmode} =~ /utf-?8/i;
318 52 100       102 $self->{normalization} = $opts->{normalization} if exists $opts->{normalization};
319              
320 52 100       155 $self->split( Text::BibTeX->_process_argument($name, $self->{binmode}, $self->{normalization}),
321             $filename, $line, $name_num, 1 )
322             if ( defined $name );
323 52         176 $self;
324             }
325              
326              
327             sub DESTROY
328             {
329 52     52   3580 my $self = shift;
330 52         444 $self->free; # free the C structure kept by `split'
331             }
332              
333              
334             =item split (NAME [, FILENAME, LINE, NAME_NUM])
335              
336             Splits NAME (a string containing a single name) into tokens and
337             subsequently into the four parts of a BibTeX-style name (first, von,
338             last, and jr). (Each part is a list of tokens, and tokens are separated
339             by whitespace or commas at brace-depth zero. See above for full details
340             on how a name is split into its component parts.)
341              
342             The token-lists that make up each part of the name are then stored in
343             the C<Text::BibTeX::Name> object for later retrieval or formatting with
344             the C<part> and C<format> methods.
345              
346             =cut
347              
348             sub split
349             {
350 54     54 1 767 my ($self, $name, $filename, $line, $name_num) = @_;
351              
352             # Call the XSUB with default values if necessary
353 54 100       120 $self->_split (Text::BibTeX->_process_argument($name, $self->{binmode}, $self->{normalization}), $filename,
    100          
354             defined $line ? $line : -1,
355             defined $name_num ? $name_num : -1,
356             1);
357             }
358              
359              
360             =item part (PARTNAME)
361              
362             Returns the list of tokens in part PARTNAME of a name previously split with
363             C<split>. For example, suppose a C<Text::BibTeX::Name> object is created and
364             initialized like this:
365              
366             $name = Text::BibTeX::Name->new();
367             $name->split ('Charles Louis Xavier Joseph de la Vall{\'e}e Poussin');
368              
369             Then this code:
370              
371             $name->part ('von');
372              
373             would return the list C<('de','la')>.
374              
375             =cut
376              
377             sub part {
378 33     33 1 1539 my ( $self, $partname ) = @_;
379              
380 33 50       142 croak "unknown name part"
381             unless $partname =~ /^(first|von|last|jr)$/;
382              
383 33 100       75 if ( exists $self->{$partname} ) {
384 33         200 my @x = map { Text::BibTeX->_process_result($_, $self->{binmode}, $self->{normalization}) }
385 24         29 @{ $self->{$partname} };
  24         60  
386 24 100       417 return @x > 1 ? @x : $x[0];
387             }
388 9         20 return undef;
389             }
390              
391              
392             =item format (FORMAT)
393              
394             Formats a name according to the specifications encoded in FORMAT, which
395             should be a C<Text::BibTeX::NameFormat> (or descendant) object. (In short,
396             it must supply a method C<apply> which takes a C<Text::BibTeX::Name>
397             object as its only argument.) Returns the formatted name as a string.
398              
399             See L<Text::BibTeX::NameFormat> for full details on formatting names.
400              
401             =cut
402              
403             sub format
404             {
405 23     23 1 37 my ($self, $format) = @_;
406              
407 23         38 $format->apply ($self);
408             }
409              
410             1;
411              
412             =back
413              
414             =head1 SEE ALSO
415              
416             L<Text::BibTeX::Entry>, L<Text::BibTeX::NameFormat>, L<bt_split_names>.
417              
418             =head1 AUTHOR
419              
420             Greg Ward
421              
422             =head1 COPYRIGHT
423              
424             Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
425             is part of the Text::BibTeX library. This library is free software; you
426             may redistribute it and/or modify it under the same terms as Perl itself.