File Coverage

blib/lib/BibTeX/Parser/Author.pm

Criterion	Covered	Total	%
statement	105	152	69.0
branch	45	80	56.2
condition	2	18	11.1
subroutine	14	16	87.5
pod	7	7	100.0
total	173	273	63.3

line	stmt	bran	cond	sub	pod	time	code
1							package BibTeX::Parser::Author;
2							{
3							$BibTeX::Parser::Author::VERSION = '1.93';
4							}
5
6	19			19		261582	use warnings;
	19					60
	19					1274
7	19			19		105	use strict;
	19					33
	19					936
8							require BibTeX::Parser; # mutual dependency
9
10							use overload
11	19			19		11351	'""' => \&to_string;
	19					32978
	19					195
12
13
14
15							sub new {
16	68			68	1	230774	my $class = shift;
17
18	68	100				203	if (@_) {
19	65					222	my $self = [ $class->split(@_) ];
20	65					455	return bless $self, $class;
21							} else {
22	3					14	return bless [], $class;
23							}
24							}
25
26							sub _get_or_set_field {
27	406			406		839	my ($self, $field, $value) = @_;
28	406	100				849	if (defined $value) {
29	4					28	$self->[$field] = $value;
30							} else {
31	402					1589	return $self->[$field];
32							}
33							}
34
35
36							sub first {
37	140			140	1	41795	shift->_get_or_set_field(0, @_);
38							}
39
40
41							sub von {
42	86			86	1	244	shift->_get_or_set_field(1, @_);
43							}
44
45
46							sub last {
47	94			94	1	21639	shift->_get_or_set_field(2, @_);
48							}
49
50
51							sub jr {
52	86			86	1	214	shift->_get_or_set_field(3, @_);
53							}
54
55
56							# Take a string and create an array [first, von, last, jr]
57							sub split {
58	98			98	1	16557	my ($self_or_class, $name) = @_;
59							#warn " starting split for $name\n";
60
61							# remove whitespace at start and end of string
62	98					813	$name =~ s/^\s(.)\s*$/$1/s;
63
64
65
66	98	100				337	if (!length($name)) {
67	6					29	return (undef, undef, undef, undef);
68							}
69
70	92					310	my @comma_separated =
71							BibTeX::Parser::_split_braced_string($name,
72							'\s,\s');
73							#warn " got ccs (", 0+@comma_separated, "): @comma_separated\n";
74	92	50				239	if (scalar(@comma_separated) == 0) {
75							# Error?
76	0					0	return (undef, undef, undef, undef);
77							}
78
79	92					179	my $first=undef;
80	92					164	my $von=undef;
81	92					177	my $last=undef;
82	92					150	my $jr=undef;
83
84	92	100				245	if (scalar(@comma_separated) == 1) {
85							# First von Last form
86	65					189	my @tokens =
87							BibTeX::Parser::_split_braced_string($name, '\s+');
88	65	50				158	if (!scalar (@tokens)) {
89	0					0	return (undef, undef, undef, undef);
90							}
91	65					210	my ($start_von, $start_last) = _getStartVonLast (@tokens);
92							#warn " got start_von=$start_von, start_last=$start_last for tokens @tokens\n";
93	65	100				182	if ($start_von >0) {
94	55					197	$first = join(' ', splice(@tokens,0,$start_von));
95							}
96	65	100				202	if (($start_last-$start_von) >0) {
97	9					29	$von = join(' ', splice(@tokens,0,$start_last-$start_von));
98							}
99	65					180	$last = join(' ',@tokens);
100	65					423	return ($first, $von, $last, $jr);
101							}
102							# Now we work with von Last, [Jr,] First form
103	27	100				74	if (scalar @comma_separated == 2) { # no jr
104	21					85	my @tokens=
105							BibTeX::Parser::_split_braced_string($comma_separated[1], '\s+');
106	21					66	$first = join(' ', @tokens);
107							} else { # jr is present
108	6					23	my @tokens=
109							BibTeX::Parser::_split_braced_string($comma_separated[1], '\s+');
110	6					17	$jr = join(' ', @tokens);
111	6					21	@tokens=
112							BibTeX::Parser::_split_braced_string($comma_separated[2], '\s+');
113	6					21	$first = join(' ', @tokens);
114							}
115	27					93	my @tokens =
116							BibTeX::Parser::_split_braced_string($comma_separated[0], '\s+');
117	27					90	my $start_last = _getStartLast(@tokens);
118	27	100				70	if ($start_last > 0) {
119	10					34	$von=join(' ', splice(@tokens,0,$start_last));
120							}
121	27					64	$last = join(' ',@tokens);
122	27					194	return ($first, $von, $last, $jr);
123
124							}
125
126							# Return the index of the first von element and the first lastname
127							# element. If no von element, von=last
128
129							sub _getStartVonLast {
130	65			65		131	my $length=scalar(@_);
131	65	100				171	if ($length==1) {
132	8					24	return (0,0);
133							}
134	57					107	my $start_von=-1;
135	57					97	my $start_last=$length-1;
136	57					184	for (my $i=0; $i<$length; $i++) {
137	130	100				347	if (_is_von_token($_[$i])) {
138	14					28	$start_von=$i;
139	14					27	last;
140							}
141							}
142	57	100				152	if ($start_von== -1) { # no von part
143	43					151	return($length-1, $length-1);
144							}
145	14	100				45	if ($start_von== $length-1) { # all parts but last are upper case?
146	5					13	return($length-1, $length-1);
147							}
148	9					48	for (my $i=$start_von+1; $i<$length; $i++) {
149	13	100				38	if (!_is_von_token($_[$i])) {
150	9					17	$start_last=$i;
151	9					19	last;
152							}
153							}
154	9					29	return($start_von, $start_last);
155							}
156
157
158							# Return the index of the first lastname
159							# element provided no first name elements are present
160
161							sub _getStartLast {
162	27			27		53	my $length=scalar(@_);
163	27	100				75	if ($length==1) {
164	15					3868	return 0;
165							}
166	12					28	my $start_last=$length-1;
167	12					42	for (my $i=0; $i<$length; $i++) {
168	22	100				62	if (!_is_von_token($_[$i])) {
169	12					22	$start_last=$i;
170	12					44	last;
171							}
172							}
173	12					25	return $start_last;
174							}
175
176
177							sub _split_name_parts {
178	0			0		0	my $name = shift;
179
180	0	0				0	if ( $name !~ /\{/ ) {
181	0					0	return split /\s+/, $name;
182							} else {
183	0					0	my @parts;
184	0					0	my $cur_token = '';
185	0					0	while ( scalar( $name =~ /\G ( [^\s\{]* ) ( \s+ \| \{ \| \s* $ ) /xgc ) ) {
186	0					0	$cur_token .= $1;
187	0	0				0	if ( $2 =~ /\{/ ) {
188	0	0				0	if ( scalar( $name =~ /\G([^\}]*)\}/gc ) ) {
189	0					0	$cur_token .= "{$1}";
190							} else {
191	0					0	die "Unmatched brace in name '$name'";
192							}
193							} else {
194	0	0				0	if ( $cur_token =~ /^{(.*)}$/ ) {
195	0					0	$cur_token = $1;
196							}
197	0					0	push @parts, $cur_token;
198	0					0	$cur_token = '';
199							}
200							}
201	0					0	return @parts;
202							}
203
204							}
205
206
207							sub _get_single_author_from_tokens {
208	0			0		0	my (@tokens) = @_;
209	0	0				0	if (@tokens == 0) {
		0
		0
210	0					0	return (undef, undef, undef, undef);
211							} elsif (@tokens == 1) { # name without comma
212	0	0				0	if ( $tokens[0] =~ /(^\|\s)[[:lower:]]/) { # name has von part or has only lowercase names
213	0					0	my @name_parts = _split_name_parts $tokens[0];
214
215	0					0	my $first;
216	0		0			0	while (@name_parts && ucfirst($name_parts[0]) eq $name_parts[0] ) {
217	0	0				0	$first .= $first ? ' ' . shift @name_parts : shift @name_parts;
218							}
219
220	0					0	my $von;
221							# von part are lowercase words
222	0		0			0	while ( @name_parts && lc($name_parts[0]) eq $name_parts[0] ) {
223	0	0				0	$von .= $von ? ' ' . shift @name_parts : shift @name_parts;
224							}
225
226	0	0				0	if (@name_parts) {
227	0					0	return ($first, $von, join(" ", @name_parts), undef);
228							} else {
229	0					0	return (undef, undef, $tokens[0], undef);
230							}
231							} else {
232	0	0	0			0	if ( $tokens[0] !~ /\{/ && $tokens[0] =~ /^((.*)\s+)?\b(\S+)$/) {
233	0					0	return ($2, undef, $3, undef);
234							} else {
235	0					0	my @name_parts = _split_name_parts $tokens[0];
236	0					0	return ($name_parts[0], undef, $name_parts[1], undef);
237							}
238							}
239
240							} elsif (@tokens == 2) {
241	0					0	my @von_last_parts = _split_name_parts $tokens[0];
242	0					0	my $von;
243							# von part are lowercase words
244	0		0			0	while ( @von_last_parts && lc($von_last_parts[0]) eq $von_last_parts[0] ) {
245	0	0				0	$von .= $von ? ' ' . shift @von_last_parts : shift @von_last_parts;
246							}
247	0					0	return ($tokens[1], $von, join(" ", @von_last_parts), undef);
248							} else {
249	0					0	my @von_last_parts = _split_name_parts $tokens[0];
250	0					0	my $von;
251							# von part are lowercase words
252	0		0			0	while ( @von_last_parts && lc($von_last_parts[0]) eq $von_last_parts[0] ) {
253	0	0				0	$von .= $von ? ' ' . shift @von_last_parts : shift @von_last_parts;
254							}
255	0					0	return ($tokens[2], $von, join(" ", @von_last_parts), $tokens[1]);
256							}
257
258							}
259
260
261
262							# The goal is to return a name in form
263							# von Last, Jr, First
264							# where any of the parts except Last may be empty.
265							#
266							sub to_string {
267	46			46	1	3110	my $self = shift;
268
269	46					106	my $last = $self->last; # assume always present
270	46	50				105	my $first = $self->first ? (", " . $self->first) : ''; # ", first"
271	46	50				118	my $von = $self->von ? ($self->von . " ") : ''; # "von "
272	46	50				105	my $jr = $self->jr ? (", " . $self->jr ) : ''; # ", jr"
273							#
274	46					123	my $ret = "${von}${last}${jr}${first}";
275							#warn "returning name: $ret\n";
276	46					170	return $ret;
277
278							# original code, which introduced a spurious space with a von part.
279							# https://github.com/borisveytsman/crossrefware/issues/11
280							#
281							# if ($self->jr) {
282							# return () . " " . $self->last . ", " . $self->jr . ", " . $self->first;
283							# } else {
284							# return ($self->von ? $self->von . " " : '') . $self->last . ($self->first ? ", " . $self->first : '');
285							# }
286							#
287							}
288
289
290							# Return 1 if the first letter on brace level 0 is lowercase
291							sub _is_von_token {
292	170			170		214548	my $string = shift;
293							#warn " checking von: $string\n";
294
295							# The while loop removes all non-alpha characters from the front of
296							# the string. But our input might use an entity for a character,
297							# say Šimon for \v{S}imon. If we go through the loop, we will
298							# wrongly consider the "x" of "&#x" to be the first character, and
299							# since that is lowercase, it will become a von part.
300							#
301							# As a kludge, we simply avoid the loop if the first character is "&",
302							# assuming it is an uppercase letter (TUGboat 46:1,
303							# tb142konecny-opbible). If a von part truly does ever start with an
304							# accented lowercase letter, this will fail.
305							#
306	170	50				418	if ($string !~ /^&/) {
307	170					572	while ($string =~
308							s/^(\\[[:alpha:]]+\{\|\{\|\\[[:^alpha:]]?\|[[:^alpha:]])//) {
309	8	100				47	if ($1 eq '{' ) {
310	5					11	my $numbraces=1;
311	5		66			30	while ($numbraces !=0 && length($string)) {
312	66					89	my $symbol = substr($string, 0, 1);
313	66	50				141	if ($symbol eq '{') {
		100
314	0					0	$numbraces ++;
315							} elsif ($symbol eq '}') {
316	5					8	$numbraces --;
317							}
318	66					179	$string = substr($string,1);
319							}
320							}
321							}
322							}
323
324	170	100				379	if (length $string ) {
325	165					345	my $symbol = substr($string, 0, 1);
326							# We have to check against uc($symbol) not being the same,
327							# rather than lc($symbol) being the same, because for a
328							# non-alpha like "&", lc will indeed be the same.
329							#warn " von symbol: $symbol, vs. ", uc($symbol), "\n";
330	165	100				389	if (uc($symbol) ne $symbol) {
331	25					119	return 1;
332							} else {
333	140					470	return 0;
334							}
335							} else {
336	5					14	return 1;
337							}
338
339							}
340
341							1; # End of BibTeX::Entry
342
343							__END__