File Coverage

blib/lib/BibTeX/Parser.pm

Criterion	Covered	Total	%
statement	127	137	92.7
branch	54	64	84.3
condition	14	17	82.3
subroutine	10	10	100.0
pod	2	2	100.0
total	207	230	90.0

line	stmt	bran	cond	sub	pod	time	code
1							package BibTeX::Parser;
2							{
3							$BibTeX::Parser::VERSION = '1.04';
4							}
5							# ABSTRACT: A pure perl BibTeX parser
6	18			18		1014776	use warnings;
	18					174
	18					640
7	18			18		104	use strict;
	18					34
	18					412
8
9	18			18		7238	use BibTeX::Parser::Entry;
	18					48
	18					31457
10
11
12							my $re_namechar = qr/[a-zA-Z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\\|\']/o;
13							my $re_name = qr/$re_namechar+/o;
14
15
16							sub new {
17	15			15	1	6040	my ( $class, $fh ) = @_;
18
19	15					227	return bless {
20							fh => $fh,
21							strings => {
22							jan => "January",
23							feb => "February",
24							mar => "March",
25							apr => "April",
26							may => "May",
27							jun => "June",
28							jul => "July",
29							aug => "August",
30							sep => "September",
31							oct => "October",
32							nov => "November",
33							dec => "December",
34
35							},
36							line => -1,
37							buffer => "",
38							}, $class;
39							}
40
41							sub _slurp_close_bracket;
42
43							sub _parse_next {
44	42			42		96	my $self = shift;
45
46	42					68	while (1) { # loop until regular entry is finished
47	46	100				229	return 0 if $self->{fh}->eof;
48	41					814	local $_ = $self->{buffer};
49
50	41					118	until (/@/m) {
51	111					2162	my $line = $self->{fh}->getline;
52	111	100				2593	return 0 unless defined $line;
53	107					186	$line =~ s/^%.*$//;
54	107					358	$_ .= $line;
55							}
56
57	37					180	my $current_entry = new BibTeX::Parser::Entry;
58	37	100				501	if (/@($re_name)/cgo) {
59	36					112	my $type = uc $1;
60	36					125	$current_entry->type( $type );
61	36					94	my $start_pos = pos($_) - length($type) - 1;
62
63							# read rest of entry (matches braces)
64	36					55	my $bracelevel = 0;
65	36					85	$bracelevel += tr/\{/\{/; #count braces
66	36					74	$bracelevel -= tr/\}/\}/;
67	36					107	while ( $bracelevel != 0 ) {
68	156					254	my $position = pos($_);
69	156					2312	my $line = $self->{fh}->getline;
70	156	50				3453	last unless defined $line;
71	156					290	$bracelevel =
72							$bracelevel + ( $line =~ tr/\{/\{/ ) - ( $line =~ tr/\}/\}/ );
73	156					323	$_ .= $line;
74	156					454	pos($_) = $position;
75							}
76
77							# Remember text before the entry
78	36					129	my $pre = substr($_, 0, $start_pos-1);
79	36	100				115	if ($start_pos == 0) {
80	9					21	$pre = '';
81							}
82	36					139	$current_entry->pre($pre);
83
84
85							# Remember raw bibtex code
86	36					80	my $raw = substr($_, $start_pos);
87	36					112	$raw =~ s/^\s+//;
88	36					268	$raw =~ s/\s+$//;
89	36					136	$current_entry->raw_bibtex($raw);
90
91	36					57	my $pos = pos $_;
92	36					88	tr/\n/ /;
93	36					89	pos($_) = $pos;
94
95	36	100	100			211	if ( $type eq "STRING" ) {
		100
96	2	50				37	if (/\G\{\s($re_name)\s=\s*/cgo) {
97	2					6	my $key = $1;
98	2					5	my $value = _parse_string( $self->{strings} );
99	2	50				8	if ( defined $self->{strings}->{$key} ) {
100	0					0	warn("Redefining string $key!");
101							}
102	2					5	$self->{strings}->{$key} = $value;
103	2					8	/\G[\s\n]*\}/cg;
104							} else {
105	0					0	$current_entry->error("Malformed string!");
106	0					0	return $current_entry;
107							}
108							} elsif ( $type eq "COMMENT" or $type eq "PREAMBLE" ) {
109	2					6	/\G\{./cgo;
110	2					6	_slurp_close_bracket;
111							} else { # normal entry
112	32					111	$current_entry->parse_ok(1);
113
114							# parse key
115	32	100				717	if (/\G\s\{(?:\s($re_name)\s,[\s\n]\|\s+\r?\s*)/cgo) {
116	31					137	$current_entry->key($1);
117
118							# fields
119	31					676	while (/\G[\s\n]($re_name)[\s\n]=[\s\n]*/cgo) {
120							$current_entry->field(
121	135					374	$1 => _parse_string( $self->{strings} ) );
122	135					291	my $idx = index( $_, ',', pos($_) );
123	135	100				695	pos($_) = $idx + 1 if $idx > 0;
124							}
125
126	31					169	return $current_entry;
127
128							} else {
129
130	1		50			20	$current_entry->error("Malformed entry (key contains illegal characters) at " . substr($_, pos($_) \|\| 0, 20) . ", ignoring");
131	1					4	_slurp_close_bracket;
132	1					11	return $current_entry;
133							}
134							}
135
136	4					27	$self->{buffer} = substr $_, pos($_);
137
138							} else {
139	1		50			14	$current_entry->error("Did not find type at " . substr($_, pos($_) \|\| 0, 20));
140	1					4	return $current_entry;
141							}
142
143							}
144							}
145
146
147							sub next {
148	42			42	1	6265	my $self = shift;
149
150	42					106	return $self->_parse_next;
151							}
152
153							# slurp everything till the next closing brace. Handles
154							# nested brackets
155							sub _slurp_close_bracket {
156	3			3		6	my $bracelevel = 0;
157							BRACE: {
158	3	50				5	/\G[^\}]*\{/cg && do { $bracelevel++; redo BRACE };
	3					9
	0					0
	0					0
159							/\G[^\{]*\}/cg
160	3	100				21	&& do {
161	2	50				6	if ( $bracelevel > 0 ) {
162	0					0	$bracelevel--;
163	0					0	redo BRACE;
164							} else {
165	2					4	return;
166							}
167							}
168							}
169							}
170
171							# parse bibtex string in $_ and return. A BibTeX string is either enclosed
172							# in double quotes '"' or matching braces '{}'. The braced form may contain
173							# nested braces.
174							sub _parse_string {
175	144			144		3746	my $strings_ref = shift;
176
177	144					205	my $value = "";
178
179							PART: {
180	144	100				191	if (/\G(\d+)/cg) {
	151	100				1072
		100
181	12					38	$value .= $1;
182							} elsif (/\G($re_name)/cgo) {
183	10	50				46	warn("Using undefined string $1") unless defined $strings_ref->{$1};
184	10		50			51	$value .= $strings_ref->{$1} \|\| "";
185							} elsif (/\G"(([^"\\](\\.)[^\\"]))"/cgs)
186							{ # quoted string with embeded escapes
187	81					223	$value .= $1;
188							} else {
189	48					117	my $part = _extract_bracketed( $_ );
190	48					126	$value .= substr $part, 1, length($part) - 2; # strip quotes
191							}
192
193	151	100				396	if (/\G\s#\s/cg) { # string concatenation by #
194	7					17	redo PART;
195							}
196							}
197	144					544	$value =~ s/[\s\n]+/ /g;
198	144					537	return $value;
199							}
200
201							sub _extract_bracketed
202							{
203	48			48		90	for($_[0]) # alias to $_
204							{
205	48					103	/\G\s+/cg;
206	48					71	my $start = pos($_);
207	48					60	my $depth = 0;
208	48					67	while(1)
209							{
210	175	100				349	/\G\\./cg && next;
211	173	100				352	/\G\{/cg && (++$depth, next);
212	118	100				300	/\G\}/cg && (--$depth > 0 ? next : last);
		100
213	63	50				168	/\G([^\\\{\}]+)/cg && next;
214	0					0	last; # end of string
215							}
216	48					172	return substr($_, $start, pos($_)-$start);
217							}
218							}
219
220							# Split the $string using $pattern as a delimiter with
221							# each part having balanced braces (so "{$pattern}"
222							# does NOT split).
223							# Return empty list if unmatched braces
224
225							sub _split_braced_string {
226	239			239		4320	my $string = shift;
227	239					342	my $pattern = shift;
228	239					347	my @tokens;
229	239	100				543	return () if $string eq '';
230	238					324	my $buffer;
231	238		100			592	while (!defined pos $string \|\| pos $string < length $string) {
232	406	100				6114	if ( $string =~ /\G(.*?)(\{\|$pattern)/cgi ) {
233	186					485	my $match = $1;
234	186	100				1345	if ( $2 =~ /$pattern/i ) {
		50
235	157					318	$buffer .= $match;
236	157					358	push @tokens, $buffer;
237	157					774	$buffer = "";
238							} elsif ( $2 =~ /\{/ ) {
239	29					97	$buffer .= $match . "{";
240	29					49	my $numbraces=1;
241	29		100			125	while ($numbraces !=0 && pos $string < length $string) {
242	345					530	my $symbol = substr($string, pos $string, 1);
243	345					432	$buffer .= $symbol;
244	345	50				674	if ($symbol eq '{') {
		100
245	0					0	$numbraces ++;
246							} elsif ($symbol eq '}') {
247	27					34	$numbraces --;
248							}
249	345					1080	pos($string) ++;
250							}
251	29	100				130	if ($numbraces != 0) {
252	2					13	return ();
253							}
254							} else {
255	0					0	$buffer .= $match;
256							}
257							} else {
258	220		100			856	$buffer .= substr $string, (pos $string \|\| 0);
259	220					431	last;
260							}
261							}
262	236	50				676	push @tokens, $buffer if $buffer;
263	236					898	return @tokens;
264							}
265
266
267							1; # End of BibTeX::Parser
268
269
270							__END__