| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Biblio::Document::Parser::Brody; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
###################################################################### |
|
4
|
|
|
|
|
|
|
# |
|
5
|
|
|
|
|
|
|
# Biblio::Document::Parser::Brody; |
|
6
|
|
|
|
|
|
|
# |
|
7
|
|
|
|
|
|
|
###################################################################### |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
# Reference Parser by Tim Brody |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# This file is part of ParaCite Tools (http://paracite.eprints.org/developers/) |
|
12
|
|
|
|
|
|
|
# |
|
13
|
|
|
|
|
|
|
# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. |
|
14
|
|
|
|
|
|
|
# |
|
15
|
|
|
|
|
|
|
# ParaTools is free software; you can redistribute it and/or modify |
|
16
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
|
17
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or |
|
18
|
|
|
|
|
|
|
# (at your option) any later version. |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# ParaTools is distributed in the hope that it will be useful, |
|
21
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
22
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
23
|
|
|
|
|
|
|
# GNU General Public License for more details. |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
26
|
|
|
|
|
|
|
# along with ParaTools; if not, write to the Free Software |
|
27
|
|
|
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
28
|
|
|
|
|
|
|
# |
|
29
|
|
|
|
|
|
|
###################################################################### |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=pod |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 NAME |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
Biblio::Document::Parser::Brody |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
Module that parses reference strings from a document. Relies on a reference section starting with a title "References", "Bibliography", or "Cited". Separates references by prefixed number (e.g. "[1]" or "1.") or by year (e.g. "Smith, J (1992)"). |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
use Biblio::Document::Parser::Brody; |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
my $parser = Biblio::Document::Parser::Brody->new(); |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
my @refs = $parser->parse(\*FILE_IO); |
|
48
|
|
|
|
|
|
|
my @refs = $parser->parse($str); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 METHODS |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut |
|
53
|
|
|
|
|
|
|
|
|
54
|
1
|
|
|
1
|
|
7028
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
33
|
|
|
55
|
|
|
|
|
|
|
|
|
56
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
60
|
|
|
57
|
1
|
|
|
1
|
|
4
|
use vars qw($DEBUG $RE_BOR $RE_EOR $RE_NAME_CHARS $RE_NAME $RE_NAME_LIST_CHARS $MAX_SIZE); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
2963
|
|
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Set up the input/output appropriately |
|
60
|
|
|
|
|
|
|
#use open IN => ':encoding(latin1)', OUT => ':utf8'; |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Upper bound on the number of bytes parse() reads from a file handle.
$MAX_SIZE = 1024*2000; # 2MB

# Heading line that opens the reference section ("References",
# "References Cited" or "Bibliography", with only non-letters around it).
# The /m flag has to be part of the qr// itself: flags given at the place
# where a qr// object is used do not apply inside it.  The alternatives are
# grouped so that both anchors apply to every alternative.
$RE_BOR = qr/^[^a-z]*(?:references(?:\s+cited)?|bibliography)[^a-z]*$/im;

# Heading line of a section that follows the references (optionally
# numbered, with up to 10 further characters on the line).  Same remarks
# about /m and grouping as for $RE_BOR.
$RE_EOR = qr/^\s*(?:\d+\.?\s*)*(?:acknowledge?ment|footnote|appendix|abbreviation|glossary|figure)[^\n]{0,10}\s*$/im;

# Characters allowed in a single name.
$RE_NAME_CHARS = qr/[a-zA-Z`'\-]/;

# Characters allowed in a list of author names.
$RE_NAME_LIST_CHARS = qr/[a-zA-Z,\.;\(\)\-\s\&'`]/;

# A surname of 4-7 characters followed by a comma and dotted initials.
$RE_NAME = qr/(?:[a-zA-Z`'\-]{4,7}, *(?:[a-zA-Z]\. *)+)/;
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=pod |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=over 4 |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item $p = Biblio::Document::Parser::Brody->new([-debug=>1]) |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Constructor method for class. |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=cut |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub new {
    # Constructor.  Takes the class name followed by optional key/value
    # pairs; the only key that is read is -debug.
    my $class   = shift;
    my %options = @_;

    # The debug flag is stored in the package variable $DEBUG, not in the
    # object, so it is shared by every parser instance.
    $DEBUG = $options{-debug};

    my $self = bless {}, $class;
    return $self;
}
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=pod |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item @refs = $p->parse($str) |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Parses a string $str and returns a list of unstructured reference strings. |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=cut |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
sub parse {
    # Extract the list of reference strings from a document.
    #
    # Arguments: either a reference (treated as a readable file handle, of
    # which at most $MAX_SIZE bytes are read) or one or more strings that
    # are concatenated to form the document text.
    # Returns:   the list of non-empty references, each prefixed with
    #            "[n] " where n is its position in the detected list.
    # Croaks when reading fails or when there is no input at all.
    my $self = shift @_;
    my $arg = shift @_;
    my $BIBL = '';

    # Any reference is assumed to be a file handle.
    if( ref($arg) ) {
        # read() returns undef on failure and 0 at end-of-file; only undef
        # is an I/O error.  Empty input is reported just below.
        defined( read($arg,$BIBL,$MAX_SIZE) )
            or croak "Error reading from file handle: $!\n";
    } else {
        $BIBL = join('',$arg,@_);
    }

    croak "No data to parse\n" unless length($BIBL);

    # Form feeds (page breaks) become paragraph breaks.
    $BIBL =~ s/\f/\n\n/sg;

    # Collect candidate page headers/footers: a short line preceded by one
    # blank line and followed by two, generalised into a pattern by
    # header_to_regexp() and counted.
    my %HEADERS;
    while( $BIBL =~ /(?:\n[\r[:blank:]]*){2}([^\n]{0,40}\w+[^\n]{0,40})(?:\n[\r[:blank:]]*){3}/sg ) {
        $HEADERS{header_to_regexp($1)}++;
    }

    if( %HEADERS ) {
        # Only the most frequent candidate is considered, and only when it
        # occurs more than three times.
        my @regexps = sort { $HEADERS{$b} <=> $HEADERS{$a} } keys %HEADERS;
        my $regexp = $regexps[0];
        if( $HEADERS{$regexp} > 3 ) {
            my $c = $BIBL =~ s/(?:\n[\r[:blank:]]*){2}(?:$regexp)(?:\n[\r[:blank:]]*){3}/\n\n/sg;
            warn "Applying regexp: $regexp ($HEADERS{$regexp} original matches) Removed $c header/footers using ($HEADERS{$regexp} original matches): $regexp\n" if $DEBUG;
        } else {
            warn "Not enough matching header/footers were found\n" if $DEBUG;
        }
    } else {
        warn "No header/footers were found\n" if $DEBUG;
    }

    my @REFS;

    # Attempt to find the reference section: skip to just after each
    # candidate heading in turn until one of the style parsers returns
    # something.
    while( !@REFS && $BIBL =~ /$RE_BOR/mi ) {
        # Keep only the text after the heading (what $' used to provide).
        $BIBL = substr($BIBL, $+[0]);
        last unless $BIBL;

        # The style is guessed from roughly the first 2k of the section.
        # Matches are counted in list context so that no match position is
        # left behind on $buffer for the next check.
        my $buffer = substr($BIBL, 0, 2048);
        my $c;

        # Count the number of occurrences of [\d]
        $c = () = $buffer =~ m/^\s*\[\d+\]/mg;
        if( $c >= 5 ) {
            warn "Style = numbered square ([1])\n" if $DEBUG;
            last if (@REFS = style_numbered_square($BIBL));
        }

        # How about 1. notation
        $c = () = $buffer =~ m/^\s*\d+\./mg;
        if( $c >= 5 ) {
            warn "Style = numbered (1.)\n" if $DEBUG;
            last if (@REFS = style_numbered($BIBL));
        }

        # Now we're getting desperate - hopefully it's a name list followed
        # by a year.  The century alternation is grouped so that the
        # line-start anchor and the name list apply to both 19xx and 20xx.
        $c = () = $buffer =~ m/^${RE_NAME_LIST_CHARS}{10,40}[^\d\-](?:19|20)\d{2}[^\d\-]/mg;
        if( $c >= 5 ) {
            warn "Style = years\n" if $DEBUG;
            last if (@REFS = style_years($BIBL));
        }
    }

    # Prefix every non-empty reference with its position in the list.
    for( my $i = 0; $i < @REFS; $i++ ) {
        my $ref = $REFS[$i] or next;
        $REFS[$i] = "[" . ($i+1) . "] " . $ref;
    }

    return grep { defined($_) && length($_) } @REFS;
}
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
#my ($BIBL, $buffer); |
|
194
|
|
|
|
|
|
|
#$BIBL = ''; |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
#my $lc = 0; |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
#die "FATAL: Input has gone beyond $MAX_SIZE byte limit" if read(STDIN,$BIBL,$MAX_SIZE) == $MAX_SIZE; |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
#die "Empty input" unless length($BIBL); |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
#while( read(STDIN,$buffer,4096) ) { |
|
203
|
|
|
|
|
|
|
# $BIBL .= $buffer; |
|
204
|
|
|
|
|
|
|
# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE; |
|
205
|
|
|
|
|
|
|
#} |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
#while( <> ) { |
|
209
|
|
|
|
|
|
|
# s/\f/\n\n/sg; |
|
210
|
|
|
|
|
|
|
# $BIBL = $_ . $BIBL; |
|
211
|
|
|
|
|
|
|
# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE; |
|
212
|
|
|
|
|
|
|
# if( $_ =~ /^(?:\n\s*){3}/ ) { |
|
213
|
|
|
|
|
|
|
# # Regexp matches for the end of the string are *really* bad performance |
|
214
|
|
|
|
|
|
|
# # Lines are in reverse order! |
|
215
|
|
|
|
|
|
|
# if( $BIBL =~ /^(?:\n\s*){3}([^\n]{0,40}\w+[^\n]{0,40})(?:\n\s*){2}/os ) { |
|
216
|
|
|
|
|
|
|
# $HEADERS{header_to_regexp($1)}++; |
|
217
|
|
|
|
|
|
|
# } |
|
218
|
|
|
|
|
|
|
# } |
|
219
|
|
|
|
|
|
|
#} |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# Put the lines back in-order |
|
222
|
|
|
|
|
|
|
#my @lines = split(/\n/,$BIBL); |
|
223
|
|
|
|
|
|
|
#$BIBL = ''; |
|
224
|
|
|
|
|
|
|
#for(@lines) { |
|
225
|
|
|
|
|
|
|
# $BIBL = $_ . "\n" . $BIBL; |
|
226
|
|
|
|
|
|
|
#} |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# Read in the document |
|
229
|
|
|
|
|
|
|
#while( read(STDIN,$buffer,4096) ) { |
|
230
|
|
|
|
|
|
|
# if( length($BIBL) > $MAX_SIZE ) { |
|
231
|
|
|
|
|
|
|
# die "FATAL: Input has gone beyond $MAX_SIZE Bytes limit\n"; |
|
232
|
|
|
|
|
|
|
# } |
|
233
|
|
|
|
|
|
|
# $BIBL .= $buffer; |
|
234
|
|
|
|
|
|
|
#} |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
#print "Ref section:\n", $BIBL; |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# Change to utf8 |
|
239
|
|
|
|
|
|
|
#use utf8; |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
#### REMAINING FUNCTIONS ARE INTERNAL OR DEPRECATED #### |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
sub end_of_references {
    # Decide whether the string behind the given scalar reference runs past
    # the end of the reference section.  Returns 1 if so, 0 otherwise.
    # When an end marker is found the string is cut back, in place, to the
    # text in front of the marker.
    my $text_ref = shift;

    # A following-section heading ($RE_EOR) or an "Acknowledgements:" line.
    my $hit_heading = ( $$text_ref =~ /${RE_EOR}/im )
        || ( $$text_ref =~ /^\s*acknowledgements:/im );
    if( $hit_heading ) {
        # $-[0] is where the successful match began
        $$text_ref = substr($$text_ref, 0, $-[0]);
        return 1;
    }

    # Three or more line ends separated only by whitespace.
    if( $$text_ref =~ /(?:\s*\n){3,}/s ) {
        $$text_ref = substr($$text_ref, 0, $-[0]);
        return 1;
    }

    # An overlong chunk counts as the end too, but is left untouched.
    return 1 if length($$text_ref) > 1024;

    return 0;
}
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
sub style_numbered {
    # Split a reference section whose entries are numbered "1.", "2.", ...
    # Takes the section text and returns the list of references, each with
    # its "n." marker still in front and its whitespace normalised.
    my $text = shift;

    # The capturing group keeps the markers, so the list alternates between
    # markers and the text that follows them.
    my @REFS = split(/^\s*(\d+\.)/m, $text);

    # Throw away everything in front of the "1." marker.
    while( @REFS ) {
        last if $REFS[0] =~ /^\d+\./ && substr($REFS[0],0,-1) == 1;
        shift @REFS;
    }

    # Walk the even slots, where the next marker is expected.  Anything
    # there that is not the next number in sequence is glued onto the
    # preceding text instead.
    my $i = 2;
    while( $i < @REFS ) {
        my $is_marker   = $REFS[$i] =~ /^\d+\./;
        my $in_sequence = $is_marker && substr($REFS[$i],0,-1) == ($i/2)+1;

        if( $in_sequence ) {
            $i += 2;
        } else {
            $REFS[$i-1] .= splice(@REFS,$i,1);
        }

        # Once the text in front of $i looks like the end of the section,
        # drop whatever follows it.
        if( end_of_references(\$REFS[$i-1]) ) {
            splice(@REFS,$i);
        }
    }

    # Fold each marker and its text into one tidy string.
    my $idx = 0;
    while( $idx < @REFS ) {
        my $body = splice(@REFS,$idx+1,1);
        $REFS[$idx] .= $body;
        for my $entry ($REFS[$idx]) {
            $entry =~ s/\s+/ /sg;
            $entry =~ s/^\s+//;
            $entry =~ s/\s+$//;
        }
        $idx++;
    }

    @REFS;
}
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub style_numbered_square {
    # Split a reference section whose entries are numbered "[1]", "[2]", ...
    # Takes the section text and returns the list of reference strings with
    # the "[n]" markers removed, line breaks turned into spaces and
    # surrounding whitespace trimmed.  Returns an empty list when no
    # reference can be found.
    my $BIBL = shift;

    # Start at the first "[n]" marker anywhere in the text; without one
    # there is nothing to split.
    return () unless defined($BIBL) && $BIBL =~ /\[\d+\]/;

    # Split the bibliography.  The capturing group keeps the numbers, so the
    # list alternates number, text, number, text ...
    my @chunks = split(/^\s*\[(\d+)\]/m, substr($BIBL, $-[0]));
    return () unless @chunks;
    # The text starts with a marker, so split yields an empty leading field.
    shift @chunks unless length $chunks[0];

    # If there is a large reference it's probably the end of the
    # bibliography: truncate it and drop everything after it.  The first
    # ten chunks are exempt.
    for( my $i = 10; $i < @chunks; $i++ ) {
        if( length($chunks[$i]) > 1024 ) {
            splice(@chunks, $i+1);
            $chunks[$i] = substr($chunks[$i],0,1024) . " RUNAWAY_REFERENCE_DETECTED ";
            last;
        }
    }

    # Accept chunks whose number continues the sequence 1, 2, 3, ...  An
    # out-of-order chunk is taken to be part of the previous reference (a
    # wrapped line that happens to start with a citation), so it is appended
    # to it together with its marker.  Out-of-order chunks in front of "[1]"
    # have no reference to belong to and are dropped.
    my @REFS;
    my $last = 0;
    for( my $i = 0; $i < @chunks; $i += 2 ) {
        my $n    = $chunks[$i];
        # split drops a trailing empty field, so the final text may be missing
        my $text = defined $chunks[$i+1] ? $chunks[$i+1] : '';
        if( $n == $last+1 ) {
            push @REFS, $text;
            $last++;
        } elsif( @REFS ) {
            $REFS[-1] .= "[$n]" . $text;
        }
    }
    return () unless @REFS;

    # Presumably there is a gap between the last reference and any trailing junk
    $REFS[-1] =~ s/(\r?\n){2}.*//s;

    # Prettify the references
    for my $ref (@REFS) {
        $ref =~ s/[\r\n]+/ /g;
        $ref =~ s/^\s+//;
        $ref =~ s/\s+$//;
    }

    return @REFS;
}
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
sub style_years {
    # Split a reference section whose entries start with a list of author
    # names followed by a year, e.g. "Smith, J (1992)".  Takes the section
    # text and returns the list of reference strings with whitespace
    # normalised, or an empty list (after a warning) when the text cannot
    # be split.
    my $BIBL = shift;

    $BIBL =~ s/^\s+//sg;

    # Convert very long lines of spaces into a return
    $BIBL =~ s/ {70} */\n/sg;

    my @REFS;

    # Lets try splitting on a blank line.  The capturing group keeps the
    # blank separators as list elements.
    @REFS = split(/((?:\s*\n){2})/, $BIBL);

    # Drop leading chunks that do not look like "names ... year".
    shift @REFS while (@REFS && $REFS[0] !~ /^${RE_NAME_LIST_CHARS}+\d{4}\D/);

    # That didn't work, lets split on left-aligned things (where the next line(s) are blank or indented)
    if( !@REFS || length($REFS[0]) > 300 ) {
        @REFS = split(/\n[ ]{0,2}((?:(?:\S${RE_NAME_LIST_CHARS}{10,})|${RE_NAME}[^\d\-])\d{4}[^\d\-][^\n]+)/, $BIBL);
        shift @REFS while (@REFS && $REFS[0] !~ /^${RE_NAME_LIST_CHARS}{10,}\d{4}\D/s);

        for( my $i = 1; $i < @REFS; $i++ ) {
            if( end_of_references(\$REFS[$i]) ) {
                splice(@REFS,$i+1);
            # Indented: a line starting with five spaces or a tab.  The
            # alternation is grouped so that the line-start anchor applies
            # to the tab as well.
            } elsif( $REFS[$i] =~ /^\s*(?: {5}|\t)/m ) {
                $REFS[$i-1] .= splice(@REFS,$i,1);
                # stay on this index: the next chunk has moved into it
                $i--;
            }
        }
    } else {
        for( my $i = 1; $i < @REFS; $i++ ) {
            if( end_of_references(\$REFS[$i]) ) {
                splice(@REFS,$i+1);
            }
        }
    }

    unless( @REFS ) {
        warn "Unable to split year-based references\n";
        return ();
    }

    # Remove heavily indented lines following a blank line
    for( my $i = 1; $i < @REFS; $i++ ) {
        if( $REFS[$i-1] !~ /\S/ && $REFS[$i] =~ /^\s{40}/ ) {
            splice(@REFS,$i,1);
            $i--;
        }
    }

    # Join refs with the previous reference if they are very short or are quite short and don't start with ...(year)
    for( my $i = 1; $i < @REFS; $i++ ) {
        my $compact = $REFS[$i];
        $compact =~ s/\s+//sg;
        if( (length($compact) < 30) ||
            (length($compact) < 50 && $REFS[$i] !~ /^${RE_NAME_LIST_CHARS}{10,40}[^\d\-](\d{4})[^\d\-]/s) ) {
            $REFS[$i-1] .= $REFS[$i];
            splice(@REFS,$i,1);
            $i--;
        }
    }

    # If we find 3 sequential references without years near the beginning we probably have trailing garbage.
    # The first ten entries are never examined.  The century alternation is
    # grouped so that "near the beginning" applies to 20xx as well as 19xx.
    my $lc = 0;
    for( my $i = 10; $i < @REFS; $i++ ) {
        if( $REFS[$i] =~ /^\D{10,50}(?:19|20)\d{2}/s ) {
            $lc = 0;
        } else {
            $lc++;
        }
        if( $lc == 3 ) {
            # cut the three year-less entries and everything after them
            splice(@REFS,$i-2);
            last;
        }
    }

    # Remove lines without any numbers that are quite long (excluding spaces)
    for( my $i = 0; $i < @REFS; $i++ ) {
        my $compact = $REFS[$i];
        $compact =~ s/\s+//sg;
        if( length($compact) > 100 && $REFS[$i] !~ /\d/ ) {
            splice(@REFS,$i,1);
            # re-examine the entry that moved into this slot
            $i--;
        }
    }

    # Prettify
    for my $ref (@REFS) {
        $ref =~ s/\s+/ /sg;
        $ref =~ s/^\s+//;
        $ref =~ s/\s+$//s;
    }

    return @REFS;
}
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
sub header_to_regexp {
    # Turn a header/footer line into the source text of a regular
    # expression that matches the same line with different spacing and
    # different numbers (page numbers, for instance).
    # Takes the line and returns the pattern as a plain string.
    my $header = shift;

    # Escape regexp metacharacters, including the ^ and $ anchors, so that
    # the text is matched literally.
    $header =~ s/([\\\|\(\)\[\]\.\*\+\?\{\}\^\$])/\\$1/g;

    # Any run of whitespace matches any run of whitespace ...
    $header =~ s/\s+/\\s+/g;
    # ... and any run of digits matches any run of digits.
    $header =~ s/\d+/\\d+/g;

    return $header;
}
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
#sub unicode_string { |
|
475
|
|
|
|
|
|
|
# $_ = shift(); |
|
476
|
|
|
|
|
|
|
# s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg; |
|
477
|
|
|
|
|
|
|
# s/([\x80-\xff])/sprintf("%04x;",ord($1))/seg; |
|
478
|
|
|
|
|
|
|
# return $_; |
|
479
|
|
|
|
|
|
|
#} |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
1; |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
__END__ |