line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Biblio::Document::Parser::Brody; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
###################################################################### |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Biblio::Document::Parser::Brody; |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
###################################################################### |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# Reference Parser by Tim Brody |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# This file is part of ParaCite Tools (http://paracite.eprints.org/developers/) |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# ParaTools is free software; you can redistribute it and/or modify |
16
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
17
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or |
18
|
|
|
|
|
|
|
# (at your option) any later version. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# ParaTools is distributed in the hope that it will be useful, |
21
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
22
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23
|
|
|
|
|
|
|
# GNU General Public License for more details. |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
26
|
|
|
|
|
|
|
# along with ParaTools; if not, write to the Free Software |
27
|
|
|
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
28
|
|
|
|
|
|
|
# |
29
|
|
|
|
|
|
|
###################################################################### |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=pod |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 NAME |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
Biblio::Document::Parser::Brody |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 DESCRIPTION |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
Module that parses reference strings from a document. Relies on a reference section starting with a title "References", "Bibliography", or "Cited". Separates references by prefixed number (e.g. "[1]" or "1.") or by year (e.g. "Smith, J (1992)"). |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head1 SYNOPSIS |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
use Biblio::Document::Parser::Brody; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
my $parser = Biblio::Document::Parser::Brody->new(); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
my @refs = $parser->parse(\*FILE_IO); |
48
|
|
|
|
|
|
|
my @refs = $parser->parse($str); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 METHODS |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut |
53
|
|
|
|
|
|
|
|
54
|
1
|
|
|
1
|
|
7028
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
55
|
|
|
|
|
|
|
|
56
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
60
|
|
57
|
1
|
|
|
1
|
|
4
|
use vars qw($DEBUG $RE_BOR $RE_EOR $RE_NAME_CHARS $RE_NAME $RE_NAME_LIST_CHARS $MAX_SIZE); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2963
|
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Set up the input/output appropriately |
60
|
|
|
|
|
|
|
#use open IN => ':encoding(latin1)', OUT => ':utf8'; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Upper bound on the number of bytes parse() reads from a file handle.
$MAX_SIZE = 1024*2000; # 2MB

# Beginning-of-references heading: a line holding only "References",
# "References Cited" or "Bibliography", optionally surrounded by non-letters
# (section numbers, punctuation).  The alternation is grouped so that BOTH
# anchors apply to every alternative, and /m is compiled into the qr//
# object itself: flags given at the point of use (/$RE_BOR/mi) do not reach
# inside an interpolated qr//, so without it ^ and $ would only match at the
# very start/end of the whole document.
$RE_BOR = qr/^[^a-z\n]*(?:references(?:\s+cited)?|bibliography)[^a-z\n]*$/im;

# End-of-references heading: an optionally numbered line that starts a
# section which normally follows the bibliography.  Grouped and /m-anchored
# for the same reasons as $RE_BOR, so the word "appendix" or "footnote"
# inside a reference no longer truncates it.
$RE_EOR = qr/^\s*(?:\d+\.?\s*)*(?:acknowledge?ment|footnote|appendix|abbreviation|glossary|figure)[^\n]{0,10}\s*$/im;

# Characters allowed inside a single surname.
$RE_NAME_CHARS = qr/[a-zA-Z`'\-]/;

# Characters allowed inside an author list ("Smith, J., Jones, A. & ...").
$RE_NAME_LIST_CHARS = qr/[a-zA-Z,\.;\(\)\-\s\&'`]/;

# A single "Surname, I. N." style name (surname of 4 to 7 characters).
$RE_NAME = qr/(?:[a-zA-Z`'\-]{4,7}, *(?:[a-zA-Z]\. *)+)/;
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=pod |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=over 4 |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item $p = Biblio::Document::Parser::Brody->new([-debug=>1]) |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Constructor method for class. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=cut |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Constructor.  Accepts an optional "-debug => BOOL" pair; the value is
# stored in the package-wide $DEBUG flag (not per object), so it is reset
# on every call.  Returns an empty hash blessed into $class.
sub new {
    my $class   = shift;
    my %options = @_;

    $DEBUG = $options{-debug};

    my $self = {};
    bless $self, $class;
    return $self;
}
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=pod |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item @refs = $p->parse($str) |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Parses a string $str and returns a list of unstructured reference strings. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=cut |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# Extract the list of reference strings from a document.
#
# Arguments: either a file handle reference (at most $MAX_SIZE bytes are
# read from it) or one or more strings, which are concatenated.
# Returns:   a list of reference strings, each prefixed with "[n] " where n
#            is its 1-based position; empty if no reference section could
#            be split.
# Croaks:    on a read error, or when there is no data at all.
sub parse {
    my $self = shift @_;
    my $arg  = shift @_;
    my $BIBL = '';

    # UNIVERSAL::isa($arg,"IO::Handle") doesn't work?
    if( ref($arg) ) {
        # read() returns undef on error but 0 at end-of-file; only the
        # former is an I/O failure.  An empty file falls through to the
        # "No data to parse" check below.
        defined( read($arg, $BIBL, $MAX_SIZE) )
            or croak "Error reading from file handle: $!\n";
    } else {
        $BIBL = join('', grep { defined } $arg, @_);
    }

    croak "No data to parse\n" unless length($BIBL);

    # Form feeds are page breaks: treat them as paragraph breaks.
    $BIBL =~ s/\f/\n\n/sg;

    # Collect candidate running headers/footers: a short line with blank
    # lines around it.  Each is generalised into a pattern so that, e.g.,
    # "Page 3" and "Page 4" count as the same header.
    my %HEADERS;
    while( $BIBL =~ /(?:\n[\r[:blank:]]*){2}([^\n]{0,40}\w+[^\n]{0,40})(?:\n[\r[:blank:]]*){3}/sg ) {
        $HEADERS{header_to_regexp($1)}++;
    }

    if( %HEADERS ) {
        # Only the most frequent candidate is considered, and only when it
        # occurs often enough to be a real running header.
        my @regexps = sort { $HEADERS{$b} <=> $HEADERS{$a} } keys %HEADERS;
        my $regexp = $regexps[0];
        if( $HEADERS{$regexp} > 3 ) {
            my $c = $BIBL =~ s/(?:\n[\r[:blank:]]*){2}(?:$regexp)(?:\n[\r[:blank:]]*){3}/\n\n/sg;
            warn "Applying regexp: $regexp ($HEADERS{$regexp} original matches) Removed $c header/footers using ($HEADERS{$regexp} original matches): $regexp\n" if $DEBUG;
        } else {
            warn "Not enough matching header/footers were found\n" if $DEBUG;
        }
    } else {
        warn "No header/footers were found\n" if $DEBUG;
    }

    my @REFS;

    # Attempt to find the reference section.  Each pass discards everything
    # up to and including the next heading, then probes the first 2k that
    # follows for one of the three supported layouts.
    while( !@REFS && ($BIBL =~ /$RE_BOR/mi) && ($BIBL = $') ) {
        my $buffer = substr($BIBL, 0, 2048);

        # Count the number of occurrences of [\d] over the next 2k of data.
        # Counting in list context avoids leaving pos($buffer) set, which an
        # early exit from a scalar //g loop would do and which would make
        # the next probe start in the middle of the buffer.
        my $c = () = $buffer =~ /^\s*\[\d+\]/mg;
        if( $c >= 5 ) {
            warn "Style = numbered square ([1])\n" if $DEBUG;
            last if (@REFS = style_numbered_square($BIBL));
        }

        # How about 1. notation
        $c = () = $buffer =~ /^\s*\d+\./mg;
        if( $c >= 5 ) {
            warn "Style = numbered (1.)\n" if $DEBUG;
            last if (@REFS = style_numbered($BIBL));
        }

        # Now we're getting desperate - hopefully it's a name list followed
        # by a year.  The century alternation is grouped; ungrouped, the
        # pattern would mean "name list ending in 19" OR "20xx anywhere".
        $c = () = $buffer =~ /^$RE_NAME_LIST_CHARS{10,40}[^\d\-](?:19|20)\d{2}[^\d\-]/mg;
        if( $c >= 5 ) {
            warn "Style = years\n" if $DEBUG;
            last if (@REFS = style_years($BIBL));
        }
    }

    # Number the surviving references; empty/undefined slots are left for
    # the final grep to discard.
    for( my $i = 0; $i < @REFS; $i++ ) {
        my $ref = $REFS[$i] or next;
        $REFS[$i] = "[" . ($i+1) . "] " . $ref;
    }

    return grep { defined($_) && length($_) } @REFS;
}
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
#my ($BIBL, $buffer); |
194
|
|
|
|
|
|
|
#$BIBL = ''; |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
#my $lc = 0; |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
#die "FATAL: Input has gone beyond $MAX_SIZE byte limit" if read(STDIN,$BIBL,$MAX_SIZE) == $MAX_SIZE; |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
#die "Empty input" unless length($BIBL); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
#while( read(STDIN,$buffer,4096) ) { |
203
|
|
|
|
|
|
|
# $BIBL .= $buffer; |
204
|
|
|
|
|
|
|
# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE; |
205
|
|
|
|
|
|
|
#} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
#while( <> ) { |
209
|
|
|
|
|
|
|
# s/\f/\n\n/sg; |
210
|
|
|
|
|
|
|
# $BIBL = $_ . $BIBL; |
211
|
|
|
|
|
|
|
# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE; |
212
|
|
|
|
|
|
|
# if( $_ =~ /^(?:\n\s*){3}/ ) { |
213
|
|
|
|
|
|
|
# # Regexp matches for the end of the string are *really* bad performance |
214
|
|
|
|
|
|
|
# # Lines are in reverse order! |
215
|
|
|
|
|
|
|
# if( $BIBL =~ /^(?:\n\s*){3}([^\n]{0,40}\w+[^\n]{0,40})(?:\n\s*){2}/os ) { |
216
|
|
|
|
|
|
|
# $HEADERS{header_to_regexp($1)}++; |
217
|
|
|
|
|
|
|
# } |
218
|
|
|
|
|
|
|
# } |
219
|
|
|
|
|
|
|
#} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# Put the lines back in-order |
222
|
|
|
|
|
|
|
#my @lines = split(/\n/,$BIBL); |
223
|
|
|
|
|
|
|
#$BIBL = ''; |
224
|
|
|
|
|
|
|
#for(@lines) { |
225
|
|
|
|
|
|
|
# $BIBL = $_ . "\n" . $BIBL; |
226
|
|
|
|
|
|
|
#} |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# Read in the document |
229
|
|
|
|
|
|
|
#while( read(STDIN,$buffer,4096) ) { |
230
|
|
|
|
|
|
|
# if( length($BIBL) > $MAX_SIZE ) { |
231
|
|
|
|
|
|
|
# die "FATAL: Input has gone beyond $MAX_SIZE Bytes limit\n"; |
232
|
|
|
|
|
|
|
# } |
233
|
|
|
|
|
|
|
# $BIBL .= $buffer; |
234
|
|
|
|
|
|
|
#} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
#print "Ref section:\n", $BIBL; |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# Change to utf8 |
239
|
|
|
|
|
|
|
#use utf8; |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
#### REMAINING FUNCTIONS ARE INTERNAL OR DEPRECATED #### |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# Decide whether the chunk behind the scalar reference $ref runs past the
# end of the bibliography.
#
# When an end-of-section heading ($RE_EOR or a line starting with
# "acknowledgements:") or a run of three or more blank lines is found, the
# referenced string is truncated IN PLACE to the text before it and 1 is
# returned.  A chunk longer than 1024 characters also returns 1, but is
# left untouched.  Otherwise returns 0.
sub end_of_references {
    my $ref = shift;

    my $found_heading = $$ref =~ /${RE_EOR}/im;
    $found_heading = $$ref =~ /^\s*acknowledgements:/im unless $found_heading;
    if( $found_heading ) {
        # $-[0] is the start offset of the last successful match, so this
        # keeps exactly the pre-match text.
        $$ref = substr($$ref, 0, $-[0]);
        return 1;
    }

    if( $$ref =~ /(?:\s*\n){3,}/s ) {
        $$ref = substr($$ref, 0, $-[0]);
        return 1;
    }

    return length($$ref) > 1024 ? 1 : 0;
}
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Split a bibliography whose entries are introduced by "1.", "2.", ... at
# the start of a line.  Takes the text following the section heading and
# returns the list of references, each still carrying its "n." prefix and
# with whitespace collapsed.
sub style_numbered {
    my $text = shift;

    # The capture keeps the "n." markers, so the list alternates
    # (leading text, "1.", body, "2.", body, ...).
    my @parts = split(/^\s*(\d+\.)/m, $text);

    # Drop everything in front of the "1." marker.
    while( @parts ) {
        last if $parts[0] =~ /^\d+\./ && substr($parts[0],0,-1) == 1;
        shift @parts;
    }

    # Markers sit at even indexes; the marker at index $idx must carry the
    # number ($idx/2)+1.  Anything else is folded into the preceding body.
    my $idx = 2;
    while( $idx < @parts ) {
        my $in_sequence = $parts[$idx] =~ /^\d+\./
            && substr($parts[$idx],0,-1) == ($idx/2)+1;

        if( $in_sequence ) {
            $idx += 2;
        } else {
            $parts[$idx-1] .= splice(@parts,$idx,1);
        }

        # Once a body runs into the end of the section, drop the rest.
        splice(@parts,$idx) if end_of_references(\$parts[$idx-1]);
    }

    # Glue each marker to its body and tidy the whitespace.
    for( my $k = 0; $k < @parts; $k++ ) {
        $parts[$k] .= splice(@parts,$k+1,1);
        for( $parts[$k] ) {
            s/\s+/ /sg;
            s/^\s+//;
            s/\s+$//;
        }
    }

    return @parts;
}
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
# Split a bibliography whose entries are introduced by "[1]", "[2]", ... at
# the start of a line.
#
# Takes the text following the section heading.  Returns the reference
# bodies in order, without their "[n]" markers, with line breaks collapsed
# to single spaces and surrounding whitespace trimmed.  Returns an empty
# list when no "[n]" marker is present or no "[1]" entry is found.
sub style_numbered_square {
    my $BIBL = shift;

    # Start at the first "[n]" marker; without one there is nothing to split.
    return () unless defined($BIBL) && $BIBL =~ /\[\d+\]/;
    my $section = substr($BIBL, $-[0]);

    # The capture keeps the numbers, so the list alternates
    # ("", n1, body1, n2, body2, ...).  The leading field is the (empty)
    # text in front of the first marker.
    my @chunks = split(/^\s*\[(\d+)\]/m, $section);
    shift @chunks if @chunks && !length($chunks[0]);

    # If there is a large reference it's probably the end of the
    # bibliography: truncate it, flag it and drop whatever follows.
    for my $i (10 .. $#chunks) {
        next unless length($chunks[$i]) > 1024;
        splice(@chunks, $i+1);
        $chunks[$i] = substr($chunks[$i],0,1024) . " RUNAWAY_REFERENCE_DETECTED ";
        last;
    }

    # Walk the (number, body) pairs.  A pair carrying the next expected
    # number starts a new reference; any other pair (typically a citation
    # such as "[7]" that happens to begin a wrapped line) is text belonging
    # to the reference before it.  Every pair is examined, so one stray
    # chunk cannot cause the following, valid reference to be merged too.
    my @refs;
    my $expected = 1;
    for( my $i = 0; $i < @chunks; $i += 2 ) {
        my $number = $chunks[$i];
        my $body   = defined $chunks[$i+1] ? $chunks[$i+1] : '';

        if( $number == $expected ) {
            push @refs, $body;
            $expected++;
        } elsif( @refs ) {
            $refs[-1] .= $body;
        }
        # A chunk in front of "[1]" has no reference to belong to: dropped.
    }

    return () unless @refs;

    # Presumably there is a gap between the last reference and any trailing junk
    $refs[-1] =~ s/(\r?\n){2}.*//s;

    # Prettify the references
    for my $ref (@refs) {
        $ref =~ s/[\r\n]+/ /sg;
        $ref =~ s/^\s+//sg;
        $ref =~ s/\s+$//sg;
    }

    return @refs;
}
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
# Split a bibliography whose entries start with an author list followed by
# a year ("Smith, J. (1992) ...").
#
# Takes the text following the section heading and returns the list of
# references with whitespace collapsed.  Two strategies are tried: split on
# blank lines, and, if that yields nothing or an implausibly long first
# entry, split on left-aligned "names ... year" lines.  Returns an empty
# list (after a warning) when neither works.
sub style_years {
    my $BIBL = shift;

    $BIBL =~ s/^\s+//sg;

    # Convert very long lines of spaces into a return
    $BIBL =~ s/ {70} */\n/sg;

    my @REFS;

    # Lets try splitting on a blank line
    @REFS = split(/((?:\s*\n){2})/, $BIBL);

    shift @REFS while (@REFS && $REFS[0] !~ /^$RE_NAME_LIST_CHARS+\d{4}\D/);

    # That didn't work, lets split on left-aligned things (where the next line(s) are blank or indented)
    if( !@REFS || length($REFS[0]) > 300 ) {
        @REFS = split(/\n[ ]{0,2}((?:(?:\S$RE_NAME_LIST_CHARS{10,})|$RE_NAME[^\d\-])\d{4}[^\d\-][^\n]+)/, $BIBL);
        shift @REFS while (@REFS && $REFS[0] !~ /^$RE_NAME_LIST_CHARS{10,}\d{4}\D/s);

        for( my $i = 1; $i < @REFS; $i++ ) {
            if( end_of_references(\$REFS[$i]) ) {
                splice(@REFS,$i+1);
            # Indented: a line beginning with five spaces or a tab continues
            # the previous entry.  The alternation is grouped so the line
            # anchor applies to the tab case too (ungrouped, a tab anywhere
            # in the chunk counted as "indented").
            } elsif( $REFS[$i] =~ /^\s*(?: {5}|\t)/m ) {
                $REFS[$i-1] .= splice(@REFS,$i,1);
                # The next chunk has moved into slot $i; look at it as well.
                $i--;
            }
        }
    } else {
        for( my $i = 1; $i < @REFS; $i++ ) {
            if( end_of_references(\$REFS[$i]) ) {
                splice(@REFS,$i+1);
            }
        }
    }

    unless( @REFS ) {
        warn "Unable to split year-based references\n";
        return ();
    }

    # Remove heavily indented lines following a blank line
    for( my $i = 1; $i < @REFS; $i++ ) {
        if( $REFS[$i-1] !~ /\S/ && $REFS[$i] =~ /^\s{40}/ ) {
            splice(@REFS,$i,1);
            $i--;
        }
    }

    # Join refs with the previous reference if they are very short or are quite short and don't start with ...(year)
    for( my $i = 1; $i < @REFS; $i++ ) {
        my $squeezed = $REFS[$i];
        $squeezed =~ s/\s+//sg;
        if( (length($squeezed) < 30) ||
            (length($squeezed) < 50 && $REFS[$i] !~ /^$RE_NAME_LIST_CHARS{10,40}[^\d\-](\d{4})[^\d\-]/s) ) {
            $REFS[$i-1] .= $REFS[$i];
            splice(@REFS,$i,1);
            $i--;
        }
    }

    # If we find 3 sequential references without years near the beginning we probably have trailing garbage.
    # The century alternation is grouped; ungrouped, any chunk containing
    # "20" followed by two digits anywhere counted as having a year.
    my $lc = 0;
    for( my $i = 10; $i < @REFS; $i++ ) {
        if( $REFS[$i] =~ /^\D{10,50}(?:19|20)\d{2}/s ) {
            $lc = 0;
        } else {
            $lc++;
        }
        if( $lc == 3 ) {
            splice(@REFS,$i-2);
        }
    }

    # Remove lines without any numbers that are quite long (excluding spaces)
    for( my $i = 0; $i < @REFS; $i++ ) {
        my $squeezed = $REFS[$i];
        $squeezed =~ s/\s+//sg;
        if( length($squeezed) > 100 && $REFS[$i] !~ /\d/ ) {
            splice(@REFS,$i,1);
            # Re-test the chunk that has just moved into this slot.
            $i--;
        }
    }

    # Prettify
    for my $ref (@REFS) {
        $ref =~ s/\s+/ /sg;
        $ref =~ s/^\s+//;
        $ref =~ s/\s+$//s;
    }

    return @REFS;
}
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# Turn a literal header/footer line into a pattern source string that also
# matches its siblings on other pages: regexp metacharacters are escaped,
# every run of whitespace becomes \s+ and every run of digits becomes \d+
# (so "Page 3" and "Page 12" map to the same pattern).
#
# Returns the pattern as a plain string, suitable for interpolation into a
# regexp and for use as a hash key.
sub header_to_regexp {
    my $header = shift;

    # "^" and "$" are escaped along with the other metacharacters;
    # otherwise a header containing them would turn into an anchor and
    # never match its own text.
    $header =~ s/([\\\|\(\)\[\]\.\*\+\?\{\}\^\$])/\\$1/g;
    $header =~ s/\s+/\\s+/g;
    $header =~ s/\d+/\\d+/g;

    return $header;
}
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
#sub unicode_string { |
475
|
|
|
|
|
|
|
# $_ = shift(); |
476
|
|
|
|
|
|
|
# s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg; |
477
|
|
|
|
|
|
|
# s/([\x80-\xff])/sprintf("%04x;",ord($1))/seg; |
478
|
|
|
|
|
|
|
# return $_; |
479
|
|
|
|
|
|
|
#} |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
1; |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
__END__ |