blib/lib/Biblio/Citation/Parser/Jiao/Utility.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 107 | 122 | 87.7 |
branch | 10 | 22 | 45.4 |
condition | 0 | 3 | 0.0 |
subroutine | 10 | 12 | 83.3 |
pod | 0 | 9 | 0.0 |
total | 127 | 168 | 75.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Biblio::Citation::Parser::Jiao::Utility; | ||||||
2 | |||||||
3 | ###################################################################### | ||||||
4 | # | ||||||
5 | # ParaTools::Citation::Parser::Jiao::Utility; | ||||||
6 | # | ||||||
7 | ###################################################################### | ||||||
8 | # | ||||||
9 | # This file is part of ParaCite Tools | ||||||
10 | # Based on Zhuoan Jiao's (zj@ecs.soton.ac.uk) citation parser (available | ||||||
11 | # at http://arabica.ecs.soton.ac.uk/code/doc/ReadMe.html) | ||||||
12 | # | ||||||
13 | # The code is relatively unchanged, except to bring into compliance | ||||||
14 | # with the ParaCite metadata style, and to allow interoperability with | ||||||
15 | # the other parsers. | ||||||
16 | # | ||||||
17 | # Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. | ||||||
18 | # | ||||||
19 | # ParaTools is free software; you can redistribute it and/or modify | ||||||
20 | # it under the terms of the GNU General Public License as published by | ||||||
21 | # the Free Software Foundation; either version 2 of the License, or | ||||||
22 | # (at your option) any later version. | ||||||
23 | # | ||||||
24 | # ParaTools is distributed in the hope that it will be useful, | ||||||
25 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
26 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||||
27 | # GNU General Public License for more details. | ||||||
28 | # | ||||||
29 | # You should have received a copy of the GNU General Public License | ||||||
30 | # along with ParaTools; if not, write to the Free Software | ||||||
31 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||||||
32 | # | ||||||
33 | ###################################################################### | ||||||
34 | |||||||
35 | 1 | 1 | 5 | use strict; | |||
1 | 1 | ||||||
1 | 37 | ||||||
36 | 1 | 1 | 5 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); | |||
1 | 1 | ||||||
1 | 562 | ||||||
37 | |||||||
38 | require Exporter; | ||||||
39 | |||||||
40 | @ISA = qw(Exporter); | ||||||
41 | @EXPORT = qw(&normalisation &normalise_html &normalise_name | ||||||
42 | &normalise_date &normalise_journal &num_of_figures); | ||||||
43 | |||||||
44 | $VERSION = '0.01'; | ||||||
45 | |||||||
46 | # | ||||||
47 | # Normalisation utilities | ||||||
48 | # | ||||||
49 | sub normalisation { | ||||||
50 | 2 | 2 | 0 | 4 | my($Text) = @_; | ||
51 | # replace embedded '\n' with ' ' | ||||||
52 | 2 | 6 | $Text =~ s/^\s+//s; | ||||
53 | 2 | 10 | $Text =~ s/\s+$//s; | ||||
54 | 2 | 24 | $Text =~ s/\s+/ /g; # Use single space | ||||
55 | # while ($Text =~ /.+?\n.+?/g) { | ||||||
56 | # $Text =~ s/(.+?)\n(.+?)/$1 $2/ | ||||||
57 | # }; | ||||||
58 | 2 | 5 | $Text =~ s/``(.*?)''/"$1"/sg; # Replace ``A Paper Title'' with "A Paper Title" | ||||
59 | |||||||
60 | 2 | 17 | $Text =~ s/\s*-\s*/-/g; # remove space around '-' | ||||
61 | 2 | 4 | $Text =~ s/\s*'\s*/'/g; # remove space around ' | ||||
62 | 2 | 15 | $Text =~ s/\s*:\s*/:/g; # remove space around : | ||||
63 | 2 | 7 | $Text =~ s/\(\s+/\(/g; # ( 1998) ==> (1998) | ||||
64 | 2 | 9 | $Text =~ s/\s+\)/\)/g; # (1998 ) ==> (1998) | ||||
65 | # while ($Text =~/--/g) {$Text =~ s/--/-/}; # use single '-'. | ||||||
66 | 2 | 4 | $Text =~ s/--+/-/g; | ||||
67 | 2 | 4 | $Text =~ s/~//g; # remove '~' (e.g. C.~B.~Hanna) | ||||
68 | 2 | 13 | $Text =~ s/[,;\s]+$//; # remove last ',;\s' on a line | ||||
69 | # 'Nr.' caused error in 'find_jnl_name', i.e. it became as | ||||||
70 | # 'journal name' if not removed. (see: arXiv:quant-ph/9905016) | ||||||
71 | 2 | 6 | $Text =~ s/([,;])\s*Nr\.\s*(\d+[,;])/$1$2/i; | ||||
72 | # "[12] For example, R. Machleidt, ...Phys. Rep. 149, 1 (1987) | ||||||
73 | 2 | 5 | $Text =~ s/^([^a-z]+)for example\W+/$1/i; | ||||
74 | # '[18] *** G. Do Dang, ... (arXiv:nucl-th/9911081) | ||||||
75 | 2 | 4 | $Text =~ s/\*+//g; | ||||
76 | # Phys. Rev. D56 => Phys. Rev. D. 56 | ||||||
77 | 2 | 6 | $Text =~ s/phys.{1,6}rev.{1,6}([a-z])(\d+)/PHYS. REV. $1 $2/ig; | ||||
78 | # Physica 34 D => Physic D 34 | ||||||
79 | 2 | 5 | $Text =~ s/physica\s+(\d+)\s+([a-z])/PHYSICA $2 $1/ig; | ||||
80 | # Nucl. Phys. B567 => Nucl. Phys. B | ||||||
81 | 2 | 5 | $Text =~ s/nuc.{1,6}phys.{1,6}\s+([a-z])(\d+)/NUCL. PHYS. $1 $2/ig; | ||||
82 | 2 | 6 | return $Text; | ||||
83 | }; | ||||||
84 | |||||||
85 | sub chr_valid { | ||||||
86 | 0 | 0 | 0 | 0 | my $c = shift; | ||
87 | 0 | 0 | 0 | 0 | if( $c < 128 || $c > 255 ) { | ||
88 | 0 | 0 | return chr($c); | ||||
89 | } else { | ||||||
90 | 0 | 0 | return ' '; | ||||
91 | } | ||||||
92 | } | ||||||
93 | |||||||
94 | sub normalise_html { | ||||||
95 | 1 | 1 | 0 | 3 | my($Text) = @_; | ||
96 | |||||||
97 | 1 | 1 | 1047 | use utf8; | |||
1 | 12 | ||||||
1 | 12 | ||||||
98 | |||||||
99 | # remove tag |
||||||
100 | 1 | 3 | $Text =~ s/ //ig; |
||||
101 | # Convert HTML entities to Unicode | ||||||
102 | 1 | 3 | $Text =~ s/\(\w+);/chr_valid(hex($1))/eg; | ||||
0 | 0 | ||||||
103 | 1 | 3 | $Text =~ s/\(\d+);/chr_valid($1)/eg; | ||||
0 | 0 | ||||||
104 | 1 | 2 | $Text =~ s/&(\w)acute[;,]/$1/g; # a ' on top of (\w) | ||||
105 | 1 | 4 | $Text =~ s/&(\w)cedil[;,]/$1/g; # a 'tail' under (\w), e.g Francios | ||||
106 | 1 | 2 | $Text =~ s/&(\w)grave[;,]/$1/g; # a ` on top of (\w) | ||||
107 | 1 | 2 | $Text =~ s/&(\w)tilde[;,]/$1/g; | ||||
108 | 1 | 4 | $Text =~ s/&(\w)uml[;,]/$1e/g; # a '..' on top of (\w) | ||||
109 | 1 | 3 | $Text =~ s/&(\w)slash[;,]/$1/g; | ||||
110 | #$Text =~ s/-88;(\w)/$1/g; # as &(\w)uml (see astro-ph/9811179) | ||||||
111 | 1 | 3 | $Text =~ s/-\d+;\s*(\w)/$1/g; | ||||
112 | 1 | 2 | $Text =~ s/ß[;,]/ss/g; | ||||
113 | 1 | 3 | $Text =~ s/&[;,]/ and /g; | ||||
114 | 1 | 3 | $Text =~ s/\s*(\w)/$1/g; # a ~ on top of (\w) | ||||
115 | 1 | 2 | $Text =~ s/\/?i>//g; # cogprints | ||||
116 | 1 | 3 | $Text =~ s/&[a-z]+;//g; # otherwise ';' cause ref line break. | ||||
117 | # $Text =~ s/\d+;//g; # e.g. '' in 'hep-th/0001001 [99]'. | ||||||
118 | 1 | 3 | $Text =~ s/\\"(\w)/$1e/g; # G\"unter => Gueter | ||||
119 | |||||||
120 | 1 | 3 | $Text =~ s/\^//g; # remove ^ | ||||
121 | 1 | 3 | $Text =~ s/([A-Z])\s*&\s*([A-Z])/$1 and $2/g; # replace '&' with 'and' | ||||
122 | 1 | 3 | $Text =~ s/[, ]+& / and /; | ||||
123 | # remove HTML markups ( etc.) | ||||||
124 | 1 | 3 | $Text =~ s/<[a-z\/]{1,3}>//ig; | ||||
125 | 1 | 3 | return $Text | ||||
126 | } | ||||||
127 | |||||||
128 | |||||||
129 | sub normalise_name { | ||||||
130 | 1 | 1 | 0 | 2 | my($Text) = @_; | ||
131 | 1 | 3 | my $Suffix = ''; | ||||
132 | 1 | 3 | $Text =~ s/~//; # remove typo | ||||
133 | # Jr. | ||||||
134 | 1 | 50 | 11 | if ($Text =~ s/[, \.]+(Jr|Sr|Snr)\.?\s*$//i){ | |||
50 | |||||||
135 | 0 | 0 | $Suffix = $1 | ||||
136 | } | ||||||
137 | elsif ($Text =~ s/([, \.]+)(Jr|Sr|Snr)[. ]/$1/i){ | ||||||
138 | 0 | 0 | $Suffix = $2 | ||||
139 | }; | ||||||
140 | |||||||
141 | # van der Buren D => D van der Buren" | ||||||
142 | 1 | 50 | 7 | if ($Text =~ /^\s*(((van|von|de|den|der)\s+)+)(\S\S+)\s+(.+)/i) { | |||
143 | 0 | 0 | $Text = "$5 $1 $4" | ||||
144 | }; | ||||||
145 | 1 | 5 | $Text =~ s/\s+/ /g; # single space | ||||
146 | 1 | 3 | $Text =~ s/^\W+//; | ||||
147 | 1 | 4 | $Text =~ s/\s+$//; | ||||
148 | # "A. Smith" => "A.Smith" | ||||||
149 | 1 | 2 | $Text =~ s/([a-z])s+\./$1\./ig; | ||||
150 | # Ghisellini G. A. ==> G.A. Ghisellini | ||||||
151 | # Konenkov D. Yu. => D.Yu. Konenkov | ||||||
152 | 1 | 50 | 6 | if ($Text =~ /^([^\s.]{2,})\s+(([A-Z][a-zA-Z]?\W+)*)([A-Z][a-zA-Z]?)\W*$/) { | |||
153 | 0 | 0 | $Text = "$2$4 $1" | ||||
154 | }; | ||||||
155 | |||||||
156 | 1 | 50 | 3 | $Text = "$Text $Suffix" if ($Suffix); | |||
157 | 1 | 3 | $Text = tdb_normalise_name($Text); | ||||
158 | 1 | 5 | return $Text; | ||||
159 | }; | ||||||
160 | |||||||
161 | # Based on Tim's Authors::splitauthors and Authors::_cmonauthor(); | ||||||
162 | # This subroutine is called simply because we want the author names | ||||||
163 | # to be transformed to a same style used by Tim's programs. Otherwise | ||||||
164 | # a join on author names in Publication and Reference tables will | ||||||
165 | # miss a lot of targets. | ||||||
166 | sub tdb_normalise_name{ | ||||||
167 | 1 | 1 | 0 | 3 | my $author = shift; | ||
168 | |||||||
169 | # Strip any brackets | ||||||
170 | 1 | 2 | $author =~ s/\s*\([^\)]*\)\s*//g; | ||||
171 | # Get rid of any dashes (except for dashes like Hu-Su) | ||||||
172 | 1 | 3 | $author =~ s/(\W)-/$1/g; | ||||
173 | # Remove any "the"s, e.g. The OPAL Collaboration | ||||||
174 | 1 | 845 | $author =~ s/\bthe\s+//ig; | ||||
175 | # Sort out Jr/Jr. | ||||||
176 | 1 | 5 | $author =~ s/,?\sJr\.?/_Jr/ig; | ||||
177 | |||||||
178 | 1 | 3 | $author =~ s/[\{\}]//g; # Remove any specialisations | ||||
179 | 1 | 2 | $author =~ s/\\.//g; # Remove any escapes | ||||
180 | # Convert Convert Hawking S. to S.Hawking (already done - zj) | ||||||
181 | # ($author =~ /(\w\w+)\s+([\.\w\s]+\.)$/) && ($author = $2.$1); | ||||||
182 | 1 | 5 | $author =~ s/\.\s/\./g; # Convert S.W. Hawking to S.W.Hawking | ||||
183 | 1 | 11 | $author =~ s/([A-Z])\s/$1\./g; # Convert S W Hawking to S.W.Hawking | ||||
184 | 1 | 50 | 69 | ($author = lc($author)) && ($author =~ s/\b(\w)/\U$1/g); | |||
185 | # Convert STEPHEN_HAWKING to Stephen_Hawking | ||||||
186 | 1 | 3 | $author =~ s/\s/_/g; # Convert Stephen W.Hawking to Stephen_W.Hawking | ||||
187 | 1 | 3 | $author =~ s/\.\.+/\./g; # Remove double dots | ||||
188 | 1 | 4 | return $author; | ||||
189 | } | ||||||
190 | |||||||
191 | |||||||
192 | sub normalise_date { | ||||||
193 | 1 | 1 | 0 | 2 | my($Text) = @_; | ||
194 | |||||||
195 | # 12-14 Dec. | ||||||
196 | 1 | 12 | $Text =~ s/[^\w\/][0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\s+ | ||||
197 | (Jan[\.\s]|January\b|Feb[\.\s]|February\b| | ||||||
198 | Mar[\.\s]|March\b|Apr[\.\s]|April\b|May| | ||||||
199 | Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b| | ||||||
200 | Sep[\.\s]|September\b|Oct[\.\s]|October\b| | ||||||
201 | Nov[\.\s]|November\b|Dec[\.\s]|December\b)//xig; | ||||||
202 | |||||||
203 | # Dec 12-14 | ||||||
204 | 1 | 11 | $Text =~ s/(Jan[\.\s]|January\b|Feb[\.\s]|February\b| | ||||
205 | Mar[\.\s]|March\b|Apr[\.\s]|April\b|May| | ||||||
206 | Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b| | ||||||
207 | Sep[\.\s]|September\b|Oct[\.\s]|October\b| | ||||||
208 | Nov[\.\s]|November\b|Dec[\.\s]|December\b) | ||||||
209 | [^\w\/]*[0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\b//xig; | ||||||
210 | |||||||
211 | 1 | 3 | return $Text; | ||||
212 | }; | ||||||
213 | |||||||
214 | |||||||
215 | sub normalise_journal { | ||||||
216 | 1 | 1 | 0 | 5 | my($Text) = @_; | ||
217 | 1 | 3 | $Text =~ s/^in\s+//i; # "in ..." | ||||
218 | 1 | 4 | $Text =~ s/^(see )?also\s+//i; # "also ..."; | ||||
219 | 1 | 3 | $Text =~ s/\s*:\s*/:/g; | ||||
220 | 1 | 3 | $Text =~ s/\s\s/ /g; # single space; | ||||
221 | 1 | 3 | $Text =~ s/\.\s/\./g; # "J. Physics" => "J.Physics" | ||||
222 | 1 | 2 | $Text =~ s/\.\(/\. \(/g; | ||||
223 | |||||||
224 | 1 | 3 | $Text =~ s/^\W+//; | ||||
225 | 1 | 5 | $Text =~ s/[^\w.]+$//; | ||||
226 | |||||||
227 | # remove anything in brackets at the end. | ||||||
228 | 1 | 2 | $Text =~ s/\s*\([^)]+$//; # e.g. R. Ram, J. Phys. (Paris | ||||
229 | 1 | 3 | $Text =~ s/^[^(]+\)\s*//; # e.g. a) R. Ram, J. Phys. 10, 120, 1998 | ||||
230 | |||||||
231 | # unify cases | ||||||
232 | #($Text = lc($Text)) && ($Text =~ s/\b(\w)/\U$1/g); | ||||||
233 | 1 | 2 | $Text = uc($Text); | ||||
234 | 1 | 5 | return $Text; | ||||
235 | }; | ||||||
236 | |||||||
237 | |||||||
238 | sub num_of_figures { | ||||||
239 | 1 | 1 | 0 | 2 | my($Text) = @_; | ||
240 | 1 | 3 | my($N, @Nlist); | ||||
241 | 1 | 2 | $N = 0; | ||||
242 | 1 | 2 | @Nlist =(); | ||||
243 | |||||||
244 | 1 | 3 | $Text = normalisation($Text); | ||||
245 | # e.g. "p. 24-26" regarded as one number. | ||||||
246 | # ignore 'N = 2' kind of equations, and | ||||||
247 | # ignore '25th' kinds (e.g. Proc. 25th ICRC). | ||||||
248 | # ignore 'protein Aquaporin-1 in ...' | ||||||
249 | # ignore 'hep-th/9901001' | ||||||
250 | # ignore ' ... 1.55, ...' | ||||||
251 | 1 | 15 | while ($Text =~ /(?:^|\b)[a-z]*(\d+)([a-z]*) | ||||
252 | (?:-[a-z]*\d+[a-z]*)* | ||||||
253 | (?:\b|$)/gix) { | ||||||
254 | 3 | 50 | 9 | next if ($2 =~ /^th$/i); | |||
255 | 3 | 50 | 11 | next if ($' =~ /^\.\d+/); | |||
256 | 3 | 50 | 9 | next if ($` =~ /\d+\.$/); | |||
257 | 3 | 50 | 11 | next if ($` =~ /[=<>\/-]\s*$/); | |||
258 | 3 | 36 | push(@Nlist, $1); | ||||
259 | }; | ||||||
260 | 1 | 2 | $N = scalar(@Nlist); | ||||
261 | 1 | 5 | return $N | ||||
262 | } | ||||||
263 | |||||||
264 | |||||||
265 | sub remove_extra_spc { | ||||||
266 | 0 | 0 | 0 | my($Text) = @_; | |||
267 | 0 | $Text =~ s/^\s+//; | |||||
268 | 0 | $Text =~ s/\s+$//; | |||||
269 | 0 | $Text =~ s/\s\s+/ /g; | |||||
270 | 0 | return $Text | |||||
271 | }; | ||||||
272 | |||||||
273 | 1; | ||||||
274 | |||||||
275 | __END__ |