| blib/lib/Biblio/Citation/Parser/Jiao/Utility.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 107 | 122 | 87.7 |
| branch | 10 | 22 | 45.4 |
| condition | 0 | 3 | 0.0 |
| subroutine | 10 | 12 | 83.3 |
| pod | 0 | 9 | 0.0 |
| total | 127 | 168 | 75.6 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package Biblio::Citation::Parser::Jiao::Utility; | ||||||
| 2 | |||||||
| 3 | ###################################################################### | ||||||
| 4 | # | ||||||
| 5 | # ParaTools::Citation::Parser::Jiao::Utility; | ||||||
| 6 | # | ||||||
| 7 | ###################################################################### | ||||||
| 8 | # | ||||||
| 9 | # This file is part of ParaCite Tools | ||||||
| 10 | # Based on Zhuoan Jiao's (zj@ecs.soton.ac.uk) citation parser (available | ||||||
| 11 | # at http://arabica.ecs.soton.ac.uk/code/doc/ReadMe.html) | ||||||
| 12 | # | ||||||
| 13 | # The code is relatively unchanged, except to bring into compliance | ||||||
| 14 | # with the ParaCite metadata style, and to allow interoperability with | ||||||
| 15 | # the other parsers. | ||||||
| 16 | # | ||||||
| 17 | # Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. | ||||||
| 18 | # | ||||||
| 19 | # ParaTools is free software; you can redistribute it and/or modify | ||||||
| 20 | # it under the terms of the GNU General Public License as published by | ||||||
| 21 | # the Free Software Foundation; either version 2 of the License, or | ||||||
| 22 | # (at your option) any later version. | ||||||
| 23 | # | ||||||
| 24 | # ParaTools is distributed in the hope that it will be useful, | ||||||
| 25 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
| 26 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||||
| 27 | # GNU General Public License for more details. | ||||||
| 28 | # | ||||||
| 29 | # You should have received a copy of the GNU General Public License | ||||||
| 30 | # along with ParaTools; if not, write to the Free Software | ||||||
| 31 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||||||
| 32 | # | ||||||
| 33 | ###################################################################### | ||||||
| 34 | |||||||
| 35 | 1 | 1 | 5 | use strict; | |||
| 1 | 1 | ||||||
| 1 | 37 | ||||||
| 36 | 1 | 1 | 5 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); | |||
| 1 | 1 | ||||||
| 1 | 562 | ||||||
| 37 | |||||||
| 38 | require Exporter; | ||||||
| 39 | |||||||
| 40 | @ISA = qw(Exporter); | ||||||
| 41 | @EXPORT = qw(&normalisation &normalise_html &normalise_name | ||||||
| 42 | &normalise_date &normalise_journal &num_of_figures); | ||||||
| 43 | |||||||
| 44 | $VERSION = '0.01'; | ||||||
| 45 | |||||||
| 46 | # | ||||||
| 47 | # Normalisation utilities | ||||||
| 48 | # | ||||||
| 49 | sub normalisation { | ||||||
| 50 | 2 | 2 | 0 | 4 | my($Text) = @_; | ||
| 51 | # replace embedded '\n' with ' ' | ||||||
| 52 | 2 | 6 | $Text =~ s/^\s+//s; | ||||
| 53 | 2 | 10 | $Text =~ s/\s+$//s; | ||||
| 54 | 2 | 24 | $Text =~ s/\s+/ /g; # Use single space | ||||
| 55 | # while ($Text =~ /.+?\n.+?/g) { | ||||||
| 56 | # $Text =~ s/(.+?)\n(.+?)/$1 $2/ | ||||||
| 57 | # }; | ||||||
| 58 | 2 | 5 | $Text =~ s/``(.*?)''/"$1"/sg; # Replace ``A Paper Title'' with "A Paper Title" | ||||
| 59 | |||||||
| 60 | 2 | 17 | $Text =~ s/\s*-\s*/-/g; # remove space around '-' | ||||
| 61 | 2 | 4 | $Text =~ s/\s*'\s*/'/g; # remove space around ' | ||||
| 62 | 2 | 15 | $Text =~ s/\s*:\s*/:/g; # remove space around : | ||||
| 63 | 2 | 7 | $Text =~ s/\(\s+/\(/g; # ( 1998) ==> (1998) | ||||
| 64 | 2 | 9 | $Text =~ s/\s+\)/\)/g; # (1998 ) ==> (1998) | ||||
| 65 | # while ($Text =~/--/g) {$Text =~ s/--/-/}; # use single '-'. | ||||||
| 66 | 2 | 4 | $Text =~ s/--+/-/g; | ||||
| 67 | 2 | 4 | $Text =~ s/~//g; # remove '~' (e.g. C.~B.~Hanna) | ||||
| 68 | 2 | 13 | $Text =~ s/[,;\s]+$//; # remove last ',;\s' on a line | ||||
| 69 | # 'Nr.' caused error in 'find_jnl_name', i.e. it became as | ||||||
| 70 | # 'journal name' if not removed. (see: arXiv:quant-ph/9905016) | ||||||
| 71 | 2 | 6 | $Text =~ s/([,;])\s*Nr\.\s*(\d+[,;])/$1$2/i; | ||||
| 72 | # "[12] For example, R. Machleidt, ...Phys. Rep. 149, 1 (1987) | ||||||
| 73 | 2 | 5 | $Text =~ s/^([^a-z]+)for example\W+/$1/i; | ||||
| 74 | # '[18] *** G. Do Dang, ... (arXiv:nucl-th/9911081) | ||||||
| 75 | 2 | 4 | $Text =~ s/\*+//g; | ||||
| 76 | # Phys. Rev. D56 => Phys. Rev. D. 56 | ||||||
| 77 | 2 | 6 | $Text =~ s/phys.{1,6}rev.{1,6}([a-z])(\d+)/PHYS. REV. $1 $2/ig; | ||||
| 78 | # Physica 34 D => Physic D 34 | ||||||
| 79 | 2 | 5 | $Text =~ s/physica\s+(\d+)\s+([a-z])/PHYSICA $2 $1/ig; | ||||
| 80 | # Nucl. Phys. B567 => Nucl. Phys. B | ||||||
| 81 | 2 | 5 | $Text =~ s/nuc.{1,6}phys.{1,6}\s+([a-z])(\d+)/NUCL. PHYS. $1 $2/ig; | ||||
| 82 | 2 | 6 | return $Text; | ||||
| 83 | }; | ||||||
| 84 | |||||||
| 85 | sub chr_valid { | ||||||
| 86 | 0 | 0 | 0 | 0 | my $c = shift; | ||
| 87 | 0 | 0 | 0 | 0 | if( $c < 128 || $c > 255 ) { | ||
| 88 | 0 | 0 | return chr($c); | ||||
| 89 | } else { | ||||||
| 90 | 0 | 0 | return ' '; | ||||
| 91 | } | ||||||
| 92 | } | ||||||
| 93 | |||||||
| 94 | sub normalise_html { | ||||||
| 95 | 1 | 1 | 0 | 3 | my($Text) = @_; | ||
| 96 | |||||||
| 97 | 1 | 1 | 1047 | use utf8; | |||
| 1 | 12 | ||||||
| 1 | 12 | ||||||
| 98 | |||||||
| 99 | # remove tag |
||||||
| 100 | 1 | 3 | $Text =~ s/ //ig; |
||||
| 101 | # Convert HTML entities to Unicode | ||||||
| 102 | 1 | 3 | $Text =~ s/\(\w+);/chr_valid(hex($1))/eg; | ||||
| 0 | 0 | ||||||
| 103 | 1 | 3 | $Text =~ s/\(\d+);/chr_valid($1)/eg; | ||||
| 0 | 0 | ||||||
| 104 | 1 | 2 | $Text =~ s/&(\w)acute[;,]/$1/g; # a ' on top of (\w) | ||||
| 105 | 1 | 4 | $Text =~ s/&(\w)cedil[;,]/$1/g; # a 'tail' under (\w), e.g Francios | ||||
| 106 | 1 | 2 | $Text =~ s/&(\w)grave[;,]/$1/g; # a ` on top of (\w) | ||||
| 107 | 1 | 2 | $Text =~ s/&(\w)tilde[;,]/$1/g; | ||||
| 108 | 1 | 4 | $Text =~ s/&(\w)uml[;,]/$1e/g; # a '..' on top of (\w) | ||||
| 109 | 1 | 3 | $Text =~ s/&(\w)slash[;,]/$1/g; | ||||
| 110 | #$Text =~ s/-88;(\w)/$1/g; # as &(\w)uml (see astro-ph/9811179) | ||||||
| 111 | 1 | 3 | $Text =~ s/-\d+;\s*(\w)/$1/g; | ||||
| 112 | 1 | 2 | $Text =~ s/ß[;,]/ss/g; | ||||
| 113 | 1 | 3 | $Text =~ s/&[;,]/ and /g; | ||||
| 114 | 1 | 3 | $Text =~ s/\s*(\w)/$1/g; # a ~ on top of (\w) | ||||
| 115 | 1 | 2 | $Text =~ s/\/?i>//g; # cogprints | ||||
| 116 | 1 | 3 | $Text =~ s/&[a-z]+;//g; # otherwise ';' cause ref line break. | ||||
| 117 | # $Text =~ s/\d+;//g; # e.g. '' in 'hep-th/0001001 [99]'. | ||||||
| 118 | 1 | 3 | $Text =~ s/\\"(\w)/$1e/g; # G\"unter => Gueter | ||||
| 119 | |||||||
| 120 | 1 | 3 | $Text =~ s/\^//g; # remove ^ | ||||
| 121 | 1 | 3 | $Text =~ s/([A-Z])\s*&\s*([A-Z])/$1 and $2/g; # replace '&' with 'and' | ||||
| 122 | 1 | 3 | $Text =~ s/[, ]+& / and /; | ||||
| 123 | # remove HTML markups ( etc.) | ||||||
| 124 | 1 | 3 | $Text =~ s/<[a-z\/]{1,3}>//ig; | ||||
| 125 | 1 | 3 | return $Text | ||||
| 126 | } | ||||||
| 127 | |||||||
| 128 | |||||||
| 129 | sub normalise_name { | ||||||
| 130 | 1 | 1 | 0 | 2 | my($Text) = @_; | ||
| 131 | 1 | 3 | my $Suffix = ''; | ||||
| 132 | 1 | 3 | $Text =~ s/~//; # remove typo | ||||
| 133 | # Jr. | ||||||
| 134 | 1 | 50 | 11 | if ($Text =~ s/[, \.]+(Jr|Sr|Snr)\.?\s*$//i){ | |||
| 50 | |||||||
| 135 | 0 | 0 | $Suffix = $1 | ||||
| 136 | } | ||||||
| 137 | elsif ($Text =~ s/([, \.]+)(Jr|Sr|Snr)[. ]/$1/i){ | ||||||
| 138 | 0 | 0 | $Suffix = $2 | ||||
| 139 | }; | ||||||
| 140 | |||||||
| 141 | # van der Buren D => D van der Buren" | ||||||
| 142 | 1 | 50 | 7 | if ($Text =~ /^\s*(((van|von|de|den|der)\s+)+)(\S\S+)\s+(.+)/i) { | |||
| 143 | 0 | 0 | $Text = "$5 $1 $4" | ||||
| 144 | }; | ||||||
| 145 | 1 | 5 | $Text =~ s/\s+/ /g; # single space | ||||
| 146 | 1 | 3 | $Text =~ s/^\W+//; | ||||
| 147 | 1 | 4 | $Text =~ s/\s+$//; | ||||
| 148 | # "A. Smith" => "A.Smith" | ||||||
| 149 | 1 | 2 | $Text =~ s/([a-z])s+\./$1\./ig; | ||||
| 150 | # Ghisellini G. A. ==> G.A. Ghisellini | ||||||
| 151 | # Konenkov D. Yu. => D.Yu. Konenkov | ||||||
| 152 | 1 | 50 | 6 | if ($Text =~ /^([^\s.]{2,})\s+(([A-Z][a-zA-Z]?\W+)*)([A-Z][a-zA-Z]?)\W*$/) { | |||
| 153 | 0 | 0 | $Text = "$2$4 $1" | ||||
| 154 | }; | ||||||
| 155 | |||||||
| 156 | 1 | 50 | 3 | $Text = "$Text $Suffix" if ($Suffix); | |||
| 157 | 1 | 3 | $Text = tdb_normalise_name($Text); | ||||
| 158 | 1 | 5 | return $Text; | ||||
| 159 | }; | ||||||
| 160 | |||||||
| 161 | # Based on Tim's Authors::splitauthors and Authors::_cmonauthor(); | ||||||
| 162 | # This subroutine is called simply because we want the author names | ||||||
| 163 | # to be transformed to a same style used by Tim's programs. Otherwise | ||||||
| 164 | # a join on author names in Publication and Reference tables will | ||||||
| 165 | # miss a lot of targets. | ||||||
| 166 | sub tdb_normalise_name{ | ||||||
| 167 | 1 | 1 | 0 | 3 | my $author = shift; | ||
| 168 | |||||||
| 169 | # Strip any brackets | ||||||
| 170 | 1 | 2 | $author =~ s/\s*\([^\)]*\)\s*//g; | ||||
| 171 | # Get rid of any dashes (except for dashes like Hu-Su) | ||||||
| 172 | 1 | 3 | $author =~ s/(\W)-/$1/g; | ||||
| 173 | # Remove any "the"s, e.g. The OPAL Collaboration | ||||||
| 174 | 1 | 845 | $author =~ s/\bthe\s+//ig; | ||||
| 175 | # Sort out Jr/Jr. | ||||||
| 176 | 1 | 5 | $author =~ s/,?\sJr\.?/_Jr/ig; | ||||
| 177 | |||||||
| 178 | 1 | 3 | $author =~ s/[\{\}]//g; # Remove any specialisations | ||||
| 179 | 1 | 2 | $author =~ s/\\.//g; # Remove any escapes | ||||
| 180 | # Convert Convert Hawking S. to S.Hawking (already done - zj) | ||||||
| 181 | # ($author =~ /(\w\w+)\s+([\.\w\s]+\.)$/) && ($author = $2.$1); | ||||||
| 182 | 1 | 5 | $author =~ s/\.\s/\./g; # Convert S.W. Hawking to S.W.Hawking | ||||
| 183 | 1 | 11 | $author =~ s/([A-Z])\s/$1\./g; # Convert S W Hawking to S.W.Hawking | ||||
| 184 | 1 | 50 | 69 | ($author = lc($author)) && ($author =~ s/\b(\w)/\U$1/g); | |||
| 185 | # Convert STEPHEN_HAWKING to Stephen_Hawking | ||||||
| 186 | 1 | 3 | $author =~ s/\s/_/g; # Convert Stephen W.Hawking to Stephen_W.Hawking | ||||
| 187 | 1 | 3 | $author =~ s/\.\.+/\./g; # Remove double dots | ||||
| 188 | 1 | 4 | return $author; | ||||
| 189 | } | ||||||
| 190 | |||||||
| 191 | |||||||
| 192 | sub normalise_date { | ||||||
| 193 | 1 | 1 | 0 | 2 | my($Text) = @_; | ||
| 194 | |||||||
| 195 | # 12-14 Dec. | ||||||
| 196 | 1 | 12 | $Text =~ s/[^\w\/][0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\s+ | ||||
| 197 | (Jan[\.\s]|January\b|Feb[\.\s]|February\b| | ||||||
| 198 | Mar[\.\s]|March\b|Apr[\.\s]|April\b|May| | ||||||
| 199 | Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b| | ||||||
| 200 | Sep[\.\s]|September\b|Oct[\.\s]|October\b| | ||||||
| 201 | Nov[\.\s]|November\b|Dec[\.\s]|December\b)//xig; | ||||||
| 202 | |||||||
| 203 | # Dec 12-14 | ||||||
| 204 | 1 | 11 | $Text =~ s/(Jan[\.\s]|January\b|Feb[\.\s]|February\b| | ||||
| 205 | Mar[\.\s]|March\b|Apr[\.\s]|April\b|May| | ||||||
| 206 | Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b| | ||||||
| 207 | Sep[\.\s]|September\b|Oct[\.\s]|October\b| | ||||||
| 208 | Nov[\.\s]|November\b|Dec[\.\s]|December\b) | ||||||
| 209 | [^\w\/]*[0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\b//xig; | ||||||
| 210 | |||||||
| 211 | 1 | 3 | return $Text; | ||||
| 212 | }; | ||||||
| 213 | |||||||
| 214 | |||||||
| 215 | sub normalise_journal { | ||||||
| 216 | 1 | 1 | 0 | 5 | my($Text) = @_; | ||
| 217 | 1 | 3 | $Text =~ s/^in\s+//i; # "in ..." | ||||
| 218 | 1 | 4 | $Text =~ s/^(see )?also\s+//i; # "also ..."; | ||||
| 219 | 1 | 3 | $Text =~ s/\s*:\s*/:/g; | ||||
| 220 | 1 | 3 | $Text =~ s/\s\s/ /g; # single space; | ||||
| 221 | 1 | 3 | $Text =~ s/\.\s/\./g; # "J. Physics" => "J.Physics" | ||||
| 222 | 1 | 2 | $Text =~ s/\.\(/\. \(/g; | ||||
| 223 | |||||||
| 224 | 1 | 3 | $Text =~ s/^\W+//; | ||||
| 225 | 1 | 5 | $Text =~ s/[^\w.]+$//; | ||||
| 226 | |||||||
| 227 | # remove anything in brackets at the end. | ||||||
| 228 | 1 | 2 | $Text =~ s/\s*\([^)]+$//; # e.g. R. Ram, J. Phys. (Paris | ||||
| 229 | 1 | 3 | $Text =~ s/^[^(]+\)\s*//; # e.g. a) R. Ram, J. Phys. 10, 120, 1998 | ||||
| 230 | |||||||
| 231 | # unify cases | ||||||
| 232 | #($Text = lc($Text)) && ($Text =~ s/\b(\w)/\U$1/g); | ||||||
| 233 | 1 | 2 | $Text = uc($Text); | ||||
| 234 | 1 | 5 | return $Text; | ||||
| 235 | }; | ||||||
| 236 | |||||||
| 237 | |||||||
| 238 | sub num_of_figures { | ||||||
| 239 | 1 | 1 | 0 | 2 | my($Text) = @_; | ||
| 240 | 1 | 3 | my($N, @Nlist); | ||||
| 241 | 1 | 2 | $N = 0; | ||||
| 242 | 1 | 2 | @Nlist =(); | ||||
| 243 | |||||||
| 244 | 1 | 3 | $Text = normalisation($Text); | ||||
| 245 | # e.g. "p. 24-26" regarded as one number. | ||||||
| 246 | # ignore 'N = 2' kind of equations, and | ||||||
| 247 | # ignore '25th' kinds (e.g. Proc. 25th ICRC). | ||||||
| 248 | # ignore 'protein Aquaporin-1 in ...' | ||||||
| 249 | # ignore 'hep-th/9901001' | ||||||
| 250 | # ignore ' ... 1.55, ...' | ||||||
| 251 | 1 | 15 | while ($Text =~ /(?:^|\b)[a-z]*(\d+)([a-z]*) | ||||
| 252 | (?:-[a-z]*\d+[a-z]*)* | ||||||
| 253 | (?:\b|$)/gix) { | ||||||
| 254 | 3 | 50 | 9 | next if ($2 =~ /^th$/i); | |||
| 255 | 3 | 50 | 11 | next if ($' =~ /^\.\d+/); | |||
| 256 | 3 | 50 | 9 | next if ($` =~ /\d+\.$/); | |||
| 257 | 3 | 50 | 11 | next if ($` =~ /[=<>\/-]\s*$/); | |||
| 258 | 3 | 36 | push(@Nlist, $1); | ||||
| 259 | }; | ||||||
| 260 | 1 | 2 | $N = scalar(@Nlist); | ||||
| 261 | 1 | 5 | return $N | ||||
| 262 | } | ||||||
| 263 | |||||||
| 264 | |||||||
| 265 | sub remove_extra_spc { | ||||||
| 266 | 0 | 0 | 0 | my($Text) = @_; | |||
| 267 | 0 | $Text =~ s/^\s+//; | |||||
| 268 | 0 | $Text =~ s/\s+$//; | |||||
| 269 | 0 | $Text =~ s/\s\s+/ /g; | |||||
| 270 | 0 | return $Text | |||||
| 271 | }; | ||||||
| 272 | |||||||
| 273 | 1; | ||||||
| 274 | |||||||
| 275 | __END__ |