blib/lib/Biblio/Citation/Compare.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 126 | 153 | 82.3 |
branch | 59 | 102 | 57.8 |
condition | 49 | 84 | 58.3 |
subroutine | 16 | 18 | 88.8 |
pod | 2 | 10 | 20.0 |
total | 252 | 367 | 68.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Biblio::Citation::Compare; | ||||||
2 | |||||||
3 | 2 | 2 | 17254 | use 5.0; | |||
2 | 6 | ||||||
2 | 86 | ||||||
4 | 2 | 2 | 10 | use strict; | |||
2 | 2 | ||||||
2 | 69 | ||||||
5 | 2 | 2 | 9 | use warnings; | |||
2 | 5 | ||||||
2 | 75 | ||||||
6 | 2 | 2 | 1059 | use Text::LevenshteinXS qw(distance); | |||
2 | 34702 | ||||||
2 | 144 | ||||||
7 | 2 | 2 | 1245 | use HTML::Entities; | |||
2 | 706842 | ||||||
2 | 166 | ||||||
8 | 2 | 2 | 2717 | use Text::Names qw/samePerson cleanName parseName/; | |||
2 | 158631 | ||||||
2 | 269 | ||||||
9 | 2 | 2 | 20 | use utf8; | |||
2 | 3 | ||||||
2 | 15 | ||||||
10 | |||||||
11 | require Exporter; | ||||||
12 | |||||||
13 | our @ISA = qw(Exporter); | ||||||
14 | |||||||
15 | our %EXPORT_TAGS = ( 'all' => [ qw( | ||||||
16 | sameWork sameAuthors toString extractEdition | ||||||
17 | ) ] ); | ||||||
18 | |||||||
19 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | ||||||
20 | |||||||
21 | our @EXPORT = qw( ); | ||||||
22 | |||||||
23 | our $VERSION = '0.4'; | ||||||
24 | |||||||
25 | # to correct bogus windows entities. unfixable ones are converted to spaces. | ||||||
26 | my %WIN2UTF = ( | ||||||
27 | hex('80')=> hex('20AC'),# #EURO SIGN | ||||||
28 | hex('81')=> hex('0020'), #UNDEFINED | ||||||
29 | hex('82')=> hex('201A'),# #SINGLE LOW-9 QUOTATION MARK | ||||||
30 | hex('83')=> hex('0192'),# #LATIN SMALL LETTER F WITH HOOK | ||||||
31 | hex('84')=> hex('201E'),# #DOUBLE LOW-9 QUOTATION MARK | ||||||
32 | hex('85')=> hex('2026'),# #HORIZONTAL ELLIPSIS | ||||||
33 | hex('86')=> hex('2020'),# #DAGGER | ||||||
34 | hex('87')=> hex('2021'),# #DOUBLE DAGGER | ||||||
35 | hex('88')=> hex('02C6'),# #MODIFIER LETTER CIRCUMFLEX ACCENT | ||||||
36 | hex('89')=> hex('2030'),# #PER MILLE SIGN | ||||||
37 | hex('8A')=> hex('0160'),# #LATIN CAPITAL LETTER S WITH CARON | ||||||
38 | hex('8B')=> hex('2039'),# #SINGLE LEFT-POINTING ANGLE QUOTATION MARK | ||||||
39 | hex('8C')=> hex('0152'),# #LATIN CAPITAL LIGATURE OE | ||||||
40 | hex('8D')=> hex('0020'),# #UNDEFINED | ||||||
41 | hex('8E')=> hex('017D'),# #LATIN CAPITAL LETTER Z WITH CARON | ||||||
42 | hex('8F')=> hex('0020'),# #UNDEFINED | ||||||
43 | hex('90')=> hex('0020'),# #UNDEFINED | ||||||
44 | hex('91')=> hex('2018'),# #LEFT SINGLE QUOTATION MARK | ||||||
45 | hex('92')=> hex('2019'),# #RIGHT SINGLE QUOTATION MARK | ||||||
46 | hex('93')=> hex('201C'),# #LEFT DOUBLE QUOTATION MARK | ||||||
47 | hex('94')=> hex('201D'),# #RIGHT DOUBLE QUOTATION MARK | ||||||
48 | hex('95')=> hex('2022'),# #BULLET | ||||||
49 | hex('96')=> hex('2013'),# #EN DASH | ||||||
50 | hex('97')=> hex('2014'),# #EM DASH | ||||||
51 | hex('98')=> hex('02DC'),# #SMALL TILDE | ||||||
52 | hex('99')=> hex('2122'),# #TRADE MARK SIGN | ||||||
53 | hex('9A')=> hex('0161'),# #LATIN SMALL LETTER S WITH CARON | ||||||
54 | hex('9B')=> hex('203A'),# #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK | ||||||
55 | hex('9C')=> hex('0153'),# #LATIN SMALL LIGATURE OE | ||||||
56 | hex('9D')=> hex('0020'),# #UNDEFINED | ||||||
57 | hex('9E')=> hex('017E'),# #LATIN SMALL LETTER Z WITH CARON | ||||||
58 | hex('9F')=> hex('0178')# #LATIN CAPITAL LETTER Y WITH DIAERESIS | ||||||
59 | ); | ||||||
60 | my $PARENS = '\s*[\[\(](.+?)[\]\)]\s*'; | ||||||
61 | my $QUOTE = '"“”`¨´‘’‛“”‟„′″‴‵‶‷⁗❛❜❝❞'; | ||||||
62 | my @ED_RES = ( | ||||||
63 | '(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)', | ||||||
64 | '([1-9])\s?\w{2,5}\s[ée]d', | ||||||
65 | '\bv\.?(?:ersion)?\s?([0-9IXV]+)', | ||||||
66 | '\s([IXV0-9]+)(?:$|:)' | ||||||
67 | ); | ||||||
68 | |||||||
69 | #die "no" unless "2nd edition" =~ /$EDITION/i; | ||||||
70 | |||||||
71 | #my $TITLE_SPLIT = '(?:\?|\:|\.|!|\"|[$QUOTE]\b)'; | ||||||
72 | my $TITLE_SPLIT = '(?:\?|\:|\.|!)'; | ||||||
73 | |||||||
74 | sub sameAuthors { | ||||||
75 | 42 | 42 | 1 | 2171 | my ($list1, $list2) = @_; | ||
76 | #return 0 if $#$list1 != $#$list2; | ||||||
77 | 42 | 100 | 108 | if ($#$list2 > $#$list1) { | |||
78 | 6 | 11 | my $t = $list1; | ||||
79 | 6 | 9 | $list1 = $list2; | ||||
80 | 6 | 7 | $list2 = $t; | ||||
81 | } | ||||||
82 | 42 | 106 | for (my $i = 0; $i <= $#$list2; $i++) { | ||||
83 | 49 | 100 | 1939 | return 0 unless grep { samePerson($list2->[$i],$_) } @$list1; | |||
87 | 13753 | ||||||
84 | } | ||||||
85 | 40 | 15928 | return 1; | ||||
86 | } | ||||||
87 | |||||||
88 | sub firstAuthor { | ||||||
89 | 0 | 0 | 0 | 0 | my $e = shift; | ||
90 | 0 | 0 | my $a = $e->{authors}; | ||||
91 | 0 | 0 | 0 | if ($#$a > -1) { | |||
92 | 0 | 0 | return $a->[0]; | ||||
93 | } else { | ||||||
94 | 0 | 0 | return undef; | ||||
95 | } | ||||||
96 | } | ||||||
97 | |||||||
98 | sub sameWork { | ||||||
99 | |||||||
100 | 36 | 36 | 1 | 14467 | my $debug = 0; | ||
101 | |||||||
102 | 36 | 55 | my ($e, $c, $threshold,$loose,$nolinks) = @_; | ||||
103 | 36 | 50 | 98 | $loose = 0 unless defined $loose; | |||
104 | 36 | 50 | 71 | $threshold = 0.15 unless $threshold; | |||
105 | |||||||
106 | 36 | 50 | 71 | if ($debug) { | |||
107 | 0 | 0 | warn "sameEntry 1: " . toString($e); | ||||
108 | 0 | 0 | warn "sameEntry 2: " . toString($c); | ||||
109 | } | ||||||
110 | |||||||
111 | 36 | 0 | 33 | 93 | if (defined $e->{doi} and length $e->{doi} and defined $c->{doi} and length $c->{doi}) { | ||
33 | |||||||
0 | |||||||
112 | 0 | 0 | 0 | return 1 if $e->{doi} eq $c->{doi}; | |||
113 | } | ||||||
114 | |||||||
115 | 36 | 50 | 71 | return 0 if (!$c); | |||
116 | |||||||
117 | # normalize encoding of relevant fields | ||||||
118 | 36 | 74 | local $e->{title} = decodeHTMLEntities($e->{title}); | ||||
119 | 36 | 61 | local $c->{title} = decodeHTMLEntities($c->{title}); | ||||
120 | |||||||
121 | # first check if authors,date, and title are almost literally the same | ||||||
122 | 36 | 100 | 126 | my $tsame = (lc $e->{title} eq lc $c->{title}) ? 1 : 0; | |||
123 | 36 | 63 | my $asame = sameAuthors($e->{authors},$c->{authors}); | ||||
124 | 36 | 100 | 100 | 299 | my $dsame = (defined $e->{date} and defined $c->{date} and $e->{date} eq $c->{date}) ? 1 : 0; | ||
125 | |||||||
126 | 36 | 50 | 62 | if ($debug) { | |||
127 | 0 | 0 | warn "tsame: $tsame"; | ||||
128 | 0 | 0 | warn "asame: $asame"; | ||||
129 | 0 | 0 | warn "dsame: $dsame"; | ||||
130 | } | ||||||
131 | |||||||
132 | 36 | 50 | 66 | 133 | return 1 if ($tsame and $asame and $dsame); | ||
66 | |||||||
133 | |||||||
134 | # if authors quite different, not same | ||||||
135 | 29 | 50 | 48 | if (!$asame) { | |||
136 | #print "$lname1, $lname2 "; |
||||||
137 | #print my_dist_text($lname1,$lname2); | ||||||
138 | 0 | 0 | 0 | warn "authors too different" if $debug; | |||
139 | 0 | 0 | return 0; | ||||
140 | } | ||||||
141 | |||||||
142 | 29 | 50 | 60 | warn "pre title length" if $debug; | |||
143 | # if title very different in lengths and do not contain ":" or brackets, not the same | ||||||
144 | 29 | 50 | 100 | 299 | return 0 if !$tsame and ( | ||
100 | |||||||
33 | |||||||
66 | |||||||
33 | |||||||
145 | abs(length($e->{title}) - length($c->{title})) > 20 | ||||||
146 | and | ||||||
147 | ($e->{title} !~ /$TITLE_SPLIT/ and $c->{title} !~ /$TITLE_SPLIT/) | ||||||
148 | and | ||||||
149 | ($e->{title} !~ /$PARENS/ and $c->{title} !~ /$PARENS/) | ||||||
150 | ); | ||||||
151 | |||||||
152 | # Compare links | ||||||
153 | 29 | 50 | 71 | if (!$nolinks) { | |||
154 | 29 | 27 | foreach my $l (@{$e->{links}}) { | ||||
29 | 87 | ||||||
155 | # print "Links e:\n" . join("\n",$e->getLinks); | ||||||
156 | # print "Links c:\n" . join("\n",$c->getLinks); | ||||||
157 | 0 | 0 | 0 | return 1 if grep { $l eq $_} @{$c->{links}}; | |||
0 | 0 | ||||||
0 | 0 | ||||||
158 | } | ||||||
159 | } | ||||||
160 | |||||||
161 | # check dates | ||||||
162 | 29 | 37 | my $compat_dates = $dsame; | ||||
163 | 29 | 50 | 66 | 182 | if (!$dsame and defined $e->{date} and defined $c->{date} and $e->{date} =~ /^\d\d\d\d$/ and $c->{date} =~ /^\d\d\d\d$/ ) { | ||
100 | |||||||
66 | |||||||
66 | |||||||
164 | |||||||
165 | 7 | 9 | $compat_dates = 0; | ||||
166 | #disabled for most cases because we want to conflate editions and republications for now. | ||||||
167 | 7 | 50 | 33 | 67 | if ($e->{title} =~ /^Introduction.?$/ or $e->{title} =~ /^Preface.?$/) { | ||
168 | 0 | 0 | 0 | 0 | return 0 if ($e->{source} and $e->{source} ne $c->{source}) or | ||
0 | |||||||
0 | |||||||
169 | ($e->{volume} and $e->{volume} ne $c->{volume}); | ||||||
170 | } | ||||||
171 | 7 | 50 | 15 | if ($loose) { | |||
172 | 0 | 0 | $threshold /= 2; | ||||
173 | } else { | ||||||
174 | 7 | 17 | $threshold /= 3; | ||||
175 | } | ||||||
176 | } | ||||||
177 | |||||||
178 | # authors same, loosen for title | ||||||
179 | 29 | 100 | 66 | 102 | if ($asame and $compat_dates) { | ||
180 | 21 | 22 | $loose = 1; | ||||
181 | } | ||||||
182 | |||||||
183 | 29 | 50 | 47 | warn "pre loose mode: loose = $loose" if $debug; | |||
184 | |||||||
185 | #print "threshold $lname1,$lname2: $threshold\n"; | ||||||
186 | # ok if distance short enough without doing anything | ||||||
187 | #print "distance: " . distance(lc $e->{title},lc $c->{title}) / (length($e->{title}) +1) . "\n"; | ||||||
188 | |||||||
189 | # perform fuzzy matching | ||||||
190 | #my $str1 = "$e->{date}|$e->{title}"; | ||||||
191 | 29 | 68 | my $str1 = lc _strip_non_word($e->{title}); | ||||
192 | 29 | 51 | my $str2 = lc _strip_non_word($c->{title}); | ||||
193 | |||||||
194 | # check for edition strings | ||||||
195 | 29 | 67 | my $ed1 = extractEdition($str1); | ||||
196 | 29 | 57 | my $ed2 = extractEdition($str2); | ||||
197 | 29 | 50 | 64 | warn "ed1: $ed1" if $debug; | |||
198 | 29 | 50 | 53 | warn "ed2: $ed2" if $debug; | |||
199 | |||||||
200 | 29 | 100 | 100 | 220 | return 0 if ($ed1 and !$ed2) or ($ed2 and !$ed1) or ($ed1 && $ed1 != $ed2); | ||
100 | |||||||
66 | |||||||
100 | |||||||
66 | |||||||
201 | 25 | 50 | 38 | warn "not diff editions" if $debug; | |||
202 | |||||||
203 | # remove brackets | ||||||
204 | 25 | 32 | my ($parens1,$parens2); | ||||
205 | 25 | 192 | $str1 =~ s/$PARENS//g; | ||||
206 | 25 | 47 | $parens1 = $1; | ||||
207 | 25 | 130 | $str2 =~ s/$PARENS//g; | ||||
208 | 25 | 35 | $parens2 = $1; | ||||
209 | 25 | 100 | 66 | 72 | return 0 if $parens1 && $parens2 && numdiff($parens1,$parens2); | ||
100 | |||||||
210 | |||||||
211 | 24 | 50 | 43 | warn "the text comparison is: '$str1' vs '$str2'" if $debug; | |||
212 | |||||||
213 | 24 | 50 | 103 | warn "pre number check" if $debug; | |||
214 | # if titles differ by a number, not the same | ||||||
215 | 24 | 100 | 50 | return 0 if numdiff($str1,$str2); | |||
216 | |||||||
217 | # ultimate test | ||||||
218 | #dbg("$str1\n$str2\n"); | ||||||
219 | #dbg(my_dist_text($str1,$str2)); | ||||||
220 | 22 | 39 | my $score = (my_dist_text($str1,$str2) / (length($str1) +1)); | ||||
221 | |||||||
222 | 22 | 50 | 44 | warn "score: $score (threshold: $threshold)" if $debug; | |||
223 | #print $score . " \n"; |
||||||
224 | 22 | 100 | 98 | return 1 if ( $score < $threshold); | |||
225 | |||||||
226 | # now if loose mode and only one of the titles has a ":" or other punctuation, compare the part before the punc with the other title instead | ||||||
227 | 9 | 100 | 22 | if ($loose) { | |||
228 | |||||||
229 | 7 | 50 | 13 | warn "loose: $str1 -- $str2" if $debug; | |||
230 | 7 | 50 | 20 | return 1 if (my_dist_text($str1,$str2) / (length($str1) +1) < $threshold); | |||
231 | |||||||
232 | 7 | 100 | 396 | if ($e->{title} =~ /(.+)\s*$TITLE_SPLIT\s*(.+)/) { | |||
50 | |||||||
233 | |||||||
234 | 5 | 9 | my $str1 = _strip_non_word($1); | ||||
235 | 5 | 50 | 265 | if ($c->{title} =~ /(.+)\s*$TITLE_SPLIT\s*(.+)/) { | |||
236 | 0 | 0 | return 0; | ||||
237 | } else { | ||||||
238 | 5 | 50 | 10 | if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) { | |||
239 | 5 | 23 | return 1; | ||||
240 | } | ||||||
241 | } | ||||||
242 | |||||||
243 | } elsif ($c->{title} =~ /(.+)\s*$TITLE_SPLIT\s*(.+)/) { | ||||||
244 | |||||||
245 | 2 | 5 | my $str2 = _strip_non_word($1); | ||||
246 | 2 | 100 | 5 | if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) { | |||
247 | 1 | 6 | return 1; | ||||
248 | } | ||||||
249 | |||||||
250 | } else { | ||||||
251 | |||||||
252 | 0 | 0 | return 0; | ||||
253 | |||||||
254 | } | ||||||
255 | } | ||||||
256 | |||||||
257 | 3 | 13 | return 0; | ||||
258 | } | ||||||
259 | |||||||
260 | sub _strip_non_word { | ||||||
261 | 65 | 65 | 92 | my $str = shift; | |||
262 | 65 | 437 | $str =~ s/[^[0-9a-zA-Z\)\]\(\[]+/ /g; | ||||
263 | 65 | 296 | $str =~ s/\s+/ /g; | ||||
264 | 65 | 86 | $str =~ s/^\s+//; | ||||
265 | 65 | 163 | $str =~ s/\s+$//; | ||||
266 | 65 | 121 | $str; | ||||
267 | } | ||||||
268 | |||||||
269 | my %nums = ( | ||||||
270 | first => 1, | ||||||
271 | second => 2, | ||||||
272 | third => 3, | ||||||
273 | fourth => 4, | ||||||
274 | fifth => 5, | ||||||
275 | sixth => 6, | ||||||
276 | seventh => 7, | ||||||
277 | eighth => 8, | ||||||
278 | ninth => 9, | ||||||
279 | tenth => 10, | ||||||
280 | I => 1, | ||||||
281 | II => 2, | ||||||
282 | III => 3, | ||||||
283 | IV => 4, | ||||||
284 | V => 5, | ||||||
285 | VI => 6, | ||||||
286 | VII => 7, | ||||||
287 | VIII => 8, | ||||||
288 | IX => 9, | ||||||
289 | X => 10, | ||||||
290 | ); | ||||||
291 | sub extract_num { | ||||||
292 | 17 | 17 | 0 | 47 | my $s = shift; | ||
293 | 17 | 100 | 62 | if ($s =~ /\b(\d+)/) { | |||
294 | 11 | 69 | return $1; | ||||
295 | } | ||||||
296 | 6 | 35 | for my $n (keys %nums) { | ||||
297 | 62 | 100 | 404 | if ($s =~ /\b$n\b/i) { | |||
298 | 6 | 33 | return $nums{$n}; | ||||
299 | } | ||||||
300 | } | ||||||
301 | 0 | 0 | return $s; | ||||
302 | } | ||||||
303 | |||||||
304 | sub extractEdition { | ||||||
305 | 69 | 69 | 0 | 3189 | my $s = shift; | ||
306 | 69 | 102 | for my $re (@ED_RES) { | ||||
307 | 253 | 100 | 8020 | if ($s =~ /$re/i) { | |||
308 | 17 | 78 | return extract_num($1); | ||||
309 | } | ||||||
310 | } | ||||||
311 | 52 | 114 | return undef; | ||||
312 | } | ||||||
313 | |||||||
314 | sub numdiff { | ||||||
315 | 29 | 29 | 0 | 39 | my ($s1,$s2) = @_; | ||
316 | #print "----checking numdiff (($s1,$s2))\n"; | ||||||
317 | 29 | 207 | my @n1 = ($s1 =~ /\b([IXV0-9]{1,4}|first|second|third|fourth|fifth|sixth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th)\b/ig); | ||||
318 | 29 | 156 | my @n2 = ($s2 =~ /\b([IXV0-9]{1,4}|first|second|third|fourth|fifth|sixth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th)\b/ig); | ||||
319 | #print "In s1:" . join(",",@n1) . "\n"; | ||||||
320 | #print "In s2:" . join(",",@n2) . "\n"; | ||||||
321 | 29 | 100 | 95 | return 0 if $#n1 ne $#n2; | |||
322 | 27 | 78 | for (0..$#n1) { | ||||
323 | 5 | 100 | 39 | return 1 if lc $n1[$_] ne lc $n2[$_]; | |||
324 | } | ||||||
325 | #print "Not diff\n"; | ||||||
326 | 24 | 74 | return 0; | ||||
327 | =old | ||||||
328 | my $num1 = undef; | ||||||
329 | my $num2 = undef; | ||||||
330 | $num1 = $1 if ($s1 =~ /\W([IV1-9]{1,4})(((\W|$).{0,3}$)|(\W\s*:))/); | ||||||
331 | $num2 = $1 if ($s2 =~ /\W([IV1-9]{1,4})(((\W|$).{0,3}$)|(\W\s*:))/); | ||||||
332 | return $num1 eq $num2 ? 0 : 1; | ||||||
333 | =cut | ||||||
334 | } | ||||||
335 | |||||||
336 | |||||||
337 | sub my_dist_text { | ||||||
338 | 36 | 36 | 0 | 72 | my $a = lc shift; | ||
339 | 36 | 54 | my $b = lc shift; | ||||
340 | 36 | 50 | $a =~ s/_/ /g; | ||||
341 | 36 | 131 | $b =~ s/_/ /g; | ||||
342 | 36 | 444 | return distance($a, $b); | ||||
343 | |||||||
344 | } | ||||||
345 | sub decodeHTMLEntities { | ||||||
346 | 72 | 72 | 0 | 76 | my $in = shift; | ||
347 | 72 | 106 | $in =~ s/&([\d\w\#]+);/&safe_decode($1)/gei; | ||||
0 | 0 | ||||||
348 | 72 | 130 | return $in; | ||||
349 | } | ||||||
350 | |||||||
351 | sub safe_decode { | ||||||
352 | 0 | 0 | 0 | 0 | my $in = shift; | ||
353 | 0 | 0 | 0 | if (substr($in,0,1) eq '#') { | |||
354 | 0 | 0 | 0 | my $num = substr($in,1,1) eq 'x' ? hex(substr($in,1)) : substr($in,1); | |||
355 | # we check and fix cp1232 entities | ||||||
356 | 0 | 0 | 0 | 0 | return ($num < 127 or $num > 159) ? | ||
357 | HTML::Entities::decode_entities("&$in;") : | ||||||
358 | HTML::Entities::decode_entities("" . $WIN2UTF{$num} . ";"); | ||||||
359 | } else { | ||||||
360 | 0 | 0 | HTML::Entities::decode_entities("&$in;") | ||||
361 | } | ||||||
362 | } | ||||||
363 | |||||||
364 | sub toString { | ||||||
365 | 70 | 70 | 0 | 214 | my $h = shift; | ||
366 | 70 | 61 | return join("; ",@{$h->{authors}}) . " ($h->{date}) $h->{title}\n"; | ||||
70 | 390 | ||||||
367 | } | ||||||
368 | |||||||
369 | 1; | ||||||
370 | __END__ |