File Coverage

blib/lib/Biblio/Citation/Compare.pm

Criterion	Covered	Total	%
statement	126	153	82.3
branch	59	102	57.8
condition	49	84	58.3
subroutine	16	18	88.8
pod	2	10	20.0
total	252	367	68.6

line	stmt	bran	cond	sub	pod	time	code
1							package Biblio::Citation::Compare;
2
3	2			2		17254	use 5.0;
	2					6
	2					86
4	2			2		10	use strict;
	2					2
	2					69
5	2			2		9	use warnings;
	2					5
	2					75
6	2			2		1059	use Text::LevenshteinXS qw(distance);
	2					34702
	2					144
7	2			2		1245	use HTML::Entities;
	2					706842
	2					166
8	2			2		2717	use Text::Names qw/samePerson cleanName parseName/;
	2					158631
	2					269
9	2			2		20	use utf8;
	2					3
	2					15
10
11							require Exporter;
12
13							our @ISA = qw(Exporter);
14
15							our %EXPORT_TAGS = ( 'all' => [ qw(
16							sameWork sameAuthors toString extractEdition
17							) ] );
18
19							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
20
21							our @EXPORT = qw( );
22
23							our $VERSION = '0.4';
24
25							# to correct bogus windows entities. unfixable ones are converted to spaces.
26							my %WIN2UTF = (
27							hex('80')=> hex('20AC'),# #EURO SIGN
28							hex('81')=> hex('0020'), #UNDEFINED
29							hex('82')=> hex('201A'),# #SINGLE LOW-9 QUOTATION MARK
30							hex('83')=> hex('0192'),# #LATIN SMALL LETTER F WITH HOOK
31							hex('84')=> hex('201E'),# #DOUBLE LOW-9 QUOTATION MARK
32							hex('85')=> hex('2026'),# #HORIZONTAL ELLIPSIS
33							hex('86')=> hex('2020'),# #DAGGER
34							hex('87')=> hex('2021'),# #DOUBLE DAGGER
35							hex('88')=> hex('02C6'),# #MODIFIER LETTER CIRCUMFLEX ACCENT
36							hex('89')=> hex('2030'),# #PER MILLE SIGN
37							hex('8A')=> hex('0160'),# #LATIN CAPITAL LETTER S WITH CARON
38							hex('8B')=> hex('2039'),# #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
39							hex('8C')=> hex('0152'),# #LATIN CAPITAL LIGATURE OE
40							hex('8D')=> hex('0020'),# #UNDEFINED
41							hex('8E')=> hex('017D'),# #LATIN CAPITAL LETTER Z WITH CARON
42							hex('8F')=> hex('0020'),# #UNDEFINED
43							hex('90')=> hex('0020'),# #UNDEFINED
44							hex('91')=> hex('2018'),# #LEFT SINGLE QUOTATION MARK
45							hex('92')=> hex('2019'),# #RIGHT SINGLE QUOTATION MARK
46							hex('93')=> hex('201C'),# #LEFT DOUBLE QUOTATION MARK
47							hex('94')=> hex('201D'),# #RIGHT DOUBLE QUOTATION MARK
48							hex('95')=> hex('2022'),# #BULLET
49							hex('96')=> hex('2013'),# #EN DASH
50							hex('97')=> hex('2014'),# #EM DASH
51							hex('98')=> hex('02DC'),# #SMALL TILDE
52							hex('99')=> hex('2122'),# #TRADE MARK SIGN
53							hex('9A')=> hex('0161'),# #LATIN SMALL LETTER S WITH CARON
54							hex('9B')=> hex('203A'),# #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
55							hex('9C')=> hex('0153'),# #LATIN SMALL LIGATURE OE
56							hex('9D')=> hex('0020'),# #UNDEFINED
57							hex('9E')=> hex('017E'),# #LATIN SMALL LETTER Z WITH CARON
58							hex('9F')=> hex('0178')# #LATIN CAPITAL LETTER Y WITH DIAERESIS
59							);
60							my $PARENS = '\s[\[$](.+?)[\]$]\s';
61							my $QUOTE = '"“”`¨´‘’‛“”‟„′″‴‵‶‷⁗❛❜❝❞';
62							my @ED_RES = (
63							'(first\|second\|third\|fourth\|fifth\|sixth\|seventh\|eighth\|ninth\|tenth)',
64							'([1-9])\s?\w{2,5}\s[ée]d',
65							'\bv\.?(?:ersion)?\s?([0-9IXV]+)',
66							'\s([IXV0-9]+)(?:$\|:)'
67							);
68
69							#die "no" unless "2nd edition" =~ /$EDITION/i;
70
71							#my $TITLE_SPLIT = '(?:\?\|\:\|\.\|!\|\"\|[$QUOTE]\b)';
72							my $TITLE_SPLIT = '(?:\?\|\:\|\.\|!)';
73
74							sub sameAuthors {
75	42			42	1	2171	my ($list1, $list2) = @_;
76							#return 0 if $#$list1 != $#$list2;
77	42	100				108	if ($#$list2 > $#$list1) {
78	6					11	my $t = $list1;
79	6					9	$list1 = $list2;
80	6					7	$list2 = $t;
81							}
82	42					106	for (my $i = 0; $i <= $#$list2; $i++) {
83	49	100				1939	return 0 unless grep { samePerson($list2->[$i],$_) } @$list1;
	87					13753
84							}
85	40					15928	return 1;
86							}
87
88							sub firstAuthor {
89	0			0	0	0	my $e = shift;
90	0					0	my $a = $e->{authors};
91	0	0				0	if ($#$a > -1) {
92	0					0	return $a->[0];
93							} else {
94	0					0	return undef;
95							}
96							}
97
98							sub sameWork {
99
100	36			36	1	14467	my $debug = 0;
101
102	36					55	my ($e, $c, $threshold,$loose,$nolinks) = @_;
103	36	50				98	$loose = 0 unless defined $loose;
104	36	50				71	$threshold = 0.15 unless $threshold;
105
106	36	50				71	if ($debug) {
107	0					0	warn "sameEntry 1: " . toString($e);
108	0					0	warn "sameEntry 2: " . toString($c);
109							}
110
111	36	0	33			93	if (defined $e->{doi} and length $e->{doi} and defined $c->{doi} and length $c->{doi}) {
			33
			0
112	0	0				0	return 1 if $e->{doi} eq $c->{doi};
113							}
114
115	36	50				71	return 0 if (!$c);
116
117							# normalize encoding of relevant fields
118	36					74	local $e->{title} = decodeHTMLEntities($e->{title});
119	36					61	local $c->{title} = decodeHTMLEntities($c->{title});
120
121							# first check if authors,date, and title are almost literally the same
122	36	100				126	my $tsame = (lc $e->{title} eq lc $c->{title}) ? 1 : 0;
123	36					63	my $asame = sameAuthors($e->{authors},$c->{authors});
124	36	100	100			299	my $dsame = (defined $e->{date} and defined $c->{date} and $e->{date} eq $c->{date}) ? 1 : 0;
125
126	36	50				62	if ($debug) {
127	0					0	warn "tsame: $tsame";
128	0					0	warn "asame: $asame";
129	0					0	warn "dsame: $dsame";
130							}
131
132	36	50	66			133	return 1 if ($tsame and $asame and $dsame);
			66
133
134							# if authors quite different, not same
135	29	50				48	if (!$asame) {
136							#print "$lname1, $lname2 ";
137							#print my_dist_text($lname1,$lname2);
138	0	0				0	warn "authors too different" if $debug;
139	0					0	return 0;
140							}
141
142	29	50				60	warn "pre title length" if $debug;
143							# if title very different in lengths and do not contain ":" or brackets, not the same
144	29	50	100			299	return 0 if !$tsame and (
			100
			33
			66
			33
145							abs(length($e->{title}) - length($c->{title})) > 20
146							and
147							($e->{title} !~ /$TITLE_SPLIT/ and $c->{title} !~ /$TITLE_SPLIT/)
148							and
149							($e->{title} !~ /$PARENS/ and $c->{title} !~ /$PARENS/)
150							);
151
152							# Compare links
153	29	50				71	if (!$nolinks) {
154	29					27	foreach my $l (@{$e->{links}}) {
	29					87
155							# print "Links e:\n" . join("\n",$e->getLinks);
156							# print "Links c:\n" . join("\n",$c->getLinks);
157	0	0				0	return 1 if grep { $l eq $_} @{$c->{links}};
	0					0
	0					0
158							}
159							}
160
161							# check dates
162	29					37	my $compat_dates = $dsame;
163	29	50	66			182	if (!$dsame and defined $e->{date} and defined $c->{date} and $e->{date} =~ /^\d\d\d\d$/ and $c->{date} =~ /^\d\d\d\d$/ ) {
			100
			66
			66
164
165	7					9	$compat_dates = 0;
166							#disabled for most cases because we want to conflate editions and republications for now.
167	7	50	33			67	if ($e->{title} =~ /^Introduction.?$/ or $e->{title} =~ /^Preface.?$/) {
168	0	0	0			0	return 0 if ($e->{source} and $e->{source} ne $c->{source}) or
			0
			0
169							($e->{volume} and $e->{volume} ne $c->{volume});
170							}
171	7	50				15	if ($loose) {
172	0					0	$threshold /= 2;
173							} else {
174	7					17	$threshold /= 3;
175							}
176							}
177
178							# authors same, loosen for title
179	29	100	66			102	if ($asame and $compat_dates) {
180	21					22	$loose = 1;
181							}
182
183	29	50				47	warn "pre loose mode: loose = $loose" if $debug;
184
185							#print "threshold $lname1,$lname2: $threshold\n";
186							# ok if distance short enough without doing anything
187							#print "distance: " . distance(lc $e->{title},lc $c->{title}) / (length($e->{title}) +1) . "\n";
188
189							# perform fuzzy matching
190							#my $str1 = "$e->{date}\|$e->{title}";
191	29					68	my $str1 = lc _strip_non_word($e->{title});
192	29					51	my $str2 = lc _strip_non_word($c->{title});
193
194							# check for edition strings
195	29					67	my $ed1 = extractEdition($str1);
196	29					57	my $ed2 = extractEdition($str2);
197	29	50				64	warn "ed1: $ed1" if $debug;
198	29	50				53	warn "ed2: $ed2" if $debug;
199
200	29	100	100			220	return 0 if ($ed1 and !$ed2) or ($ed2 and !$ed1) or ($ed1 && $ed1 != $ed2);
			100
			66
			100
			66
201	25	50				38	warn "not diff editions" if $debug;
202
203							# remove brackets
204	25					32	my ($parens1,$parens2);
205	25					192	$str1 =~ s/$PARENS//g;
206	25					47	$parens1 = $1;
207	25					130	$str2 =~ s/$PARENS//g;
208	25					35	$parens2 = $1;
209	25	100	66			72	return 0 if $parens1 && $parens2 && numdiff($parens1,$parens2);
			100
210
211	24	50				43	warn "the text comparison is: '$str1' vs '$str2'" if $debug;
212
213	24	50				103	warn "pre number check" if $debug;
214							# if titles differ by a number, not the same
215	24	100				50	return 0 if numdiff($str1,$str2);
216
217							# ultimate test
218							#dbg("$str1\n$str2\n");
219							#dbg(my_dist_text($str1,$str2));
220	22					39	my $score = (my_dist_text($str1,$str2) / (length($str1) +1));
221
222	22	50				44	warn "score: $score (threshold: $threshold)" if $debug;
223							#print $score . " \n";
224	22	100				98	return 1 if ( $score < $threshold);
225
226							# now if loose mode and only one of the titles has a ":" or other punctuation, compare the part before the punc with the other title instead
227	9	100				22	if ($loose) {
228
229	7	50				13	warn "loose: $str1 -- $str2" if $debug;
230	7	50				20	return 1 if (my_dist_text($str1,$str2) / (length($str1) +1) < $threshold);
231
232	7	100				396	if ($e->{title} =~ /(.+)\s$TITLE_SPLIT\s(.+)/) {
		50
233
234	5					9	my $str1 = _strip_non_word($1);
235	5	50				265	if ($c->{title} =~ /(.+)\s$TITLE_SPLIT\s(.+)/) {
236	0					0	return 0;
237							} else {
238	5	50				10	if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) {
239	5					23	return 1;
240							}
241							}
242
243							} elsif ($c->{title} =~ /(.+)\s$TITLE_SPLIT\s(.+)/) {
244
245	2					5	my $str2 = _strip_non_word($1);
246	2	100				5	if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) {
247	1					6	return 1;
248							}
249
250							} else {
251
252	0					0	return 0;
253
254							}
255							}
256
257	3					13	return 0;
258							}
259
260							sub _strip_non_word {
261	65			65		92	my $str = shift;
262	65					437	$str =~ s/[^[0-9a-zA-Z\)\]\(\[]+/ /g;
263	65					296	$str =~ s/\s+/ /g;
264	65					86	$str =~ s/^\s+//;
265	65					163	$str =~ s/\s+$//;
266	65					121	$str;
267							}
268
269							my %nums = (
270							first => 1,
271							second => 2,
272							third => 3,
273							fourth => 4,
274							fifth => 5,
275							sixth => 6,
276							seventh => 7,
277							eighth => 8,
278							ninth => 9,
279							tenth => 10,
280							I => 1,
281							II => 2,
282							III => 3,
283							IV => 4,
284							V => 5,
285							VI => 6,
286							VII => 7,
287							VIII => 8,
288							IX => 9,
289							X => 10,
290							);
291							sub extract_num {
292	17			17	0	47	my $s = shift;
293	17	100				62	if ($s =~ /\b(\d+)/) {
294	11					69	return $1;
295							}
296	6					35	for my $n (keys %nums) {
297	62	100				404	if ($s =~ /\b$n\b/i) {
298	6					33	return $nums{$n};
299							}
300							}
301	0					0	return $s;
302							}
303
304							sub extractEdition {
305	69			69	0	3189	my $s = shift;
306	69					102	for my $re (@ED_RES) {
307	253	100				8020	if ($s =~ /$re/i) {
308	17					78	return extract_num($1);
309							}
310							}
311	52					114	return undef;
312							}
313
314							sub numdiff {
315	29			29	0	39	my ($s1,$s2) = @_;
316							#print "----checking numdiff (($s1,$s2))\n";
317	29					207	my @n1 = ($s1 =~ /\b([IXV0-9]{1,4}\|first\|second\|third\|fourth\|fifth\|sixth\|1st\|2nd\|3rd\|4th\|5th\|6th\|7th\|8th\|9th)\b/ig);
318	29					156	my @n2 = ($s2 =~ /\b([IXV0-9]{1,4}\|first\|second\|third\|fourth\|fifth\|sixth\|1st\|2nd\|3rd\|4th\|5th\|6th\|7th\|8th\|9th)\b/ig);
319							#print "In s1:" . join(",",@n1) . "\n";
320							#print "In s2:" . join(",",@n2) . "\n";
321	29	100				95	return 0 if $#n1 ne $#n2;
322	27					78	for (0..$#n1) {
323	5	100				39	return 1 if lc $n1[$_] ne lc $n2[$_];
324							}
325							#print "Not diff\n";
326	24					74	return 0;
327							=old
328							my $num1 = undef;
329							my $num2 = undef;
330							$num1 = $1 if ($s1 =~ /\W([IV1-9]{1,4})(((\W\|$).{0,3}$)\|(\W\s*:))/);
331							$num2 = $1 if ($s2 =~ /\W([IV1-9]{1,4})(((\W\|$).{0,3}$)\|(\W\s*:))/);
332							return $num1 eq $num2 ? 0 : 1;
333							=cut
334							}
335
336
337							sub my_dist_text {
338	36			36	0	72	my $a = lc shift;
339	36					54	my $b = lc shift;
340	36					50	$a =~ s/_/ /g;
341	36					131	$b =~ s/_/ /g;
342	36					444	return distance($a, $b);
343
344							}
345							sub decodeHTMLEntities {
346	72			72	0	76	my $in = shift;
347	72					106	$in =~ s/&([\d\w\#]+);/&safe_decode($1)/gei;
	0					0
348	72					130	return $in;
349							}
350
351							sub safe_decode {
352	0			0	0	0	my $in = shift;
353	0	0				0	if (substr($in,0,1) eq '#') {
354	0	0				0	my $num = substr($in,1,1) eq 'x' ? hex(substr($in,1)) : substr($in,1);
355							# we check and fix cp1232 entities
356	0	0	0			0	return ($num < 127 or $num > 159) ?
357							HTML::Entities::decode_entities("&$in;") :
358							HTML::Entities::decode_entities("&#" . $WIN2UTF{$num} . ";");
359							} else {
360	0					0	HTML::Entities::decode_entities("&$in;")
361							}
362							}
363
364							sub toString {
365	70			70	0	214	my $h = shift;
366	70					61	return join("; ",@{$h->{authors}}) . " ($h->{date}) $h->{title}\n";
	70					390
367							}
368
369							1;
370							__END__