line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Biblio::Citation::Compare; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
72126
|
use 5.0; |
|
3
|
|
|
|
|
27
|
|
4
|
3
|
|
|
3
|
|
16
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
80
|
|
5
|
3
|
|
|
3
|
|
19
|
use warnings; |
|
3
|
|
|
|
|
16
|
|
|
3
|
|
|
|
|
97
|
|
6
|
3
|
|
|
3
|
|
1334
|
use Text::LevenshteinXS qw(distance); |
|
3
|
|
|
|
|
9084
|
|
|
3
|
|
|
|
|
163
|
|
7
|
3
|
|
|
3
|
|
1546
|
use HTML::Entities; |
|
3
|
|
|
|
|
20933
|
|
|
3
|
|
|
|
|
258
|
|
8
|
3
|
|
|
3
|
|
4654
|
use Text::Names qw/samePerson cleanName parseName parseName2/; |
|
3
|
|
|
|
|
140834
|
|
|
3
|
|
|
|
|
326
|
|
9
|
3
|
|
|
3
|
|
1293
|
use Text::Roman qw/isroman roman2int/; |
|
3
|
|
|
|
|
3385
|
|
|
3
|
|
|
|
|
180
|
|
10
|
3
|
|
|
3
|
|
25
|
use utf8; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
18
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
require Exporter; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
17
|
|
|
|
|
|
|
sameWork sameAuthors toString extractEdition sameAuthorBits sameTitle sameAuthorsLoose |
18
|
|
|
|
|
|
|
) ] ); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our @EXPORT = qw( ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our $VERSION = '0.56'; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# to correct bogus windows entities. unfixable ones are converted to spaces. |
27
|
|
|
|
|
|
|
# Maps the code points 0x80-0x9F, as they appear in numeric entities produced
# from Windows-1252 text, to the Unicode code points that were intended.
# Slots that Windows-1252 leaves undefined map to a plain space (0x20).
my %WIN2UTF = (
    0x80 => 0x20AC,    # EURO SIGN
    0x81 => 0x0020,    # UNDEFINED
    0x82 => 0x201A,    # SINGLE LOW-9 QUOTATION MARK
    0x83 => 0x0192,    # LATIN SMALL LETTER F WITH HOOK
    0x84 => 0x201E,    # DOUBLE LOW-9 QUOTATION MARK
    0x85 => 0x2026,    # HORIZONTAL ELLIPSIS
    0x86 => 0x2020,    # DAGGER
    0x87 => 0x2021,    # DOUBLE DAGGER
    0x88 => 0x02C6,    # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89 => 0x2030,    # PER MILLE SIGN
    0x8A => 0x0160,    # LATIN CAPITAL LETTER S WITH CARON
    0x8B => 0x2039,    # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C => 0x0152,    # LATIN CAPITAL LIGATURE OE
    0x8D => 0x0020,    # UNDEFINED
    0x8E => 0x017D,    # LATIN CAPITAL LETTER Z WITH CARON
    0x8F => 0x0020,    # UNDEFINED
    0x90 => 0x0020,    # UNDEFINED
    0x91 => 0x2018,    # LEFT SINGLE QUOTATION MARK
    0x92 => 0x2019,    # RIGHT SINGLE QUOTATION MARK
    0x93 => 0x201C,    # LEFT DOUBLE QUOTATION MARK
    0x94 => 0x201D,    # RIGHT DOUBLE QUOTATION MARK
    0x95 => 0x2022,    # BULLET
    0x96 => 0x2013,    # EN DASH
    0x97 => 0x2014,    # EM DASH
    0x98 => 0x02DC,    # SMALL TILDE
    0x99 => 0x2122,    # TRADE MARK SIGN
    0x9A => 0x0161,    # LATIN SMALL LETTER S WITH CARON
    0x9B => 0x203A,    # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C => 0x0153,    # LATIN SMALL LIGATURE OE
    0x9D => 0x0020,    # UNDEFINED
    0x9E => 0x017E,    # LATIN SMALL LETTER Z WITH CARON
    0x9F => 0x0178,    # LATIN CAPITAL LETTER Y WITH DIAERESIS
);
61
|
|
|
|
|
|
|
# Regex source: one bracketed or parenthesised group together with the
# whitespace around it; capture 1 is the text between the brackets.
my $PARENS = q{\s*[\[\(](.+?)[\]\)]\s*};

# Quote-like characters. Only referenced by the disabled $TITLE_SPLIT variant
# noted below.
my $QUOTE = q{"“”`¨´‘’‛“”‟„′″‴‵‶‷⁗❛❜❝❞};

# Regex sources tried in order by extractEdition(); capture 1 of each holds
# the edition/version token (an ordinal word, a digit, or a roman/arabic
# number).
my @ED_RES = (
    q{(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)},
    q{([1-9])\s?\w{2,5}\s[ée]d},
    q{\bv\.?(?:ersion)?\s?([0-9IXV]+)},
    q{\s([IXV0-9]+)(?:$|:)},
);

# Regex source: punctuation that separates a main title from a subtitle.
# A variant that also split on quote characters ([$QUOTE]) is disabled.
my $TITLE_SPLIT = q{(?:\?|\:|\.|!)};
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# sameAuthors(\@list1, \@list2, %opts)
#
# Returns 1 when every name in the shorter list is matched, according to
# samePerson(), by at least one name in the longer list; returns 0 otherwise.
# With strict => 1 the two lists must also have the same number of entries.
# %opts is forwarded unchanged to samePerson().
sub sameAuthors {
    my ($list1, $list2, %opts) = @_;

    my $last1 = $#{$list1};
    my $last2 = $#{$list2};
    return 0 if $last1 != $last2 and $opts{strict};

    # Make sure $list2 is the shorter list: it is the one we iterate over.
    ($list1, $list2) = ($list2, $list1) if $last2 > $last1;

    foreach my $name (@$list2) {
        my $matches = grep { samePerson($name, $_, %opts) } @$list1;
        return 0 unless $matches;
    }

    return 1;
}
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# firstAuthor($entry)
#
# Returns the first element of $entry->{authors}, or undef when the entry has
# no authors.
sub firstAuthor {
    my ($entry) = @_;
    my $authors = $entry->{authors};
    return $#{$authors} >= 0 ? $authors->[0] : undef;
}
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# sameWork($e, $c, $threshold, $loose, $nolinks, %opts)
#
# Decides whether two citation records describe the same work.
#
#   $e, $c      hash refs; the keys read here are title, authors, date, doi,
#               source and volume.
#   $threshold  maximum normalised edit distance between the titles
#               (defaults to 0.15; halved when DOIs are equal or dates differ).
#   $loose      when true, a title may also match the part of the other title
#               that precedes a subtitle separator.
#   $nolinks    accepted for compatibility; not used.
#   %opts       debug => 1 emits a trace through warn().
#
# Returns 1 when the records are judged to be the same work, 0 otherwise.
# Neither record is modified.
sub sameWork {
    my ($e, $c, $threshold, $loose, $nolinks, %opts) = @_;

    # Without two records there is nothing to compare. This has to come
    # before anything dereferences $e or $c.
    return 0 unless $e and $c;

    my $debug = $opts{debug} || 0;
    $loose     = 0    unless defined $loose;
    $threshold = 0.15 unless $threshold;

    if ($debug) {
        warn "sameWork 1: " . toString($e);
        warn "sameWork 2: " . toString($c);
    }

    if (    defined $e->{doi} and length($e->{doi})
        and defined $c->{doi} and length($c->{doi})) {
        return 0 if $e->{doi} ne $c->{doi};
        # we don't use doi to say 1 because often we have dois that are for
        # a whole issue; however same doi lowers the threshold
        $threshold /= 2;
        $loose = 1;
    }

    # normalize encoding of the titles; work on copies so the caller's
    # records are left alone
    my $title1 = decodeHTMLEntities($e->{title});
    my $title2 = decodeHTMLEntities($c->{title});

    # first check if authors, date, and title are almost literally the same
    my $tsame = (lc($title1) eq lc($title2)) ? 1 : 0;
    my $asame = sameAuthors($e->{authors}, $c->{authors}, strict => 1);
    # asame_loose is 1 while asame is 0 when one record has extra authors but
    # all the overlapping authors match
    my $asame_loose = $asame || sameAuthors($e->{authors}, $c->{authors}, strict => 0);
    my $asame_bits  = $asame_loose || sameAuthorBits($e->{authors}, $c->{authors});
    my $dsame = (defined $e->{date} and defined $c->{date} and $e->{date} eq $c->{date}) ? 1 : 0;

    if ($debug) {
        warn "tsame: $tsame";
        warn "asame: $asame";
        warn "asame_loose: $asame_loose";
        warn "asame_bits: $asame_bits";
        warn "dsame: $dsame";
    }

    return 1 if ($tsame and $asame and $dsame);

    # if authors quite different, not same
    if (!$asame_bits) {
        warn "authors too different" if $debug;
        return 0;
    }
    # at this point the authors are plausibly the same

    # check dates
    # NOTE(review): the anchors only bind to the first and last alternatives
    # ("^forthcoming" and "web$"); kept as is to preserve matching behaviour.
    my $date_wildcards = '^forthcoming|in press|manuscript|unknown|web$';
    my $compat_dates = ($dsame
        or ($e->{date} && $e->{date} =~ /$date_wildcards/)
        or ($c->{date} && $c->{date} =~ /$date_wildcards/));

    if ($compat_dates) {
        # dates agree (or one is a wildcard) and the authors are plausible
        $loose = 1;
    } else {
        # Generic front-matter titles are only conflated within the same
        # source and volume. Otherwise editions and republications are
        # deliberately conflated for now.
        if ($title1 =~ /^Introduction.?$/ or $title1 =~ /^Preface.?$/) {
            return 0 if ($e->{source} and $e->{source} ne $c->{source}) or
                        ($e->{volume} and $e->{volume} ne $c->{volume});
        }

        # People often misremember dates, and reprints carry later dates, so
        # differing dates never rule a match out on their own. They do make
        # us demand a closer title match, whether the dates are nearby, far
        # apart, or not plain four-digit years at all.
        if ($debug
            and $e->{date} and $e->{date} =~ /^\d\d\d\d$/
            and $c->{date} and $c->{date} =~ /^\d\d\d\d$/
            and abs($e->{date} - $c->{date}) > 3) {
            warn "dates different, lowering similarity threshold";
        }
        $threshold /= 2;
    }

    warn "pre title length" if $debug;
    # if title very different in lengths and do not contain ":" or brackets,
    # not the same
    if (!$tsame
        and abs(length($title1) - length($title2)) > 20
        and $title1 !~ /$TITLE_SPLIT/ and $title2 !~ /$TITLE_SPLIT/
        and $title1 !~ /$PARENS/      and $title2 !~ /$PARENS/) {
        return 0;
    }

    warn "pre loose mode: loose = $loose" if $debug;

    # perform fuzzy matching
    my $str1 = lc _strip_non_word($title1);
    my $str2 = lc _strip_non_word($title2);

    # check for edition strings
    my $ed1 = extractEdition($str1);
    my $ed2 = extractEdition($str2);
    warn "ed1: $ed1" if $debug;
    warn "ed2: $ed2" if $debug;
    $loose = 1 if $ed1 and $ed2 and $ed1 == $ed2 and !$dsame and $asame_loose;

    return 0 if ($ed1 and !$ed2) or ($ed2 and !$ed1) or ($ed1 && $ed1 != $ed2);
    warn "not diff editions" if $debug;

    # Remove bracketed groups, remembering the content of the last group of
    # each title. The groups are collected explicitly rather than read from
    # $1 after s///g, because $1 keeps a stale value when nothing matches.
    my @groups1 = $str1 =~ /$PARENS/g;
    my @groups2 = $str2 =~ /$PARENS/g;
    $str1 =~ s/$PARENS//g;
    $str2 =~ s/$PARENS//g;
    my $parens1 = @groups1 ? $groups1[-1] : undef;
    my $parens2 = @groups2 ? $groups2[-1] : undef;
    return 0 if $parens1 && $parens2 && numdiff($parens1, $parens2);

    warn "the text comparison is: '$str1' vs '$str2'" if $debug;

    warn "pre number check" if $debug;
    # if titles differ by a number, not the same
    return 0 if numdiff($str1, $str2);

    # ultimate test: edit distance relative to the length of the first title
    my $score = (my_dist_text($str1, $str2) / (length($str1) + 1));

    warn "score: $score (threshold: $threshold)" if $debug;
    return 1 if ($score < $threshold);

    # now if loose mode and only one of the titles has a ":" or other
    # punctuation, compare the part before the punc with the other title
    # instead
    return 0 unless $loose;

    warn "loose: $str1 -- $str2" if $debug;

    my ($head1) = $title1 =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/;
    my ($head2) = $title2 =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/;

    if (defined $head1 and defined $head2) {
        # both have a subtitle and the full titles already failed to match
        return 0;
    }
    elsif (defined $head1) {
        my $main1 = _strip_non_word($head1);
        return 1 if my_dist_text($main1, $str2) / (length($main1) + 1) < $threshold;
    }
    elsif (defined $head2) {
        my $main2 = _strip_non_word($head2);
        return 1 if my_dist_text($str1, $main2) / (length($str1) + 1) < $threshold;
    }

    return 0;
}
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# sameAuthorsLoose(\@list1, \@list2)
#
# Tries progressively weaker author comparisons and returns the first true
# result: strict sameAuthors(), then non-strict sameAuthors(), then
# sameAuthorBits(). Returns a false value when all three fail.
sub sameAuthorsLoose {
    my ($list1, $list2) = @_;
    return sameAuthors($list1, $list2, strict => 1)
        || sameAuthors($list1, $list2, strict => 0)
        || sameAuthorBits($list1, $list2);
}
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# _author_bits(\@names)
#
# Returns the sorted list of lower-cased, whitespace-separated tokens found
# in all the names, after commas and periods have been removed. An undefined
# list is treated as empty.
sub _author_bits {
    my ($names) = @_;
    my @bits;
    for my $name (@{ $names || [] }) {
        my $v = lc $name;    # we copy so we don't modify the original
        $v =~ s/[,\.]//g;
        # split ' ' (unlike /\s+/) does not produce an empty leading token
        # when the name starts with whitespace
        push @bits, split(' ', $v);
    }
    my @sorted = sort @bits;
    return @sorted;
}

# sameAuthorBits(\@list1, \@list2)
#
# Returns 1 when the two author lists are made of exactly the same name
# tokens, ignoring case, order, commas, periods and surrounding whitespace
# (so "Smith, John" equals "John Smith"); returns 0 otherwise.
sub sameAuthorBits {
    my ($list1, $list2) = @_;

    my @bits1 = _author_bits($list1);
    my @bits2 = _author_bits($list2);

    return 0 if @bits1 != @bits2;
    for my $i (0 .. $#bits1) {
        return 0 if $bits1[$i] ne $bits2[$i];
    }
    return 1;
}
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
#wip |
331
|
|
|
|
|
|
|
#sub author_bits { |
332
|
|
|
|
|
|
|
# my $list_ref = shift; |
333
|
|
|
|
|
|
|
# my @new; |
334
|
|
|
|
|
|
|
# for (@$list_ref) { |
335
|
|
|
|
|
|
|
# my $v = $_; # we copy so we don't modify the original |
336
|
|
|
|
|
|
|
# $v =~ s/,//; |
337
|
|
|
|
|
|
|
# $v =~ s/(\p{Ll}\p |
338
|
|
|
|
|
|
|
# push @alist, split(/\s+/, $v); |
339
|
|
|
|
|
|
|
# } |
340
|
|
|
|
|
|
|
#} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
# _strip_non_word($str)
#
# Returns a simplified copy of $str for fuzzy comparison: "volume", "vol"
# and "v." are abbreviated to "v", every run of characters other than ASCII
# letters, digits and ()[] becomes a single space, and leading/trailing
# whitespace is removed.
sub _strip_non_word {
    my ($text) = @_;

    for ($text) {
        # abbreviate "volume" to "v"
        s/\bvolume\b/v/gi;
        s/\bvol\.?\b/v/gi;
        s/\bv\.\b/v/gi;

        s/[^[0-9a-zA-Z\)\]\(\[]+/ /g;
        s/\s+/ /g;
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# English ordinal words recognised as edition numbers.
my %nums = (
    first   => 1,
    second  => 2,
    third   => 3,
    fourth  => 4,
    fifth   => 5,
    sixth   => 6,
    seventh => 7,
    eighth  => 8,
    ninth   => 9,
    tenth   => 10,
);

# Alternation of the words above. Matching through a single pattern always
# picks the leftmost ordinal in the input, whereas looping over keys %nums
# depends on the (randomised) hash order.
my $NUM_WORDS = join '|', sort keys %nums;

# extract_num($s)
#
# Converts an edition/version token to a number. Tried in order:
#   1. the first run of digits that starts at a word boundary;
#   2. a roman numeral (when isroman() accepts the whole string);
#   3. the leftmost English ordinal word, "first" to "tenth".
# Returns $s unchanged when none of these applies.
sub extract_num {
    my $s = shift;

    return $1 if $s =~ /\b(\d+)/;

    return roman2int($s) if isroman($s);

    return $nums{lc $1} if $s =~ /\b($NUM_WORDS)\b/i;

    return $s;
}
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# extractEdition($s)
#
# Tries each pattern of @ED_RES in order, case-insensitively, against $s.
# On the first match, returns extract_num() applied to the captured token.
# Returns undef when no pattern matches.
sub extractEdition {
    my ($text) = @_;

    foreach my $pattern (@ED_RES) {
        next unless $text =~ /$pattern/i;
        return extract_num($1);
    }

    return undef;
}
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# numdiff($s1, $s2)
#
# Returns 1 when both strings contain the same number of number-like tokens
# (arabic or roman numerals of up to four characters, cardinal and ordinal
# words up to twelve, "1st".."12th") and at least one pair of tokens at the
# same position differs, ignoring case. Returns 0 otherwise, including when
# the strings contain different numbers of such tokens.
sub numdiff {
    my ($s1, $s2) = @_;

    # "twelveth" is a misspelling kept for compatibility; "twelfth" is the
    # correct form.
    my $number_re = qr/\b([IXV0-9]{1,4}|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|twelveth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th|10th|11th|12th)\b/i;

    my @n1 = $s1 =~ /$number_re/g;
    my @n2 = $s2 =~ /$number_re/g;

    return 0 if @n1 != @n2;

    for my $i (0 .. $#n1) {
        return 1 if lc $n1[$i] ne lc $n2[$i];
    }
    return 0;
}
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# my_dist_text($a, $b)
#
# Returns the distance(), as imported from Text::LevenshteinXS, between the
# two strings after lower-casing them and turning underscores into spaces.
sub my_dist_text {
    my $left  = lc shift;
    my $right = lc shift;

    tr/_/ / for $left, $right;

    return distance($left, $right);
}
418
|
|
|
|
|
|
|
# decodeHTMLEntities($in)
#
# Returns a copy of $in in which every "&...;" sequence whose body consists
# of word characters, digits or '#' has been replaced by the result of
# safe_decode() on that body.
sub decodeHTMLEntities {
    my ($text) = @_;
    $text =~ s{&([\d\w\#]+);}{safe_decode($1)}gei;
    return $text;
}
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# safe_decode($in)
#
# Decodes one HTML entity given without its leading '&' and trailing ';'
# (e.g. "amp", "#8217", "#x2019").
#
# Numeric references to the code points listed in %WIN2UTF are bogus
# Windows-1252 (cp1252) values; they are replaced by the character %WIN2UTF
# maps them to. Everything else is handed to HTML::Entities::decode_entities.
sub safe_decode {
    my $in = shift;

    # Numeric character reference, decimal or hexadecimal (either case of
    # the 'x' prefix).
    if ($in =~ /^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/) {
        my $num = defined $1 ? hex($1) : $2 + 0;

        # we check and fix cp1252 entities; only code points present in the
        # table are affected, so 127 (DEL) is no longer caught by mistake
        return chr($WIN2UTF{$num}) if exists $WIN2UTF{$num};
    }

    return HTML::Entities::decode_entities("&$in;");
}
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
# toString($h)
#
# Returns a one-line, newline-terminated rendering of a citation record:
# "Author1; Author2 (date) title\n". Missing authors, date or title are
# rendered as empty, so partial records (such as the title-only ones that
# sameTitle() builds) can be printed without dying or warning.
sub toString {
    my $h = shift;

    my @authors = @{ $h->{authors} || [] };
    my $date    = defined $h->{date}  ? $h->{date}  : '';
    my $title   = defined $h->{title} ? $h->{title} : '';

    return join("; ", @authors) . " ($date) $title\n";
}
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# sameTitle($a, $b, $threshold, $loose, $nolinks, %opts)
#
# Compares two bare title strings by wrapping each in a title-only record
# and delegating to sameWork() with the remaining arguments unchanged.
sub sameTitle {
    my ($title1, $title2, $threshold, $loose, $nolinks, %opts) = @_;

    my $record1 = { title => $title1 };
    my $record2 = { title => $title2 };

    return sameWork($record1, $record2, $threshold, $loose, $nolinks, %opts);
}
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
1; |
448
|
|
|
|
|
|
|
__END__ |