line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Fingerprint; |
2
|
|
|
|
|
|
|
# ABSTRACT: perform simple text clustering by key collision |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
53580
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
72
|
|
6
|
2
|
|
|
2
|
|
10
|
use utf8; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
18
|
|
7
|
2
|
|
|
2
|
|
48
|
use warnings qw(all); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
78
|
|
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
9
|
use base q(Exporter); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
436
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Exporter configuration: nothing is exported by default; both public
# functions can be imported by name or together through the ":all" tag.
our %EXPORT_TAGS = (all => [qw(fingerprint fingerprint_ngram)]);
our @EXPORT_OK   = @{ $EXPORT_TAGS{all} };
our @EXPORT      = ();
14
|
|
|
|
|
|
|
|
15
|
2
|
|
|
2
|
|
1754
|
use List::MoreUtils qw(uniq); |
|
2
|
|
|
|
|
2688
|
|
|
2
|
|
|
|
|
197
|
|
16
|
2
|
|
|
2
|
|
2112
|
use Text::Unidecode; |
|
2
|
|
|
|
|
5151
|
|
|
2
|
|
|
|
|
713
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.006'; # VERSION |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Precompiled separator pattern: one or more consecutive characters that are
# either non-word characters (\W) or an underscore. Both fingerprint() and
# fingerprint_ngram() interpolate it to strip or split on such runs.
my $NON_WORD = qr{ [\W_]+ }x; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# fingerprint($string)
#
# Builds a key-collision fingerprint for $string:
#   1. pass the text through unidecode() and lowercase the result;
#   2. drop any run of separator characters ($NON_WORD) at either end;
#   3. split what is left on separator runs;
#   4. discard duplicate tokens, sort them in default string order,
#      and join them with a single space.
#
# Returns the resulting string.
sub fingerprint ($) {
    my $text = lc unidecode(shift);

    # Trimming both ends first keeps split() from yielding a leading
    # empty token.
    $text =~ s{^ $NON_WORD | $NON_WORD $}{}gosx;

    my @tokens   = split m{ $NON_WORD }ox, $text;
    my @distinct = uniq(@tokens);

    return join q( ), sort @distinct;
}
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# fingerprint_ngram($string [, $n])
#
# Builds an n-gram fingerprint for $string:
#   1. pass the text through unidecode() and lowercase the result;
#   2. delete every run of separator characters ($NON_WORD);
#   3. collect every overlapping substring of length $n;
#   4. discard duplicates, sort in default string order, and concatenate.
#
# $n defaults to 2 when it is omitted or explicitly undefined.
# Returns the resulting string (empty when the cleaned text is shorter
# than $n characters).
sub fingerprint_ngram ($;$) {
    my ($string, $n) = @_;

    # Default on "undefined" rather than on "absent": an explicit undef
    # would otherwise be interpolated into the quantifier below and
    # produce a malformed pattern plus an "uninitialized" warning.
    $n = 2 unless defined $n;

    $string = lc unidecode($string);
    $string =~ s{ $NON_WORD }{}gsx;

    # The zero-width lookahead lets consecutive matches overlap, so each
    # position in the string contributes one n-gram.
    my @ngrams = $string =~ m{
        (?=
            (.{$n})
        )
    }gx;

    # The explicit comparator block keeps sort() from ever treating the
    # following function name as its SUBNAME argument.
    return join q() => sort { $a cmp $b } uniq(@ngrams);
}
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
1; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
__END__ |