line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Fulltext::Tokenizer::MeCab; |
2
|
3
|
|
|
3
|
|
101210
|
use strict; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
97
|
|
3
|
3
|
|
|
3
|
|
35
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
88
|
|
4
|
|
|
|
|
|
|
|
5
|
3
|
|
|
3
|
|
21
|
use Carp; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
203
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '1.05'; |
8
|
3
|
|
|
3
|
|
4135
|
use Text::MeCab; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Encode; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use File::Basename; |
12
|
|
|
|
|
|
|
use Cwd; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use constant PREINSTALL_DICS => 'op.dic'; # '1.dic, 2.dic, 3.dic' |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub _mk_userdic_paths { |
17
|
|
|
|
|
|
|
my $libdir = Cwd::realpath(dirname(__FILE__)); |
18
|
|
|
|
|
|
|
my $dicdir = "${libdir}/../../../../share/dic"; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# to pass tests even if this module file is put under blib/ directory. |
21
|
|
|
|
|
|
|
# FIXME: too ugly... |
22
|
|
|
|
|
|
|
unless (-d $dicdir) { $dicdir = "${libdir}/../../../../../share/dic" } |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $p = "${dicdir}/" . PREINSTALL_DICS; |
25
|
|
|
|
|
|
|
if ($ENV{'MECABDIC_USERDIC'}) { $p .= ", $ENV{'MECABDIC_USERDIC'}" } |
26
|
|
|
|
|
|
|
$p; |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub _dbglog { |
30
|
|
|
|
|
|
|
my $str = shift; |
31
|
|
|
|
|
|
|
binmode(STDERR, ":utf8"); |
32
|
|
|
|
|
|
|
if ($ENV{'MECABDIC_DEBUG'} && $ENV{'MECABDIC_DEBUG'} != '0') { |
33
|
|
|
|
|
|
|
print STDERR "$str"; |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
sub tokenizer { |
38
|
|
|
|
|
|
|
my $mecab = Text::MeCab->new({ |
39
|
|
|
|
|
|
|
userdic => _mk_userdic_paths, |
40
|
|
|
|
|
|
|
}); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
return sub { |
43
|
|
|
|
|
|
|
my $string = shift; |
44
|
|
|
|
|
|
|
my $term_index = 0; |
45
|
|
|
|
|
|
|
my $node = $mecab->parse($string); |
46
|
|
|
|
|
|
|
_dbglog "string to be parsed: $string (" . length($string) . ")\n"; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
return sub { |
49
|
|
|
|
|
|
|
my $term = Encode::decode_utf8 $node->surface or return; |
50
|
|
|
|
|
|
|
my $len = length $term; |
51
|
|
|
|
|
|
|
_dbglog "token: $term ($len)\n"; |
52
|
|
|
|
|
|
|
my $start = index($string, $term); |
53
|
|
|
|
|
|
|
my $end = $start + $len; |
54
|
|
|
|
|
|
|
$start >= 0 or croak '$term must be included in $string'; |
55
|
|
|
|
|
|
|
$node = $node->next or return; |
56
|
|
|
|
|
|
|
return ($term, $len, $start, $end, $term_index++); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
}; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
1; |
62
|
|
|
|
|
|
|
__END__ |