File Coverage

blib/lib/Lingua/JA/Name/Splitter.pm

Criterion	Covered	Total	%
statement	68	88	77.2
branch	23	38	60.5
condition	2	3	66.6
subroutine	9	9	100.0
pod	3	3	100.0
total	105	141	74.4

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Name::Splitter;
2
3	1			1		69170	use warnings;
	1					2
	1					32
4	1			1		6	use strict;
	1					5
	1					85
5							require Exporter;
6							our @ISA = qw(Exporter);
7							our @EXPORT_OK = qw/split_kanji_name split_romaji_name $kkre kkname/;
8							our %EXPORT_TAGS = ('all' => \@EXPORT_OK);
9							our $VERSION = '0.11';
10	1			1		666	use utf8;
	1					15
	1					4
11	1			1		31	use Carp;
	1					2
	1					68
12	1			1		604	use Lingua::JA::Moji ':all';
	1					50059
	1					541
13
14							# The probabilities that these characters are part of the family name.
15
16							my %known;
17
18							my $file = __FILE__;
19							$file =~ s/Splitter\.pm/probabilities.txt/;
20	1			1		8	open my $in, "<:encoding(utf8)", $file or die $!;
	1					2
	1					18
21							while (<$in>) {
22							my ($kanji, $prob) = split /\s/, $_;
23							$known{$kanji} = $prob;
24							}
25							close $in or die $!;
26
27							# The weight to give the position in the kanji if it is a known
28							# kanji.
29
30							our $length_weight = 0.735; # 42030 successes
31
32							# The cutoff for splitting the name
33
34							our $split_cutoff = 0.5;
35
36							# Set this to a true value to print debugging messages.
37
38							my $debug;
39
40							=head2 $kkre
41
42							Kanji-kana regular expression. This is intended to match kanji and
43							kana names.
44
45							=cut
46
47							our $kkre = qr!
48							\p{InCJKUnifiedIdeographs}\|
49							[々〆]\|
50							\p{InKana}
51							!x;
52
53							sub kkname
54							{
55	7			7	1	12	my ($kanji) = @_;
56	7	50				92	if ($kanji !~ /^($kkre)+$/) {
57	0					0	return undef;
58							}
59	7					2173	return 1;
60							}
61
62							sub split_kanji_name
63							{
64	7			7	1	4230	my ($kanji) = @_;
65							# Validate the user's input
66	7	50				20	if (! $kanji) {
67	0					0	carp "No valid name was provided to split_kanji_name";
68	0					0	return undef;
69							}
70	7	50				24	if (length $kanji == 1) {
71	0					0	carp "$kanji is only one character long, so there is nothing to split";
72	0					0	return ($kanji, '');
73							}
74	7	50				17	if (! kkname ($kanji)) {
75	0					0	carp "$kanji does not look like a kanji/kana name";
76							}
77	7	50				16	if (! wantarray ()) {
78	0					0	carp "The return value of split_kanji_name is an array";
79							}
80							# If the name is only two characters, there is only one possibility.
81	7	50				18	if (length $kanji == 2) {
82	0					0	return split '', $kanji;
83							}
84
85							# What we guess is the given name part of the name
86	7					13	my $given;
87							# What we guess is the family name part of the name
88							my $family;
89							# The characters in the name, which may not be kanji.
90	7					31	my @kanji = split '', $kanji;
91							# Probability this character is part of the family name.
92	7					12	my @probability;
93							# First character is definitely part of the family name.
94	7					13	$probability[0] = 1;
95							# Last character is definitely part of the given name.
96	7					14	$probability[$#kanji] = 0;
97	7					13	my $length = length $kanji;
98							# Loop from the second kanji to the second-from-last kanji
99	7					21	for my $i (1..$#kanji - 1) {
100	14					33	my $p = 1 - $i / ($length - 1);
101	14					19	my $moji = $kanji[$i];
102	14	100				35	if (is_kana ($moji)) {
		100
		50
103							# Assume that hiragana is not part of surname (not correct
104							# in practice).
105	2					21	$p = 0;
106							}
107							elsif ($known{$moji}) {
108	11					129	$p = $length_weight * $p + (1 - $length_weight) * $known{$moji};
109							}
110							elsif ($moji eq '々') {
111							# This repeated kanji has the same probability as the
112							# original kanji
113	1					18	$p = $probability[$i - 1];
114							}
115	14					34	$probability[$i] = $p;
116							}
117	7	50				23	if ($debug) {
118	0					0	print "@probability\n";
119	0					0	print "@kanji\n";
120							}
121	7					12	my $in_given;
122	7					17	for my $i (0..$#kanji) {
123	28	100				60	if ($probability[$i] < $split_cutoff) {
124	15					20	$in_given = 1;
125							}
126	28	100				45	if ($in_given) {
127	15					34	$given .= $kanji[$i];
128							}
129							else {
130	13					23	$family .= $kanji[$i];
131							}
132							}
133	7					33	return ($family, $given);
134							}
135
136							sub split_romaji_name
137							{
138	2			2	1	1165	my ($name) = @_;
139	2	50				8	if (! $name) {
140	0					0	carp "No name given to split_romaji_name";
141	0					0	return undef;
142							}
143	2	50				5	if (! wantarray ()) {
144	0					0	carp "The return value of split_romaji_name is an array";
145							}
146
147							# What we guess is the family name
148	2					5	my $last;
149							# What we guess is the personal name
150							my $first;
151	2	50				27	if ($name !~ /\s\|,/) {
152	0	0				0	if ($name =~ /^([A-Z][a-z]+)([A-Z]+)$/) {
153	0					0	$first = $1;
154	0					0	$last = $2;
155							}
156							else {
157							# If there is no space or comma, assume that this is the last name.
158	0					0	$first = '';
159	0					0	$last = $name;
160							}
161							}
162							else {
163							# Remove leading and trailing spaces.
164	2					10	$name =~ s/^\s+\|\s+$//g;
165	2					13	my @parts = split /,?\s+/, $name;
166	2					6	for (@parts) {
167	4	50				6635	if (! is_romaji_strict ($_)) {
168	0					0	carp "'$_' doesn't look like Japanese romaji";
169							}
170							}
171							# If there are more than two parts to the name after splitting by spaces
172	2	50				1040	if (@parts > 2) {
173	0					0	carp "Strange Japanese name '$name' with middle name?";
174							}
175							# If the last name is capitalized, or if there is a comma in the
176							# name.
177	2	100	66			20	if ($parts[0] =~ /^[A-Z]+$/ \|\| $name =~ /,/) {
178	1					4	$last = $parts[0];
179	1					2	$first = $parts[1];
180							}
181							else {
182	1					3	$last = $parts[1];
183	1					3	$first = $parts[0];
184							}
185							}
186							# Regularise the name
187	2					6	$first = ucfirst lc $first;
188	2					5	$last = ucfirst lc $last;
189	2					8	return ($first, $last);
190							}
191
192							1;