File Coverage

blib/lib/Lingua/JA/Name/Splitter.pm

Criterion	Covered	Total	%
statement	79	79	100.0
branch	34	34	100.0
condition	3	3	100.0
subroutine	9	9	100.0
pod	3	3	100.0
total	128	128	100.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Name::Splitter;
2
3	1			1		86000	use warnings;
	1					8
	1					34
4	1			1		5	use strict;
	1					2
	1					90
5							require Exporter;
6							our @ISA = qw(Exporter);
7							our @EXPORT_OK = qw/split_kanji_name split_romaji_name $kkre kkname/;
8							our %EXPORT_TAGS = ('all' => \@EXPORT_OK);
9							our $VERSION = '0.12';
10	1			1		684	use utf8;
	1					14
	1					4
11	1			1		31	use Carp;
	1					2
	1					57
12	1			1		609	use Lingua::JA::Moji ':all';
	1					49626
	1					545
13
14							# The probabilities that these characters are part of the family name.
15
16							my %known;
17
18							my $file = __FILE__;
19							$file =~ s/Splitter\.pm/probabilities.txt/;
20	1			1		6	open my $in, "<:encoding(utf8)", $file or die $!;
	1					1
	1					8
21							while (<$in>) {
22							my ($kanji, $prob) = split /\s/, $_;
23							$known{$kanji} = $prob;
24							}
25							close $in or die $!;
26
27							# The weight to give the position in the kanji if it is a known
28							# kanji.
29
30							our $length_weight = 0.735; # 42030 successes
31
32							# The cutoff for splitting the name
33
34							our $split_cutoff = 0.5;
35
36							# Set this to a true value to print debugging messages.
37
38							#my $debug = 1;
39							#use open qw(:std :encoding(UTF-8));a # when debugging
40
41							=head2 $kkre
42
43							Kanji-kana regular expression. This is intended to match kanji and
44							kana names.
45
46							=cut
47
48							our $kkre = qr!
49							\p{InCJKUnifiedIdeographs}\|
50							[々〆]\|
51							\p{InKana}
52							!x;
53
54							sub kkname
55							{
56	11			11	1	21	my ($kanji) = @_;
57	11	100				129	if ($kanji !~ /^($kkre)+$/) {
58	1					4	return undef;
59							}
60	10					2091	return 1;
61							}
62
63							sub split_kanji_name
64							{
65	13			13	1	20468	my ($kanji) = @_;
66							# Validate the user's input
67	13	100				38	if (! $kanji) {
68	1					171	carp "No valid name was provided to split_kanji_name";
69	1					8	return undef;
70							}
71	12	100				35	if (length $kanji == 1) {
72	1					189	carp "$kanji is only one character long, so there is nothing to split";
73	1					10	return ($kanji, '');
74							}
75	11	100				22	if (! kkname ($kanji)) {
76	1					114	carp "$kanji does not look like a kanji/kana name";
77							}
78	11	100				34	if (! wantarray ()) {
79	1					124	carp "The return value of split_kanji_name is an array";
80							}
81							# If the name is only two characters, there is only one possibility.
82	11	100				40	if (length $kanji == 2) {
83	1					9	return split '', $kanji;
84							}
85
86							# The characters in the name, which may not be kanji.
87	10					38	my @kanji = split '', $kanji;
88							# Probability this character is part of the family name.
89	10					18	my @probability;
90							# First character is definitely part of the family name.
91	10					22	$probability[0] = 1;
92							# Last character is definitely part of the given name.
93	10					19	$probability[$#kanji] = 0;
94	10					19	my $length = length $kanji;
95							# Loop from the second kanji to the second-from-last kanji
96	10					27	for my $i (1..$#kanji - 1) {
97	18					41	my $p = 1 - $i / ($length - 1);
98	18					65	my $moji = $kanji[$i];
99	18	100				49	if (is_kana ($moji)) {
		100
		100
100							# Assume that hiragana is not part of surname (not correct
101							# in practice).
102	2					29	$p = 0;
103							}
104							elsif ($known{$moji}) {
105	13					143	$p = $length_weight * $p + (1 - $length_weight) * $known{$moji};
106							}
107							elsif ($moji eq '々') {
108							# This repeated kanji has the same probability as the
109							# original kanji
110	1					23	$p = $probability[$i - 1];
111							}
112	18					57	$probability[$i] = $p;
113							#if ($debug) { # Commented out to improve test coverage
114							# print STDERR "$kanji[$i] i=$i p=$p\n";
115							#}
116	18	100				55	if ($probability[$i] < $split_cutoff) {
117	9					64	return (substr ($kanji, 0, $i), substr ($kanji, $i));
118							}
119							}
120	1					7	return (substr ($kanji, 0, -1), substr ($kanji, -1));
121							}
122
123							sub split_romaji_name
124							{
125	9			9	1	13164	my ($name) = @_;
126	9	100				26	if (! $name) {
127	1					105	carp "No name given to split_romaji_name";
128	1					18	return undef;
129							}
130	8	100				20	if (! wantarray ()) {
131	1					102	carp "The return value of split_romaji_name is an array";
132							}
133
134							# What we guess is the family name
135	8					21	my $last;
136							# What we guess is the personal name
137							my $first;
138	8	100				43	if ($name !~ /\s\|,/) {
139	2	100				20	if ($name =~ /^([A-Z][a-z]+)([A-Z]+)$/) {
140	1					3	$first = $1;
141	1					3	$last = $2;
142							}
143							else {
144							# If there is no space or comma, assume that this is the last name.
145	1					3	$first = '';
146	1					3	$last = $name;
147							}
148							}
149							else {
150							# Remove leading and trailing spaces.
151	6					30	$name =~ s/^\s+\|\s+$//g;
152	6					38	my @parts = split /,?\s+/, $name;
153	6					16	for (@parts) {
154	13	100				8952	if (! is_romaji_strict ($_)) {
155	1					172	carp "'$_' doesn't look like Japanese romaji";
156							}
157							}
158							# If there are more than two parts to the name after splitting by spaces
159	6	100				2571	if (@parts > 2) {
160	1					105	carp "Strange Japanese name '$name' with middle name?";
161							}
162							# If the last name is capitalized, or if there is a comma in the
163							# name.
164	6	100	100			62	if ($parts[0] =~ /^[A-Z]+$/ \|\| $name =~ /,/) {
165	2					5	$last = $parts[0];
166	2					4	$first = $parts[1];
167							}
168							else {
169	4					10	$last = $parts[1];
170	4					9	$first = $parts[0];
171							}
172							}
173							# Regularise the name
174	8					21	$first = ucfirst lc $first;
175	8					14	$last = ucfirst lc $last;
176	8					28	return ($first, $last);
177							}
178
179							1;