File Coverage

blib/lib/Lingua/JA/Jtruncate.pm

Criterion	Covered	Total	%
statement	35	35	100.0
branch	13	18	72.2
condition	3	6	50.0
subroutine	6	6	100.0
pod	1	2	50.0
total	58	67	86.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Jtruncate;
2
3							#------------------------------------------------------------------------------
4							#
5							# Start of POD
6							#
7							#------------------------------------------------------------------------------
8
9							=head1 NAME
10
11							Lingua::JA::Jtruncate - module to truncate Japanese encoded text.
12
13							=head1 SYNOPSIS
14
15							use Lingua::JA::Jtruncate qw( jtruncate );
16							$truncated_jtext = jtruncate( $jtext, $length );
17
18							=head1 DESCRIPTION
19
20							The jtruncate function truncates text to a length $length less than bytes. It
21							is designed to cope with Japanese text which has been encoded using one of the
22							standard encoding schemes - EUC, JIS, and Shift-JIS.
23							It uses the L module to detect what encoding is being used.
24							If the text is none of the above Japanese encodings,
25							the text is just truncated using substr.
26							If it is detected as Japanese text, it tries to truncate the text as well as
27							possible without breaking the multi-byte encoding. It does this by detecting
28							the character encoding of the text, and recursively deleting Japanese (possibly
29							multi-byte) characters from the end of the text until it is underneath the
30							length specified. It should work for EUC, JIS and Shift-JIS encodings.
31
32							=head1 FUNCTIONS
33
34							=head2 jtruncate( $jtext, $length )
35
36							B takes some japanese text and a byte length as arguments, and
37							returns the japanese text truncated to that byte length.
38
39							$truncated_jtext = jtruncate( $jtext, $length );
40
41							=head1 SEE ALSO
42
43							L
44
45							=head1 REPOSITORY
46
47							L
48
49							=head1 AUTHOR
50
51							Originally written by Ave Wrigley (AWRIGLEY),
52							now maintained by Neil Bowers (NEILB).
53
54							=head1 COPYRIGHT
55
56							Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved.
57
58							This is free software; you can redistribute it and/or modify it under
59							the same terms as the Perl 5 programming language system itself.
60
61							=cut
62
63							#------------------------------------------------------------------------------
64							#
65							# End of POD
66							#
67							#------------------------------------------------------------------------------
68
69							#------------------------------------------------------------------------------
70							#
71							# Pragmas
72							#
73							#------------------------------------------------------------------------------
74
75							require 5.006;
76	2			2		10	use strict;
	2					10
	2					49
77	2			2		10	use warnings;
	2					4
	2					53
78
79							#==============================================================================
80							#
81							# Modules
82							#
83							#==============================================================================
84
85							# use Lingua::JA::Jcode;
86	2			2		1561	use Jcode;
	2					157079
	2					317
87							require Exporter;
88
89							#==============================================================================
90							#
91							# Public globals
92							#
93							#==============================================================================
94
95	2					1624	use vars qw(
96							$VERSION
97							@ISA
98							@EXPORT_OK
99							%euc_code_set
100							%sjis_code_set
101							%jis_code_set
102							%char_re
103	2			2		24	);
	2					5
104
105							$VERSION = '0.020';
106							@ISA = qw( Exporter );
107							@EXPORT_OK = qw( jtruncate );
108
109							%euc_code_set = (
110							ASCII_JIS_ROMAN => '[\x00-\x7f]',
111							JIS_X_0208_1997 => '[\xa1-\xfe][\xa1-\xfe]',
112							HALF_WIDTH_KATAKANA => '\x8e[\xa0-\xdf]',
113							JIS_X_0212_1990 => '\x8f[\xa1-\xfe][\xa1-\xfe]',
114							);
115
116							%sjis_code_set = (
117							ASCII_JIS_ROMAN => '[\x21-\x7e]',
118							HALF_WIDTH_KATAKANA => '[\xa1-\xdf]',
119							TWO_BYTE_CHAR => '[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]',
120							);
121
122							%jis_code_set = (
123							TWO_BYTE_ESC =>
124							'(?:' .
125							join( '\|',
126							'\x1b\x24\x40',
127							'\x1b\x24\x42',
128							'\x1b\x26\x40\x1b\x24\x42',
129							'\x1b\x24\x28\x44',
130							) .
131							')'
132							,
133							TWO_BYTE_CHAR => '(?:[\x21-\x7e][\x21-\x7e])',
134							ONE_BYTE_ESC => '(?:\x1b\x28[\x4a\x48\x42\x49])',
135							ONE_BYTE_CHAR =>
136							'(?:' .
137							join( '\|',
138							'[\x21-\x5f]', # JIS7 Half width katakana
139							'\x0f[\xa1-\xdf]*\x0e', # JIS8 Half width katakana
140							'[\x21-\x7e]', # ASCII / JIS-Roman
141							) .
142							')'
143							);
144
145							%char_re = (
146							'euc' => '(?:' . join( '\|', values %euc_code_set ) . ')',
147							'sjis' => '(?:' . join( '\|', values %sjis_code_set ) . ')',
148							'jis' => '(?:' . join( '\|', values %jis_code_set ) . ')',
149							);
150
151							#==============================================================================
152							#
153							# Public exported functions
154							#
155							#==============================================================================
156
157							#------------------------------------------------------------------------------
158							#
159							# jtruncate( $text, $length )
160							#
161							# truncate a string safely (i.e. don't break japanese encoding)
162							#
163							#------------------------------------------------------------------------------
164
165							sub jtruncate
166							{
167	25			25	1	50	my $text = shift;
168	25					42	my $length = shift;
169
170							# sanity checks
171
172	25	50				63	return '' if $length == 0;
173	25	50				62	return undef if not defined $length;
174	25	50				66	return undef if $length < 0;
175	25	50				56	return $text if length( $text ) <= $length;
176
177							# save the original text, in case we need to bomb out with a substr
178
179	25					33	my $orig_text = $text;
180
181	25					896	my $encoding = Jcode::getcode( \$text );
182	25	100	33			2916	if ( not defined $encoding or $encoding !~ /^(?:euc\|s?jis)$/ )
183							{
184
185							# not euc/sjis/jis - just use substr
186
187	7					30	return substr( $text, 0, $length );
188							}
189
190	18					53	$text = chop_jchars( $text, $length, $encoding );
191	18	100				51	return substr( $orig_text, 0, $length ) unless defined $text;
192
193							# JIS encoding uses escape sequences to shift in and out of single-byte /
194							# multi-byte modes. If the truncation process leaves the text ending in
195							# multi-byte mode, we need to add the single-byte escape sequence.
196							# Therefore, we truncate (at least) 3 more bytes from JIS encoded
197							# string, so we have room to add the single-byte escape sequence without
198							# going over the $length limit
199
200	15	100	66			139	if ( $encoding eq 'jis' and $text =~ /$jis_code_set{ TWO_BYTE_CHAR }$/ )
201							{
202	6					19	$text = chop_jchars( $text, $length - 3, $encoding );
203	6	50				17	return substr( $orig_text, 0, $length ) unless defined $text;
204	6					14	$text .= "\x1b\x28\x42";
205							}
206	15					50	return $text;
207							}
208
209							sub chop_jchars
210							{
211	24			24	0	37	my $text = shift;
212	24					34	my $length = shift;
213	24					30	my $encoding = shift;
214
215	24					68	while( length( $text ) > $length )
216							{
217	1099	100				5369	return undef unless $text =~ s!$char_re{ $encoding }$!!o;
218							}
219
220	21					49	return $text;
221							}
222
223							#==============================================================================
224							#
225							# Return true
226							#
227							#==============================================================================
228
229							1;