File Coverage

blib/lib/App/Uni.pm

Criterion	Covered	Total	%
statement	82	128	64.0
branch	25	60	41.6
condition	5	18	27.7
subroutine	13	22	59.0
pod	0	12	0.0
total	125	240	52.0

line	stmt	bran	cond	sub	pod	time	code
1	1			1		485	use strict;
	1					8
	1					29
2	1			1		5	use warnings;
	1					2
	1					57
3							package App::Uni 9.006;
4							# ABSTRACT: command-line utility to find or display Unicode characters
5
6							#pod =encoding utf8
7							#pod
8							#pod =head1 NAME
9							#pod
10							#pod App::Uni - Command-line utility to grep UnicodeData.txt
11							#pod
12							#pod =head1 SYNOPSIS
13							#pod
14							#pod $ uni smiling face
15							#pod 263A ☺ WHITE SMILING FACE
16							#pod 263B ☻ BLACK SMILING FACE
17							#pod
18							#pod $ uni ☺
19							#pod 263A ☺ WHITE SMILING FACE
20							#pod
21							#pod # Only on Perl 5.14+
22							#pod $ uni wry
23							#pod 1F63C CAT FACE WITH WRY SMILE
24							#pod
25							#pod =head1 DESCRIPTION
26							#pod
27							#pod This module installs a simple program, F<uni>, that helps grepping through
28							#pod the Unicode database included in the current Perl 5 installation.
29							#pod
30							#pod For information on how to use F<uni> consult the L<uni> documentation.
31							#pod
32							#pod =head1 ACKNOWLEDGEMENTS
33							#pod
34							#pod This is a re-implementation of a program written by Audrey Tang in Taiwan. I
35							#pod used that program for years before deciding I wanted to add a few features,
36							#pod which I did by rewriting from scratch.
37							#pod
38							#pod That program, in turn, was a re-implementation of a same-named program Larry
39							#pod copied to me, which accompanied Audrey for years. However, that program was
40							#pod lost during a hard disk failure, so she coded it up from memory.
41							#pod
42							#pod Thank-you, Larry, for everything. ♡
43							#pod
44							#pod =cut
45
46	1			1		11	use 5.10.0; # for \v
	1					4
47	1			1		5	use warnings;
	1					2
	1					28
48
49	1			1		542	use charnames ();
	1					33143
	1					30
50	1			1		538	use Encode qw(encode_utf8);
	1					8535
	1					79
51	1			1		719	use Getopt::Long;
	1					12414
	1					7
52	1			1		182	use List::Util qw(max);
	1					2
	1					110
53	1			1		409	use Unicode::GCString;
	1					15211
	1					963
54
55							sub _do_help {
56	0			0		0	my $class = shift;
57
58	0	0				0	die
59							join qq{\n }, join(qq{\n}, @_, @_ ? "" : (), "usage:"),
60							"uni SEARCH-TERMS... - find codepoints with matching names or values",
61							"uni [-s] ONE-CHARACTER - print the codepoint and name of one character",
62							"uni -n SEARCH-TERMS... - find codepoints with matching names",
63							"uni -c STRINGS... - print out the codepoints in a string",
64							"uni -u CODEPOINTS... - look up and print hex codepoints",
65							"uni -x HEX-OCTETS... - given the sequence of octets, in hex, decode",
66							"",
67							"Other switches:",
68							" -8 - also show the UTF-8 bytes to encode\n";
69							}
70
71							sub run {
72	1			1	0	1861	my ($class, @argv) = @_;
73
74	1					2	my %opt;
75							{
76	1					2	my $exit;
	1					3
77	1					3	local @ARGV = @argv;
78							GetOptions(
79							"c" => \$opt{explode},
80							"u" => \$opt{u_numbers},
81							"n" => \$opt{names},
82							"s" => \$opt{single},
83							"x" => \$opt{hex_octets},
84							"8" => \$opt{utf8},
85							"help\|?" => \$opt{help},
86	1					17	);
87	1					599	@argv = @ARGV;
88							}
89
90	1	50				6	$class->_do_help if $opt{help};
91
92	1					5	my $n = grep { $_ } @opt{qw(explode u_numbers names single hex_octets)};
	5					15
93
94	1	50				4	$class->_do_help("ERROR: only one mode switch allowed!") if $n > 1;
95
96	1	50				4	$class->_do_help if ! @argv;
97
98							my $todo = $opt{explode} ? \&do_explode
99							: $opt{u_numbers} ? \&do_u_numbers
100							: $opt{names} ? \&do_names
101							: $opt{single} ? \&do_single
102	1	50	33			13	: $opt{hex_octets} ? \&do_hex_octets
		50
		50
		50
		50
		50
103							: @argv == 1 && length $argv[0] == 1 ? \&do_single
104							: \&do_dwim;
105
106	1					5	$todo->(\@argv, \%opt);
107							}
108
109							sub do_single {
110	0			0	0	0	my @chars = grep { length } @{ $_[0] };
	0					0
	0					0
111	0	0				0	if (my @too_long = grep { length > 1 } @chars) {
	0					0
112	0					0	die "some arguments were too long for use with -s: @too_long\n";
113							}
114	0					0	print_chars(\@chars, $_[1]);
115							}
116
117							sub do_explode {
118	0			0	0	0	print_chars( explode_strings($_[0]), $_[1] );
119							}
120
121							sub do_hex_octets {
122	0			0	0	0	my $string = '';
123	0					0	for my $hunk (@{ $_[0] }) {
	0					0
124	0	0	0			0	die "input hunk $hunk is not an even-length hex string\n"
125							unless $hunk =~ /\A[0-9A-F]+\z/i && length($hunk) % 2 == 0;
126
127	0					0	$string .= chr oct "0x$_" for $hunk =~ /(..)/g;
128							}
129
130	0					0	print_chars( explode_strings([ Encode::decode_utf8($string) ], $_[1]) );
131							}
132
133							sub explode_strings {
134	0			0	0	0	my ($strings) = @_;
135
136	0					0	my @chars;
137
138	0					0	while (my $str = shift @$strings) {
139	0					0	push @chars, split '', $str;
140	0	0				0	push @chars, undef if @$strings;
141							}
142
143	0					0	return \@chars;
144							}
145
146							sub do_u_numbers {
147	0			0	0	0	print_chars( chars_by_u_numbers($_[0]), $_[1] );
148							}
149
150							sub print_chars {
151	1			1	0	5	my ($chars, $opt) = @_;
152
153							my @to_print = $opt->{utf8}
154	0		0			0	? (map {; [ $_ => defined && encode_utf8($_) ] } @$chars)
155	1	50				7	: (map {; [ $_ ] } @$chars);
	17					33
156
157	1					2	my $width;
158	1	50				4	if ($opt->{utf8}) {
159	0					0	my $max_bytes = 0;
160	0					0	for my $todo (@to_print) {
161	0					0	$max_bytes = max($max_bytes, length $todo->[1]);
162	0	0				0	last if $max_bytes == 4; # maximum ever
163							}
164
165	0					0	$width = 2 * $max_bytes + $max_bytes - 1;
166							}
167
168	1					4	for my $todo (@to_print) {
169	17					47	my ($c, $u) = @$todo;
170
171	17	50				39	unless (defined $c) { print "\n"; next }
	0					0
	0					0
172
173							# U+25CC DOTTED CIRCLE
174	17	50				106	my $c2 = Unicode::GCString->new(
175							$c =~ /\pM/ ? "\x{25CC}$c" : $c
176							);
177	17					281	my $l = $c2->columns;
178
179							# I'm not 100% sure why I need this in all cases. It would make sense in
180							# some, since for example COMBINING GRAVE beginning a line becomes its
181							# own extended grapheme cluster (right?), but why does INVISIBLE TIMES at
182							# the beginning of a line take up a column despite being printing width
183							# zero? The world may never know. Until Tom tells me.
184							# -- rjbs, 2014-10-04
185	17	50				36	$l = 1 if $l == 0; # ???
186
187							# Yeah, probably there's some insane %*0s$ invocation of printf to use
188							# here, but... just no. -- rjbs, 2014-10-04
189	17					124	(my $p = "$c2") =~ s/\v/ /g;
190	17					48	$p .= (' ' x (2 - $l));
191
192	17					31	my $chr = ord($c);
193	17					46	my $name = charnames::viacode($chr);
194							my $utf8 = $opt->{utf8}
195							? (sprintf " - %${width}s",
196	17	50				19732	join q{ }, map {; sprintf '%02X', ord } split //, $u)
	0					0
197							: '';
198
199	17					225	printf "%s- U+%05X%s - %s\n", $p, $chr, $utf8, $name;
200							}
201							}
202
203							sub chars_by_u_numbers {
204	0			0	0	0	my ($points) = @_;
205	0					0	my @chars = map {; /\A(?:u\+)?(.+)/; chr hex $1 } @$points;
	0					0
	0					0
206	0					0	return \@chars;
207							}
208
209							sub do_names {
210	0			0	0	0	my ($terms, $opt) = @_;
211
212	0					0	print_chars( chars_by_name( $terms ), $opt );
213							}
214
215							sub chars_by_name {
216	1			1	0	3	my ($input_terms, $arg) = @_;
217	1	50				2	my @terms = map {; { pattern => s{\A/(.+)/\z}{$1} ? qr/$_/i : qr/\b$_\b/i } }
	2					25
218							@$input_terms;
219
220	1	50	33			8	if ($arg && $arg->{match_codepoints}) {
221	1					5	for (0 .. $#terms) {
222	2	100				14	$terms[$_]{ord} = hex $input_terms->[$_]
223							if $input_terms->[$_] =~ /\A[0-9A-Fa-f]+\z/;
224							}
225							}
226
227	1					11850	state $corpus = do 'unicore/Name.pl';
228	1	50				17	unless (defined $corpus) {
229	0	0				0	die "couldn't parse unicore/Name.pl: $@" if $@;
230	0	0				0	die "couldn't read unicore/Name.pl: $!" if $!;
231	0					0	die "unicore/Name.pl returned undef";
232							}
233
234							# https://github.com/perl/perl5/commit/b555069b72f93a232deba173dc7bf7892cfa5868
235	1	50				10	my ($entry_sep, $field_sep) = "$]" >= 5.031010 ? ("\n\n", "\n") : ("\n", "\t");
236	1					7965	my @entries = split $entry_sep, $corpus;
237	1					6	my @chars;
238
239							my %seen;
240	1					3	ENTRY: for my $entry (@entries) {
241	30378					53501	my $i = index($entry, $field_sep);
242	30378	100				47805	next if rindex($entry, " ", $i) >= 0; # no sequences
243
244	29948					54665	my $name = substr($entry, $i+1);
245	29948					43054	my $ord = hex substr($entry, 0, $i);
246
247	29948					39376	for (@terms) {
248							next ENTRY unless $name =~ $_->{pattern}
249	29965	50	33			155041	or defined $_->{ord} && $_->{ord} == $ord;
			66
250							}
251
252	17					42	my $c = chr hex substr $entry, 0, $i;
253	17	50				58	next if $seen{$c}++;
254	17					51	push @chars, chr hex substr $entry, 0, $i;
255							}
256
257	1					1473	return \@chars;
258							}
259
260							sub smerge {
261	0			0	0	0	my %splat = map {; $_ => 1 } map { @$_ } @_;
	0					0
	0					0
262	0					0	return [ sort keys %splat ];
263							}
264
265							sub do_dwim {
266	1			1	0	3	my ($argv, $opt) = @_;
267	1					5	my $chars = chars_by_name($argv, { match_codepoints => 1 });
268	1					8	print_chars($chars, $opt);
269							}
270
271							1;
272
273							__END__