File Coverage

blib/lib/App/Uni.pm
Criterion Covered Total %
statement 82 128 64.0
branch 25 60 41.6
condition 5 18 27.7
subroutine 13 22 59.0
pod 0 12 0.0
total 125 240 52.0


line stmt bran cond sub pod time code
1 1     1   485 use strict;
  1         8  
  1         29  
2 1     1   5 use warnings;
  1         2  
  1         57  
3             package App::Uni 9.006;
4             # ABSTRACT: command-line utility to find or display Unicode characters
5              
6             #pod =encoding utf8
7             #pod
8             #pod =head1 NAME
9             #pod
10             #pod App::Uni - Command-line utility to grep UnicodeData.txt
11             #pod
12             #pod =head1 SYNOPSIS
13             #pod
14             #pod $ uni smiling face
15             #pod 263A ☺ WHITE SMILING FACE
16             #pod 263B ☻ BLACK SMILING FACE
17             #pod
18             #pod $ uni ☺
19             #pod 263A ☺ WHITE SMILING FACE
20             #pod
21             #pod # Only on Perl 5.14+
22             #pod $ uni wry
23             #pod 1F63C CAT FACE WITH WRY SMILE
24             #pod
25             #pod =head1 DESCRIPTION
26             #pod
27             #pod This module installs a simple program, F<uni>, that helps grepping through
28             #pod the Unicode database included in the current Perl 5 installation.
29             #pod
30             #pod For information on how to use F<uni> consult the L<uni> documentation.
31             #pod
32             #pod =head1 ACKNOWLEDGEMENTS
33             #pod
34             #pod This is a re-implementation of a program written by Audrey Tang in Taiwan. I
35             #pod used that program for years before deciding I wanted to add a few features,
36             #pod which I did by rewriting from scratch.
37             #pod
38             #pod That program, in turn, was a re-implementation of a same-named program Larry
39             #pod copied to me, which accompanied Audrey for years. However, that program was
40             #pod lost during a hard disk failure, so she coded it up from memory.
41             #pod
42             #pod Thank-you, Larry, for everything. ♡
43             #pod
44             #pod =cut
45              
46 1     1   11 use 5.10.0; # for \v
  1         4  
47 1     1   5 use warnings;
  1         2  
  1         28  
48              
49 1     1   542 use charnames ();
  1         33143  
  1         30  
50 1     1   538 use Encode qw(encode_utf8);
  1         8535  
  1         79  
51 1     1   719 use Getopt::Long;
  1         12414  
  1         7  
52 1     1   182 use List::Util qw(max);
  1         2  
  1         110  
53 1     1   409 use Unicode::GCString;
  1         15211  
  1         963  
54              
55             sub _do_help {
56 0     0   0 my $class = shift;
57              
58 0 0       0 die
59             join qq{\n }, join(qq{\n}, @_, @_ ? "" : (), "usage:"),
60             "uni SEARCH-TERMS... - find codepoints with matching names or values",
61             "uni [-s] ONE-CHARACTER - print the codepoint and name of one character",
62             "uni -n SEARCH-TERMS... - find codepoints with matching names",
63             "uni -c STRINGS... - print out the codepoints in a string",
64             "uni -u CODEPOINTS... - look up and print hex codepoints",
65             "uni -x HEX-OCTETS... - given the sequence of octets, in hex, decode",
66             "",
67             "Other switches:",
68             " -8 - also show the UTF-8 bytes to encode\n";
69             }
70              
71             sub run {
72 1     1 0 1861 my ($class, @argv) = @_;
73              
74 1         2 my %opt;
75             {
76 1         2 my $exit;
  1         3  
77 1         3 local @ARGV = @argv;
78             GetOptions(
79             "c" => \$opt{explode},
80             "u" => \$opt{u_numbers},
81             "n" => \$opt{names},
82             "s" => \$opt{single},
83             "x" => \$opt{hex_octets},
84             "8" => \$opt{utf8},
85             "help|?" => \$opt{help},
86 1         17 );
87 1         599 @argv = @ARGV;
88             }
89              
90 1 50       6 $class->_do_help if $opt{help};
91              
92 1         5 my $n = grep { $_ } @opt{qw(explode u_numbers names single hex_octets)};
  5         15  
93              
94 1 50       4 $class->_do_help("ERROR: only one mode switch allowed!") if $n > 1;
95              
96 1 50       4 $class->_do_help if ! @argv;
97              
98             my $todo = $opt{explode} ? \&do_explode
99             : $opt{u_numbers} ? \&do_u_numbers
100             : $opt{names} ? \&do_names
101             : $opt{single} ? \&do_single
102 1 50 33     13 : $opt{hex_octets} ? \&do_hex_octets
    50          
    50          
    50          
    50          
    50          
103             : @argv == 1 && length $argv[0] == 1 ? \&do_single
104             : \&do_dwim;
105              
106 1         5 $todo->(\@argv, \%opt);
107             }
108              
109             sub do_single {
110 0     0 0 0 my @chars = grep { length } @{ $_[0] };
  0         0  
  0         0  
111 0 0       0 if (my @too_long = grep { length > 1 } @chars) {
  0         0  
112 0         0 die "some arguments were too long for use with -s: @too_long\n";
113             }
114 0         0 print_chars(\@chars, $_[1]);
115             }
116              
117             sub do_explode {
118 0     0 0 0 print_chars( explode_strings($_[0]), $_[1] );
119             }
120              
121             sub do_hex_octets {
122 0     0 0 0 my $string = '';
123 0         0 for my $hunk (@{ $_[0] }) {
  0         0  
124 0 0 0     0 die "input hunk $hunk is not an even-length hex string\n"
125             unless $hunk =~ /\A[0-9A-F]+\z/i && length($hunk) % 2 == 0;
126              
127 0         0 $string .= chr oct "0x$_" for $hunk =~ /(..)/g;
128             }
129              
130 0         0 print_chars( explode_strings([ Encode::decode_utf8($string) ], $_[1]) );
131             }
132              
133             sub explode_strings {
134 0     0 0 0 my ($strings) = @_;
135              
136 0         0 my @chars;
137              
138 0         0 while (my $str = shift @$strings) {
139 0         0 push @chars, split '', $str;
140 0 0       0 push @chars, undef if @$strings;
141             }
142              
143 0         0 return \@chars;
144             }
145              
146             sub do_u_numbers {
147 0     0 0 0 print_chars( chars_by_u_numbers($_[0]), $_[1] );
148             }
149              
150             sub print_chars {
151 1     1 0 5 my ($chars, $opt) = @_;
152              
153             my @to_print = $opt->{utf8}
154 0   0     0 ? (map {; [ $_ => defined && encode_utf8($_) ] } @$chars)
155 1 50       7 : (map {; [ $_ ] } @$chars);
  17         33  
156              
157 1         2 my $width;
158 1 50       4 if ($opt->{utf8}) {
159 0         0 my $max_bytes = 0;
160 0         0 for my $todo (@to_print) {
161 0         0 $max_bytes = max($max_bytes, length $todo->[1]);
162 0 0       0 last if $max_bytes == 4; # maximum ever
163             }
164              
165 0         0 $width = 2 * $max_bytes + $max_bytes - 1;
166             }
167              
168 1         4 for my $todo (@to_print) {
169 17         47 my ($c, $u) = @$todo;
170              
171 17 50       39 unless (defined $c) { print "\n"; next }
  0         0  
  0         0  
172              
173             # U+25CC DOTTED CIRCLE
174 17 50       106 my $c2 = Unicode::GCString->new(
175             $c =~ /\pM/ ? "\x{25CC}$c" : $c
176             );
177 17         281 my $l = $c2->columns;
178              
179             # I'm not 100% sure why I need this in all cases. It would make sense in
180             # some, since for example COMBINING GRAVE beginning a line becomes its
181             # own extended grapheme cluster (right?), but why does INVISIBLE TIMES at
182             # the beginning of a line take up a column despite being printing width
183             # zero? The world may never know. Until Tom tells me.
184             # -- rjbs, 2014-10-04
185 17 50       36 $l = 1 if $l == 0; # ???
186              
187             # Yeah, probably there's some insane %*0s$ invocation of printf to use
188             # here, but... just no. -- rjbs, 2014-10-04
189 17         124 (my $p = "$c2") =~ s/\v/ /g;
190 17         48 $p .= (' ' x (2 - $l));
191              
192 17         31 my $chr = ord($c);
193 17         46 my $name = charnames::viacode($chr);
194             my $utf8 = $opt->{utf8}
195             ? (sprintf " - %${width}s",
196 17 50       19732 join q{ }, map {; sprintf '%02X', ord } split //, $u)
  0         0  
197             : '';
198              
199 17         225 printf "%s- U+%05X%s - %s\n", $p, $chr, $utf8, $name;
200             }
201             }
202              
203             sub chars_by_u_numbers {
204 0     0 0 0 my ($points) = @_;
205 0         0 my @chars = map {; /\A(?:u\+)?(.+)/; chr hex $1 } @$points;
  0         0  
  0         0  
206 0         0 return \@chars;
207             }
208              
209             sub do_names {
210 0     0 0 0 my ($terms, $opt) = @_;
211              
212 0         0 print_chars( chars_by_name( $terms ), $opt );
213             }
214              
215             sub chars_by_name {
216 1     1 0 3 my ($input_terms, $arg) = @_;
217 1 50       2 my @terms = map {; { pattern => s{\A/(.+)/\z}{$1} ? qr/$_/i : qr/\b$_\b/i } }
  2         25  
218             @$input_terms;
219              
220 1 50 33     8 if ($arg && $arg->{match_codepoints}) {
221 1         5 for (0 .. $#terms) {
222 2 100       14 $terms[$_]{ord} = hex $input_terms->[$_]
223             if $input_terms->[$_] =~ /\A[0-9A-Fa-f]+\z/;
224             }
225             }
226              
227 1         11850 state $corpus = do 'unicore/Name.pl';
228 1 50       17 unless (defined $corpus) {
229 0 0       0 die "couldn't parse unicore/Name.pl: $@" if $@;
230 0 0       0 die "couldn't read unicore/Name.pl: $!" if $!;
231 0         0 die "unicore/Name.pl returned undef";
232             }
233              
234             # https://github.com/perl/perl5/commit/b555069b72f93a232deba173dc7bf7892cfa5868
235 1 50       10 my ($entry_sep, $field_sep) = "$]" >= 5.031010 ? ("\n\n", "\n") : ("\n", "\t");
236 1         7965 my @entries = split $entry_sep, $corpus;
237 1         6 my @chars;
238              
239             my %seen;
240 1         3 ENTRY: for my $entry (@entries) {
241 30378         53501 my $i = index($entry, $field_sep);
242 30378 100       47805 next if rindex($entry, " ", $i) >= 0; # no sequences
243              
244 29948         54665 my $name = substr($entry, $i+1);
245 29948         43054 my $ord = hex substr($entry, 0, $i);
246              
247 29948         39376 for (@terms) {
248             next ENTRY unless $name =~ $_->{pattern}
249 29965 50 33     155041 or defined $_->{ord} && $_->{ord} == $ord;
      66        
250             }
251              
252 17         42 my $c = chr hex substr $entry, 0, $i;
253 17 50       58 next if $seen{$c}++;
254 17         51 push @chars, chr hex substr $entry, 0, $i;
255             }
256              
257 1         1473 return \@chars;
258             }
259              
260             sub smerge {
261 0     0 0 0 my %splat = map {; $_ => 1 } map { @$_ } @_;
  0         0  
  0         0  
262 0         0 return [ sort keys %splat ];
263             }
264              
265             sub do_dwim {
266 1     1 0 3 my ($argv, $opt) = @_;
267 1         5 my $chars = chars_by_name($argv, { match_codepoints => 1 });
268 1         8 print_chars($chars, $opt);
269             }
270              
271             1;
272              
273             __END__