line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
457
|
use strict; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
25
|
|
2
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
47
|
|
3
|
|
|
|
|
|
|
package App::Uni 9.005; |
4
|
|
|
|
|
|
|
# ABSTRACT: command-line utility to find or display Unicode characters |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
#pod =encoding utf8 |
7
|
|
|
|
|
|
|
#pod |
8
|
|
|
|
|
|
|
#pod =head1 NAME |
9
|
|
|
|
|
|
|
#pod |
10
|
|
|
|
|
|
|
#pod App::Uni - Command-line utility to grep UnicodeData.txt |
11
|
|
|
|
|
|
|
#pod |
12
|
|
|
|
|
|
|
#pod =head1 SYNOPSIS |
13
|
|
|
|
|
|
|
#pod |
14
|
|
|
|
|
|
|
#pod $ uni smiling face |
15
|
|
|
|
|
|
|
#pod 263A ☺ WHITE SMILING FACE |
16
|
|
|
|
|
|
|
#pod 263B ☻ BLACK SMILING FACE |
17
|
|
|
|
|
|
|
#pod |
18
|
|
|
|
|
|
|
#pod $ uni ☺ |
19
|
|
|
|
|
|
|
#pod 263A ☺ WHITE SMILING FACE |
20
|
|
|
|
|
|
|
#pod |
21
|
|
|
|
|
|
|
#pod # Only on Perl 5.14+ |
22
|
|
|
|
|
|
|
#pod $ uni wry |
23
|
|
|
|
|
|
|
#pod 1F63C CAT FACE WITH WRY SMILE |
24
|
|
|
|
|
|
|
#pod |
25
|
|
|
|
|
|
|
#pod =head1 DESCRIPTION |
26
|
|
|
|
|
|
|
#pod |
27
|
|
|
|
|
|
|
#pod This module installs a simple program, F, that helps grepping through |
28
|
|
|
|
|
|
|
#pod the Unicode database included in the current Perl 5 installation. |
29
|
|
|
|
|
|
|
#pod |
30
|
|
|
|
|
|
|
#pod For information on how to use F consult the L documentation. |
31
|
|
|
|
|
|
|
#pod |
32
|
|
|
|
|
|
|
#pod =head1 ACKNOWLEDGEMENTS |
33
|
|
|
|
|
|
|
#pod |
34
|
|
|
|
|
|
|
#pod This is a re-implementation of a program written by Audrey Tang in Taiwan. I |
35
|
|
|
|
|
|
|
#pod used that program for years before deciding I wanted to add a few features, |
36
|
|
|
|
|
|
|
#pod which I did by rewriting from scratch. |
37
|
|
|
|
|
|
|
#pod |
38
|
|
|
|
|
|
|
#pod That program, in turn, was a re-implementation of a same-named program Larry |
39
|
|
|
|
|
|
|
#pod copied to me, which accompanied Audrey for years. However, that program was |
40
|
|
|
|
|
|
|
#pod lost during a hard disk failure, so she coded it up from memory. |
41
|
|
|
|
|
|
|
#pod |
42
|
|
|
|
|
|
|
#pod Thank-you, Larry, for everything. ♡ |
43
|
|
|
|
|
|
|
#pod |
44
|
|
|
|
|
|
|
#pod =cut |
45
|
|
|
|
|
|
|
|
46
|
1
|
|
|
1
|
|
9
|
use 5.10.0; # for \v |
|
1
|
|
|
|
|
3
|
|
47
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
23
|
|
48
|
|
|
|
|
|
|
|
49
|
1
|
|
|
1
|
|
545
|
use charnames (); |
|
1
|
|
|
|
|
28587
|
|
|
1
|
|
|
|
|
26
|
|
50
|
1
|
|
|
1
|
|
492
|
use Encode qw(encode_utf8); |
|
1
|
|
|
|
|
7522
|
|
|
1
|
|
|
|
|
65
|
|
51
|
1
|
|
|
1
|
|
666
|
use Getopt::Long; |
|
1
|
|
|
|
|
10818
|
|
|
1
|
|
|
|
|
5
|
|
52
|
1
|
|
|
1
|
|
177
|
use List::Util qw(max); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
89
|
|
53
|
1
|
|
|
1
|
|
385
|
use Unicode::GCString; |
|
1
|
|
|
|
|
12389
|
|
|
1
|
|
|
|
|
799
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub _do_help { |
56
|
0
|
|
|
0
|
|
0
|
my $class = shift; |
57
|
|
|
|
|
|
|
|
58
|
0
|
0
|
|
|
|
0
|
die |
59
|
|
|
|
|
|
|
join qq{\n }, join(qq{\n}, @_, @_ ? "" : (), "usage:"), |
60
|
|
|
|
|
|
|
"uni SEARCH-TERMS... - find codepoints with matching names or values", |
61
|
|
|
|
|
|
|
"uni [-s] ONE-CHARACTER - print the codepoint and name of one character", |
62
|
|
|
|
|
|
|
"uni -n SEARCH-TERMS... - find codepoints with matching names", |
63
|
|
|
|
|
|
|
"uni -c STRINGS... - print out the codepoints in a string", |
64
|
|
|
|
|
|
|
"uni -u CODEPOINTS... - look up and print hex codepoints", |
65
|
|
|
|
|
|
|
"uni -x HEX-OCTETS... - given the sequence of octets, in hex, decode", |
66
|
|
|
|
|
|
|
"", |
67
|
|
|
|
|
|
|
"Other switches:", |
68
|
|
|
|
|
|
|
" -8 - also show the UTF-8 bytes to encode\n"; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub run { |
72
|
1
|
|
|
1
|
0
|
1639
|
my ($class, @argv) = @_; |
73
|
|
|
|
|
|
|
|
74
|
1
|
|
|
|
|
2
|
my %opt; |
75
|
|
|
|
|
|
|
{ |
76
|
1
|
|
|
|
|
1
|
my $exit; |
|
1
|
|
|
|
|
2
|
|
77
|
1
|
|
|
|
|
3
|
local @ARGV = @argv; |
78
|
|
|
|
|
|
|
GetOptions( |
79
|
|
|
|
|
|
|
"c" => \$opt{explode}, |
80
|
|
|
|
|
|
|
"u" => \$opt{u_numbers}, |
81
|
|
|
|
|
|
|
"n" => \$opt{names}, |
82
|
|
|
|
|
|
|
"s" => \$opt{single}, |
83
|
|
|
|
|
|
|
"x" => \$opt{hex_octets}, |
84
|
|
|
|
|
|
|
"8" => \$opt{utf8}, |
85
|
|
|
|
|
|
|
"help|?" => \$opt{help}, |
86
|
1
|
|
|
|
|
13
|
); |
87
|
1
|
|
|
|
|
471
|
@argv = @ARGV; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
1
|
50
|
|
|
|
4
|
$class->_do_help if $opt{help}; |
91
|
|
|
|
|
|
|
|
92
|
1
|
|
|
|
|
4
|
my $n = grep { $_ } @opt{qw(explode u_numbers names single hex_octets)}; |
|
5
|
|
|
|
|
13
|
|
93
|
|
|
|
|
|
|
|
94
|
1
|
50
|
|
|
|
3
|
$class->_do_help("ERROR: only one mode switch allowed!") if $n > 1; |
95
|
|
|
|
|
|
|
|
96
|
1
|
50
|
|
|
|
3
|
$class->_do_help if ! @argv; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
my $todo = $opt{explode} ? \&do_explode |
99
|
|
|
|
|
|
|
: $opt{u_numbers} ? \&do_u_numbers |
100
|
|
|
|
|
|
|
: $opt{names} ? \&do_names |
101
|
|
|
|
|
|
|
: $opt{single} ? \&do_single |
102
|
1
|
50
|
33
|
|
|
10
|
: $opt{hex_octets} ? \&do_hex_octets |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
103
|
|
|
|
|
|
|
: @argv == 1 && length $argv[0] == 1 ? \&do_single |
104
|
|
|
|
|
|
|
: \&do_dwim; |
105
|
|
|
|
|
|
|
|
106
|
1
|
|
|
|
|
3
|
$todo->(\@argv, \%opt); |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub do_single { |
110
|
0
|
|
|
0
|
0
|
0
|
my @chars = grep { length } @{ $_[0] }; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
111
|
0
|
0
|
|
|
|
0
|
if (my @too_long = grep { length > 1 } @chars) { |
|
0
|
|
|
|
|
0
|
|
112
|
0
|
|
|
|
|
0
|
die "some arguments were too long for use with -s: @too_long\n"; |
113
|
|
|
|
|
|
|
} |
114
|
0
|
|
|
|
|
0
|
print_chars(\@chars, $_[1]); |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub do_explode { |
118
|
0
|
|
|
0
|
0
|
0
|
print_chars( explode_strings($_[0]), $_[1] ); |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub do_hex_octets { |
122
|
0
|
|
|
0
|
0
|
0
|
my $string = ''; |
123
|
0
|
|
|
|
|
0
|
for my $hunk (@{ $_[0] }) { |
|
0
|
|
|
|
|
0
|
|
124
|
0
|
0
|
0
|
|
|
0
|
die "input hunk $hunk is not an even-length hex string\n" |
125
|
|
|
|
|
|
|
unless $hunk =~ /\A[0-9A-F]+\z/i && length($hunk) % 2 == 0; |
126
|
|
|
|
|
|
|
|
127
|
0
|
|
|
|
|
0
|
$string .= chr oct "0x$_" for $hunk =~ /(..)/g; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
0
|
|
|
|
|
0
|
print_chars( explode_strings([ Encode::decode_utf8($string) ], $_[1]) ); |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub explode_strings { |
134
|
0
|
|
|
0
|
0
|
0
|
my ($strings) = @_; |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
0
|
my @chars; |
137
|
|
|
|
|
|
|
|
138
|
0
|
|
|
|
|
0
|
while (my $str = shift @$strings) { |
139
|
0
|
|
|
|
|
0
|
push @chars, split '', $str; |
140
|
0
|
0
|
|
|
|
0
|
push @chars, undef if @$strings; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
0
|
return \@chars; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub do_u_numbers { |
147
|
0
|
|
|
0
|
0
|
0
|
print_chars( chars_by_u_numbers($_[0]), $_[1] ); |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub print_chars { |
151
|
1
|
|
|
1
|
0
|
5
|
my ($chars, $opt) = @_; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
my @to_print = $opt->{utf8} |
154
|
0
|
|
0
|
|
|
0
|
? (map {; [ $_ => defined && encode_utf8($_) ] } @$chars) |
155
|
1
|
50
|
|
|
|
7
|
: (map {; [ $_ ] } @$chars); |
|
17
|
|
|
|
|
28
|
|
156
|
|
|
|
|
|
|
|
157
|
1
|
|
|
|
|
2
|
my $width; |
158
|
1
|
50
|
|
|
|
5
|
if ($opt->{utf8}) { |
159
|
0
|
|
|
|
|
0
|
my $max_bytes = 0; |
160
|
0
|
|
|
|
|
0
|
for my $todo (@to_print) { |
161
|
0
|
|
|
|
|
0
|
$max_bytes = max($max_bytes, length $todo->[1]); |
162
|
0
|
0
|
|
|
|
0
|
last if $max_bytes == 4; # maximum ever |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
0
|
|
|
|
|
0
|
$width = 2 * $max_bytes + $max_bytes - 1; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
1
|
|
|
|
|
4
|
for my $todo (@to_print) { |
169
|
17
|
|
|
|
|
40
|
my ($c, $u) = @$todo; |
170
|
|
|
|
|
|
|
|
171
|
17
|
50
|
|
|
|
38
|
unless (defined $c) { print "\n"; next } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# U+25CC DOTTED CIRCLE |
174
|
17
|
50
|
|
|
|
99
|
my $c2 = Unicode::GCString->new( |
175
|
|
|
|
|
|
|
$c =~ /\pM/ ? "\x{25CC}$c" : $c |
176
|
|
|
|
|
|
|
); |
177
|
17
|
|
|
|
|
265
|
my $l = $c2->columns; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# I'm not 100% sure why I need this in all cases. It would make sense in |
180
|
|
|
|
|
|
|
# some, since for example COMBINING GRAVE beginning a line becomes its |
181
|
|
|
|
|
|
|
# own extended grapheme cluster (right?), but why does INVISIBLE TIMES at |
182
|
|
|
|
|
|
|
# the beginning of a line take up a column despite being printing width |
183
|
|
|
|
|
|
|
# zero? The world may never know. Until Tom tells me. |
184
|
|
|
|
|
|
|
# -- rjbs, 2014-10-04 |
185
|
17
|
50
|
|
|
|
34
|
$l = 1 if $l == 0; # ??? |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Yeah, probably there's some insane %*0s$ invocation of printf to use |
188
|
|
|
|
|
|
|
# here, but... just no. -- rjbs, 2014-10-04 |
189
|
17
|
|
|
|
|
116
|
(my $p = "$c2") =~ s/\v/ /g; |
190
|
17
|
|
|
|
|
41
|
$p .= (' ' x (2 - $l)); |
191
|
|
|
|
|
|
|
|
192
|
17
|
|
|
|
|
26
|
my $chr = ord($c); |
193
|
17
|
|
|
|
|
36
|
my $name = charnames::viacode($chr); |
194
|
|
|
|
|
|
|
my $utf8 = $opt->{utf8} |
195
|
|
|
|
|
|
|
? (sprintf " - %${width}s", |
196
|
17
|
50
|
|
|
|
16731
|
join q{ }, map {; sprintf '%02X', ord } split //, $u) |
|
0
|
|
|
|
|
0
|
|
197
|
|
|
|
|
|
|
: ''; |
198
|
|
|
|
|
|
|
|
199
|
17
|
|
|
|
|
193
|
printf "%s- U+%05X%s - %s\n", $p, $chr, $utf8, $name; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
sub chars_by_u_numbers { |
204
|
0
|
|
|
0
|
0
|
0
|
my ($points) = @_; |
205
|
0
|
|
|
|
|
0
|
my @chars = map {; /\A(?:u\+)?(.+)/; chr hex $1 } @$points; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
206
|
0
|
|
|
|
|
0
|
return \@chars; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub do_names { |
210
|
0
|
|
|
0
|
0
|
0
|
my ($terms, $opt) = @_; |
211
|
|
|
|
|
|
|
|
212
|
0
|
|
|
|
|
0
|
print_chars( chars_by_name( $terms ), $opt ); |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
sub chars_by_name { |
216
|
1
|
|
|
1
|
0
|
2
|
my ($input_terms, $arg) = @_; |
217
|
1
|
50
|
|
|
|
2
|
my @terms = map {; { pattern => s{\A/(.+)/\z}{$1} ? qr/$_/i : qr/\b$_\b/i } } |
|
2
|
|
|
|
|
22
|
|
218
|
|
|
|
|
|
|
@$input_terms; |
219
|
|
|
|
|
|
|
|
220
|
1
|
50
|
33
|
|
|
7
|
if ($arg && $arg->{match_codepoints}) { |
221
|
1
|
|
|
|
|
4
|
for (0 .. $#terms) { |
222
|
2
|
100
|
|
|
|
11
|
$terms[$_]{ord} = hex $input_terms->[$_] |
223
|
|
|
|
|
|
|
if $input_terms->[$_] =~ /\A[0-9A-Fa-f]+\z/; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
1
|
|
|
|
|
9948
|
state $corpus = do 'unicore/Name.pl'; |
228
|
1
|
50
|
|
|
|
17
|
unless (defined $corpus) { |
229
|
0
|
0
|
|
|
|
0
|
die "couldn't parse unicore/Name.pl: $@" if $@; |
230
|
0
|
0
|
|
|
|
0
|
die "couldn't read unicore/Name.pl: $!" if $!; |
231
|
0
|
|
|
|
|
0
|
die "unicore/Name.pl returned undef"; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
# https://github.com/perl/perl5/commit/b555069b72f93a232deba173dc7bf7892cfa5868 |
235
|
1
|
50
|
|
|
|
8
|
my ($entry_sep, $field_sep) = "$]" >= 5.031010 ? ("\n\n", "\n") : ("\n", "\t"); |
236
|
1
|
|
|
|
|
7220
|
my @entries = split $entry_sep, $corpus; |
237
|
1
|
|
|
|
|
9
|
my @chars; |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
my %seen; |
240
|
1
|
|
|
|
|
3
|
ENTRY: for my $entry (@entries) { |
241
|
30378
|
|
|
|
|
36246
|
my $i = index($entry, $field_sep); |
242
|
30378
|
100
|
|
|
|
41936
|
next if rindex($entry, " ", $i) >= 0; # no sequences |
243
|
|
|
|
|
|
|
|
244
|
29948
|
|
|
|
|
37637
|
my $name = substr($entry, $i+1); |
245
|
29948
|
|
|
|
|
34578
|
my $ord = hex substr($entry, 0, $i); |
246
|
|
|
|
|
|
|
|
247
|
29948
|
|
|
|
|
31799
|
for (@terms) { |
248
|
|
|
|
|
|
|
next ENTRY unless $name =~ $_->{pattern} |
249
|
29965
|
50
|
33
|
|
|
106372
|
or defined $_->{ord} && $_->{ord} == $ord; |
|
|
|
66
|
|
|
|
|
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
|
252
|
17
|
|
|
|
|
37
|
my $c = chr hex substr $entry, 0, $i; |
253
|
17
|
50
|
|
|
|
47
|
next if $seen{$c}++; |
254
|
17
|
|
|
|
|
51
|
push @chars, chr hex substr $entry, 0, $i; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
1
|
|
|
|
|
1250
|
return \@chars; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
sub smerge { |
261
|
0
|
|
|
0
|
0
|
0
|
my %splat = map {; $_ => 1 } map { @$_ } @_; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
262
|
0
|
|
|
|
|
0
|
return [ sort keys %splat ]; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
sub do_dwim { |
266
|
1
|
|
|
1
|
0
|
3
|
my ($argv, $opt) = @_; |
267
|
1
|
|
|
|
|
4
|
my $chars = chars_by_name($argv, { match_codepoints => 1 }); |
268
|
1
|
|
|
|
|
7
|
print_chars($chars, $opt); |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
1; |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
__END__ |