| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
1
|
|
|
1
|
|
75271
|
use strict; |
|
|
1
|
|
|
|
|
11
|
|
|
|
1
|
|
|
|
|
37
|
|
|
2
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
81
|
|
|
3
|
|
|
|
|
|
|
package App::Uni; |
|
4
|
|
|
|
|
|
|
# ABSTRACT: command-line utility to find or display Unicode characters |
|
5
|
|
|
|
|
|
|
$App::Uni::VERSION = '9.004'; |
|
6
|
|
|
|
|
|
|
#pod =encoding utf8 |
|
7
|
|
|
|
|
|
|
#pod |
|
8
|
|
|
|
|
|
|
#pod =head1 NAME |
|
9
|
|
|
|
|
|
|
#pod |
|
10
|
|
|
|
|
|
|
#pod App::Uni - Command-line utility to grep UnicodeData.txt |
|
11
|
|
|
|
|
|
|
#pod |
|
12
|
|
|
|
|
|
|
#pod =head1 SYNOPSIS |
|
13
|
|
|
|
|
|
|
#pod |
|
14
|
|
|
|
|
|
|
#pod $ uni smiling face |
|
15
|
|
|
|
|
|
|
#pod 263A ☺ WHITE SMILING FACE |
|
16
|
|
|
|
|
|
|
#pod 263B ☻ BLACK SMILING FACE |
|
17
|
|
|
|
|
|
|
#pod |
|
18
|
|
|
|
|
|
|
#pod $ uni ☺ |
|
19
|
|
|
|
|
|
|
#pod 263A ☺ WHITE SMILING FACE |
|
20
|
|
|
|
|
|
|
#pod |
|
21
|
|
|
|
|
|
|
#pod # Only on Perl 5.14+ |
|
22
|
|
|
|
|
|
|
#pod $ uni wry |
|
23
|
|
|
|
|
|
|
#pod 1F63C 😼 CAT FACE WITH WRY SMILE |
|
24
|
|
|
|
|
|
|
#pod |
|
25
|
|
|
|
|
|
|
#pod =head1 DESCRIPTION |
|
26
|
|
|
|
|
|
|
#pod |
|
27
|
|
|
|
|
|
|
#pod This module installs a simple program, F<uni>, that helps grepping through |
|
28
|
|
|
|
|
|
|
#pod the Unicode database included in the current Perl 5 installation. |
|
29
|
|
|
|
|
|
|
#pod |
|
30
|
|
|
|
|
|
|
#pod For information on how to use F<uni> consult the L<uni> documentation. |
|
31
|
|
|
|
|
|
|
#pod |
|
32
|
|
|
|
|
|
|
#pod =head1 ACKNOWLEDGEMENTS |
|
33
|
|
|
|
|
|
|
#pod |
|
34
|
|
|
|
|
|
|
#pod This is a re-implementation of a program written by Audrey Tang in Taiwan. I |
|
35
|
|
|
|
|
|
|
#pod used that program for years before deciding I wanted to add a few features, |
|
36
|
|
|
|
|
|
|
#pod which I did by rewriting from scratch. |
|
37
|
|
|
|
|
|
|
#pod |
|
38
|
|
|
|
|
|
|
#pod That program, in turn, was a re-implementation of a same-named program Larry |
|
39
|
|
|
|
|
|
|
#pod copied to me, which accompanied Audrey for years. However, that program was |
|
40
|
|
|
|
|
|
|
#pod lost during a hard disk failure, so she coded it up from memory. |
|
41
|
|
|
|
|
|
|
#pod |
|
42
|
|
|
|
|
|
|
#pod Thank-you, Larry, for everything. ♡ |
|
43
|
|
|
|
|
|
|
#pod |
|
44
|
|
|
|
|
|
|
#pod =cut |
|
45
|
|
|
|
|
|
|
|
|
46
|
1
|
|
|
1
|
|
16
|
use 5.10.0; # for \v |
|
|
1
|
|
|
|
|
4
|
|
|
47
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
30
|
|
|
48
|
|
|
|
|
|
|
|
|
49
|
1
|
|
|
1
|
|
1103
|
use charnames (); |
|
|
1
|
|
|
|
|
37790
|
|
|
|
1
|
|
|
|
|
30
|
|
|
50
|
1
|
|
|
1
|
|
621
|
use Encode qw(encode_utf8); |
|
|
1
|
|
|
|
|
8960
|
|
|
|
1
|
|
|
|
|
73
|
|
|
51
|
1
|
|
|
1
|
|
756
|
use Getopt::Long; |
|
|
1
|
|
|
|
|
13003
|
|
|
|
1
|
|
|
|
|
7
|
|
|
52
|
1
|
|
|
1
|
|
189
|
use List::Util qw(max); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
107
|
|
|
53
|
1
|
|
|
1
|
|
451
|
use Unicode::GCString; |
|
|
1
|
|
|
|
|
15000
|
|
|
|
1
|
|
|
|
|
970
|
|
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Die with the usage text.  Called as a class method; any extra arguments are
# treated as message lines that are shown, followed by a blank line, ahead of
# the usage summary.  This sub never returns.
sub _do_help {
  my ($class, @messages) = @_;

  # Caller-supplied messages (if any) plus a separating empty line, then the
  # "usage:" heading, all joined with plain newlines.
  my @preamble = @messages ? (@messages, "") : ();
  my $heading  = join qq{\n}, @preamble, "usage:";

  my @usage_lines = (
    "uni SEARCH-TERMS... - find codepoints with matching names or values",
    "uni [-s] ONE-CHARACTER - print the codepoint and name of one character",
    "uni -n SEARCH-TERMS... - find codepoints with matching names",
    "uni -c STRINGS... - print out the codepoints in a string",
    "uni -u CODEPOINTS... - look up and print hex codepoints",
    "uni -x HEX-OCTETS... - given the sequence of octets, in hex, decode",
    "",
    "Other switches:",
    " -8 - also show the UTF-8 bytes to encode\n",
  );

  # Every line after the heading is indented by the join separator.
  die join qq{\n }, $heading, @usage_lines;
}
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Entry point for the command: parse the switches out of the given arguments,
# choose the operating mode, and hand the remaining arguments to the matching
# do_* routine.
#
# Arguments: the invocant class, followed by the command-line arguments.
#
# Dies (through _do_help) with the usage text when help is requested, when the
# switches cannot be parsed, when more than one mode switch is given, or when
# no arguments are left after the switches.
sub run {
  my ($class, @argv) = @_;

  my %opt;
  my $parsed_ok;
  {
    # GetOptions operates on @ARGV, so parse a localized copy and take back
    # whatever it leaves unconsumed.
    local @ARGV = @argv;
    $parsed_ok = GetOptions(
      "c"       => \$opt{explode},
      "u"       => \$opt{u_numbers},
      "n"       => \$opt{names},
      "s"       => \$opt{single},
      "x"       => \$opt{hex_octets},
      "8"       => \$opt{utf8},
      "help|?"  => \$opt{help},
    );
    @argv = @ARGV;
  }

  $class->_do_help if $opt{help};

  # GetOptions has already warned about the offending switch; stop here
  # instead of silently carrying on as if it had not been given.
  $class->_do_help("ERROR: could not parse switches!") unless $parsed_ok;

  my $n = grep { $_ } @opt{qw(explode u_numbers names single hex_octets)};

  $class->_do_help("ERROR: only one mode switch allowed!") if $n > 1;

  $class->_do_help if ! @argv;

  # With no mode switch, a lone one-character argument is looked up as a
  # character; anything else goes to the do-what-I-mean search.
  my $todo = $opt{explode}    ? \&do_explode
           : $opt{u_numbers}  ? \&do_u_numbers
           : $opt{names}      ? \&do_names
           : $opt{single}     ? \&do_single
           : $opt{hex_octets} ? \&do_hex_octets
           : @argv == 1 && length $argv[0] == 1 ? \&do_single
           :                    \&do_dwim;

  $todo->(\@argv, \%opt);
}
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Mode -s: print each argument as a single character.  Empty arguments are
# dropped; any argument longer than one character is a fatal error.
#
# Arguments: an array reference of arguments and the options hash reference.
sub do_single {
  my ($args, $opt) = @_;

  my (@chars, @too_long);
  for my $arg (@$args) {
    next unless length $arg;
    push @chars, $arg;
    push @too_long, $arg if length($arg) > 1;
  }

  die "some arguments were too long for use with -s: @too_long\n"
    if @too_long;

  print_chars(\@chars, $opt);
}
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Mode -c: break every argument string into its characters and print them.
#
# Arguments: an array reference of strings and the options hash reference.
sub do_explode {
  my ($strings, $opt) = @_;

  my $chars = explode_strings($strings);
  print_chars($chars, $opt);
}
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Mode -x: treat each argument as a run of hex-encoded octets, decode the
# concatenated octets as UTF-8, and print every resulting character.
#
# Arguments: an array reference of hex strings and the options hash reference.
# Dies if any argument is not an even-length string of hex digits.
sub do_hex_octets {
  my ($hunks, $opt) = @_;

  my $octets = '';
  for my $hunk (@$hunks) {
    die "input hunk $hunk is not an even-length hex string\n"
      unless $hunk =~ /\A[0-9A-F]+\z/i && length($hunk) % 2 == 0;

    # Two hex digits per octet; the check above guarantees a clean conversion.
    $octets .= pack 'H*', $hunk;
  }

  # The options are an argument of print_chars, not of explode_strings, so
  # that switches such as -8 take effect in this mode as well.
  print_chars( explode_strings([ Encode::decode_utf8($octets) ]), $opt );
}
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Split each string into its individual characters.
#
# Argument: an array reference of strings.  The array is consumed: it is empty
# when this sub returns.
#
# Returns an array reference holding the characters of every string in order,
# with an undef element marking the boundary between consecutive strings.
sub explode_strings {
  my ($strings) = @_;

  my @chars;

  # Loop on the number of strings left, not on the truth of the shifted value,
  # so that a string such as "0" or "" does not end the loop early.
  while (@$strings) {
    my $str = shift @$strings;
    push @chars, split '', $str;
    push @chars, undef if @$strings;
  }

  return \@chars;
}
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# Mode -u: look up each argument as a hex codepoint and print the characters.
#
# Arguments: an array reference of codepoint strings and the options hash
# reference.
sub do_u_numbers {
  my ($points, $opt) = @_;

  my $chars = chars_by_u_numbers($points);
  print_chars($chars, $opt);
}
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# Print one line per character: the character itself (padded out to two
# columns), its codepoint as U+XXXXX, optionally its UTF-8 bytes in hex, and
# its name.
#
# Arguments:
#   $chars - array reference of single-character strings; an undef element
#            prints an empty line instead of a character line
#   $opt   - hash reference of options; only the "utf8" key is read here
sub print_chars {
  my ($chars, $opt) = @_;

  # With -8 each entry also carries the character's UTF-8 encoding; for an
  # undef entry that slot is simply false.
  my @to_print = $opt->{utf8}
               ? (map {; [ $_ => defined && encode_utf8($_) ] } @$chars)
               : (map {; [ $_ ] } @$chars);

  my $width;
  if ($opt->{utf8}) {
    my $max_bytes = 0;
    for my $todo (@to_print) {
      $max_bytes = max($max_bytes, length $todo->[1]);
      last if $max_bytes == 4; # maximum ever
    }

    # Two hex digits per byte plus one space between neighbouring bytes.
    $width = 2 * $max_bytes + $max_bytes - 1;
  }

  for my $todo (@to_print) {
    my ($c, $u) = @$todo;

    unless (defined $c) { print "\n"; next }

    # U+25CC DOTTED CIRCLE
    my $c2 = Unicode::GCString->new(
      $c =~ /\pM/ ? "\x{25CC}$c" : $c
    );
    my $columns = $c2->columns;

    # I'm not 100% sure why I need this in all cases.  It would make sense in
    # some, since for example COMBINING GRAVE beginning a line becomes its
    # own extended grapheme cluster (right?), but why does INVISIBLE TIMES at
    # the beginning of a line take up a column despite being printing width
    # zero?  The world may never know.  Until Tom tells me.
    # -- rjbs, 2014-10-04
    $columns = 1 if $columns == 0; # ???

    # Yeah, probably there's some insane %*0s$ invocation of printf to use
    # here, but... just no. -- rjbs, 2014-10-04
    (my $p = "$c2") =~ s/\v/ /g;

    # Clamp at zero so a cluster wider than two columns does not hand the
    # repetition operator a negative count.
    $p .= (' ' x max(0, 2 - $columns));

    my $chr  = ord($c);

    # viacode returns undef when the codepoint has no name; print an empty
    # name then rather than warning about an uninitialized value.
    my $name = charnames::viacode($chr) // '';

    my $utf8 = $opt->{utf8}
             ? (sprintf " - %${width}s",
                 join q{ }, map {; sprintf '%02X', ord } split //, $u)
             : '';

    printf "%s- U+%05X%s - %s\n", $p, $chr, $utf8, $name;
  }
}
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Convert hex codepoint strings into characters.
#
# Argument: an array reference of strings, each a hex number with an optional
# "U+" prefix in either case (a "0x" or "x" prefix, as understood by hex, is
# accepted too).
#
# Returns an array reference with one character per input string.
# Dies if a string is not a hex number.
sub chars_by_u_numbers {
  my ($points) = @_;

  my @chars;
  for my $point (@$points) {
    # Strip the prefix on a copy so the caller's list is left untouched.
    (my $digits = $point) =~ s/\AU\+//i;

    die "invalid codepoint: $point\n"
      unless $digits =~ /\A(?:0?x)?[0-9a-f][0-9a-f_]*\z/i;

    push @chars, chr hex $digits;
  }

  return \@chars;
}
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# Mode -n: print every character whose name matches all of the search terms.
#
# Arguments: an array reference of search terms and the options hash
# reference.
sub do_names {
  my ($search_terms, $options) = @_;

  my $matches = chars_by_name($search_terms);
  return print_chars($matches, $options);
}
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Find the characters whose names match every search term.
#
# Arguments:
#   $input_terms - array reference of terms.  A term written as /.../ is used
#                  as a case-insensitive regex; any other term is matched
#                  case-insensitively between word boundaries.
#   $arg         - optional hash reference; when its "match_codepoints" key is
#                  true, a term made only of hex digits also matches the
#                  character with that codepoint.
#
# Returns an array reference of characters in the order they appear in perl's
# unicore/Name.pl.  Dies if that file cannot be loaded.
sub chars_by_name {
  my ($input_terms, $arg) = @_;

  my $match_codepoints = $arg && $arg->{match_codepoints};

  my @terms;
  for my $input (@$input_terms) {
    # Strip the /.../ delimiters on a copy, so the caller's list of terms is
    # not rewritten behind its back.
    my $text     = $input;
    my $is_regex = $text =~ s{\A/(.+)/\z}{$1};

    my %term = (pattern => $is_regex ? qr/$text/i : qr/\b$text\b/i);

    $term{ord} = hex $text
      if $match_codepoints && $text =~ /\A[0-9A-Fa-f]+\z/;

    push @terms, \%term;
  }

  state $corpus = do 'unicore/Name.pl';
  unless (defined $corpus) {
    die "couldn't parse unicore/Name.pl: $@" if $@;
    die "couldn't read unicore/Name.pl: $!"  if $!;
    die "unicore/Name.pl returned undef";
  }

  # The separators used by Name.pl depend on the perl version:
  # https://github.com/perl/perl5/commit/b555069b72f93a232deba173dc7bf7892cfa5868
  my ($entry_sep, $field_sep) = "$]" >= 5.031010 ? ("\n\n", "\n") : ("\n", "\t");

  my @chars;
  my %seen;

  ENTRY: for my $entry (split $entry_sep, $corpus) {
    my $i = index($entry, $field_sep);
    next ENTRY if $i < 0;                            # no codepoint/name pair
    next ENTRY if rindex($entry, " ", $i) >= 0;      # no sequences

    my $name = substr($entry, $i+1);
    my $ord  = hex substr($entry, 0, $i);

    # Every term has to match, either by name or (when enabled) by codepoint.
    for my $term (@terms) {
      next ENTRY unless $name =~ $term->{pattern}
                     or defined $term->{ord} && $term->{ord} == $ord;
    }

    # Report each codepoint only once.
    next ENTRY if $seen{$ord}++;
    push @chars, chr $ord;
  }

  return \@chars;
}
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Merge any number of array references into one sorted list of their distinct
# elements.  Returns a new array reference.
sub smerge {
  my %seen;

  # A hash slice collapses duplicates; only the keys matter afterwards.
  @seen{ map { @$_ } @_ } = ();

  my @merged = sort keys %seen;
  return \@merged;
}
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Default mode: search by name, additionally letting a hex-looking term match
# a character by its codepoint, and print whatever is found.
#
# Arguments: an array reference of search terms and the options hash
# reference.
sub do_dwim {
  my ($terms, $options) = @_;

  my %search_arg = (match_codepoints => 1);
  print_chars(chars_by_name($terms, \%search_arg), $options);
}
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
1; |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
__END__ |