File Coverage

blib/lib/App/Greple/charcode.pm

Criterion	Covered	Total	%
statement	38	98	38.7
branch	0	46	0.0
condition	0	5	0.0
subroutine	13	32	40.6
pod	10	18	55.5
total	61	199	30.6

line	stmt	bran	cond	sub	pod	time	code
1							package App::Greple::charcode;
2
3	1			1		240846	use 5.024;
	1					4
4	1			1		5	use warnings;
	1					1
	1					52
5	1			1		486	use utf8;
	1					230
	1					4
6
7							our $VERSION = "0.9909";
8
9							=encoding utf-8
10
11							=head1 NAME
12
13							App::Greple::charcode - greple module to annotate unicode character data
14
15							=for html
16
17
18
19							=head1 SYNOPSIS
20
21							greple -Mcharcode [ module option -- ] [ command option ] ...
22
23							COMMAND OPTION
24							--no-annotate do not print annotation
25							--[no-]align align annotations
26							--align-all align to the same column for all lines
27							--align-side align to the longest line
28
29							PATTERNS
30							--composite find composite character (combining character sequence)
31							--precomposed find precomposed character
32							--combined find both composite and precomposed characters
33							--outstand find --combined and non-ASCII characters
34							--dt=type specify decomposition type
35							--surrogate find character in UTF-16 surrogate pair range
36							--outstand find non-ASCII combining characters
37							-p/-P prop find \p{prop} or \P{prop} characters
38							--ansicode find ANSI terminal control sequences
39
40							MODULE OPTION
41							--column[=#] display column number
42							--visible[=#] display character name
43							--char[=#] display character itself
44							--width[=#] display width
45							--utf8[=#] display UTF-8 encoding
46							--utf16[=#] display UTF-16 encoding
47							--code[=#] display Unicode code point
48							--name[=#] display character name
49							--nfd[=#] display Unicode Normalization Form D
50							--nfc[=#] display Unicode Normalization Form C
51							--nfkd[=#] display Unicode Normalization Form KD
52							--nfkc[=#] display Unicode Normalization Form KC
53							--split[=#] put annotattion for each character
54							--alignto[=#] align annotation to #
55
56
57							--config KEY[=VALUE],...
58
59							greple -Mcc [ module option -- ] [ command option ] ...
60
61							-Mcc alias module for -Mcharcode
62
63							=head1 VERSION
64
65							Version 0.9909
66
67							=head1 DESCRIPTION
68
69							Greple module C<-Mcharcode> (or C<-Mcc> for short) displays
70							information about the matched characters. It can visualize Unicode
71							zero-width combining or hidden characters, which can be useful for
72							examining text containing visually indistinguishable or imperceptible
73							elements.
74
75							The following output, retrieved from this document for non-ASCII
76							characters (C<\P{ASCII}>), shows that the character C<\N{VARIATION
77							SELECTOR-15}> is included after the copyright character. The same
78							character, presumably left over from editing, is also included after a
79							normal ASCII C character.
80
81							$ greple -Mcharcode '\P{ASCII}' charcode.pm
82
83							┌─── 12 \x{fe0e} \N{VARIATION SELECTOR-15}
84							│ ┌─ 14 \x{a9} \N{COPYRIGHT SIGN}
85							│ ├─ 14 \x{fe0e} \N{VARIATION SELECTOR-15}
86							Copyright︎ ©︎ 2025 Kazumasa Utashiro.
87
88							The nasal sound of the K line (カ行) in Japanese is sometimes
89							represented by adding a semivoiced dot to the K line character, and
90							since Unicode does not define a corresponding character, it is
91							represented by combining the original character with a combining
92							character. This module allows you to see how it is done.
93
94							┌───────── 0 \x{30ab} \N{KATAKANA LETTER KA}
95							├───────── 0 \x{309a} \N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}
96							│ ┌─────── 2 \x{30ad} \N{KATAKANA LETTER KI}
97							│ ├─────── 2 \x{309a} \N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}
98							│ │ ┌───── 4 \x{30af} \N{KATAKANA LETTER KU}
99							│ │ ├───── 4 \x{309a} \N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}
100							│ │ │ ┌─── 6 \x{30b1} \N{KATAKANA LETTER KE}
101							│ │ │ ├─── 6 \x{309a} \N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}
102							│ │ │ │ ┌─ 8 \x{30b3} \N{KATAKANA LETTER KO}
103							│ │ │ │ ├─ 8 \x{309a} \N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}
104							カ゚キ゚ク゚ケ゚コ゚
105
106							=for html
107
108
109
110							=head1 COMMAND OPTIONS
111
112							=over 7
113
114							=item B<--annotate>, B<--no-annotate>
115
116							Print annotation or not. Enabled by default, so use C<--no-annotate>
117							to disable it.
118
119							=item B<-->[B]B
120
121							Align annotation or not.
122							Default true.
123
124							=item B<--align-all>
125
126							Align to the same column for all lines
127
128							=item B<--align-side>
129
130							Align to the longest line length, regardless of match position.
131
132							=back
133
134							=head1 PATTERN OPTIONS
135
136							If multiple patterns are given to B, it normally prints only
137							the lines that match all of the patterns. However, for the purposes
138							of this module, it is desirable to display lines that match any of
139							them, so the C<--need=1> option is specified by default.
140
141							If multiple patterns are specified, the strings matching each pattern
142							will be displayed in a different color.
143
144							=over 7
145
146							=item B<--composite>
147
148							Search for composite characters (combining character sequence)
149							composed of base and combining characters.
150
151							=item B<--precomposed>
152
153							Search for precomposed characters (C<\p{Dt=Canonical}>).
154
155							=item B<--combined>
156
157							Find both B and B characters.
158
159							=item B<--dt>=I, B<--decomposition-type>=I
160
161							Specifies the C. It can take three values:
162							C, C (C), or C.
163
164							=item B<--outstand>
165
166							Matches outstanding characters, those are non-ASCII combining
167							characters.
168
169							=item B<--surrogate>
170
171							Matches to characters in UTF-16 surragate pair range (U+10000 to
172							U+10FFFF).
173
174							=item B<-p> I, B<-P> I
175
176							Short cut for C<-E '\p{prop}'> and C<-E '\P{prop}'>.
177
178							You will not be able to use greple's C<-p> option, but it probably
179							won't be a problem. If you must use it, use C<--pargraph>.
180
181							=item B<--ansicode>
182
183							Search ANSI terminal control sequence. Automatically disables C
184							and C `parameter and activates C. Colorized output is`
185							disabled too.
186
187							To be precise, it searches for CSI Control sequences defined in
188							ECMA-48. Pattern is defined as this.
189
190							(?x)
191							# see ECMA-48 5.4 Control sequences
192							(?: \e\[ \| \x9b ) # csi
193							[\x30-\x3f]* # parameter bytes
194							[\x20-\x2f]* # intermediate bytes
195							[\x40-\x7e] # final byte
196
197							=for html
198
199
200
201							=back
202
203							=head1 MODULE OPTIONS and PARAMS
204
205							Module-specific options are specified between C<-Mcharcode> and C<-->.
206
207							greple -Mcharcode --config width,name=0 -- ...
208
209							Parameters can be set in two ways, one using the C<--config> option
210							and the other using dedicated options. See the L
211							section for more information.
212
213							=over 7
214
215							=item B<--config>=I
216
217							Set configuration parameters.
218
219							=item B
220
221							=item B<--column>[=I<#>]
222
223							Show column number.
224							Default C<1>.
225
226							=item B
227
228							=item B<--visible>[=I<#>]
229
230							Display invisible characters in a visible string representation.
231							Default C<0>.
232
233							=item B
234
235							=item B<--char>[=I<#>]
236
237							Show the character itself.
238							Default C<0>.
239
240							=item B
241
242							=item B<--width>[=I<#>]
243
244							Show the width.
245							Default C<0>.
246
247							=item B
248
249							=item B<--utf8>[=I<#>]
250
251							Show the UTF-8 encoding in hex.
252							Default C<0>.
253
254							=item B
255
256							=item B<--utf16>[=I<#>]
257
258							Show the UTF-16 encoding in hex.
259							Default C<0>.
260
261							=item B
262
263							=item B<--code>[=I<#>]
264
265							Show the character code point in hex.
266							Default C<1>.
267
268							=item B, B, B, B
269
270							=item B<--nfd>[=I<#>], B<--nfc>[=I<#>], B<--nfkd>[=I<#>], B<--nfkc>[=I<#>]
271
272							Show the Unicode Normalization Form D, C, KD and KC.
273							See L.
274
275							=item B
276
277							=item B<--name>[=I<#>]
278
279							Show the Unicode name of the character.
280							Default C<1>.
281
282							=item B
283
284							=item B<--split>[=I<#>]
285
286							If a pattern matching multiple characters is given, annotate each
287							character independently.
288
289							=item B=I
290
291							=item B<--alignto>=I
292
293							Align annotation messages. Defaults to C<1>, which aligns to the
294							rightmost column; C<0> means no align; if a value of C<2> or greater
295							is given, it aligns to that numbered column.
296
297							I can be negative; if C<-1> is specified, align to the same
298							column for all lines. If C<-2> is specified, align to the longest
299							line length, regardless of match position.
300
301							=back
302
303							=head1 CONFIGURATION
304
305							Configuration parameters can be set in several ways.
306
307							=head2 MODULE START FUNCTION
308
309							The start function of a module can be specified at the same time as
310							the module declaration.
311
312							greple -Mcharcode::config(alignto=0)
313
314							greple -Mcharcode::config=alignto=80
315
316							=head2 PRIVATE MODULE OPTION
317
318							Module-specific options are specified between C<-Mcharcode> and C<-->.
319
320							greple -Mcharcode --config alignto=80 -- ...
321
322							greple -Mcharcode --alignto=80 -- ...
323
324							=head2 GENERIC MODULE OPTION
325
326							Module-specific C<---config> option can be called by normal command
327							line option C<--charcode::config>.
328
329							greple -Mcharcode --charcode::config alignto=80 ...
330
331							=head1 EXAMPLES
332
333							=head2 HOMOGLYPH
334
335							greple -Mcc -P ASCII --align-side --cm=S t/homoglyph
336
337							=for html
338
339
340
341							=head2 BOX DRAWINGS
342
343							perldoc -m App::ansicolumn::Border \| greple -Mcc --code -- --outstand --mc=10,
344
345							=for html
346
347
348
349							=head2 AYNU ITAK
350
351							greple -Mcc --outstand --split t/ainu.txt
352
353							=for html
354
355
356
357							=head1 INSTALL
358
359							cpanm -n App::Greple::charcode
360
361							=head1 SEE ALSO
362
363							L
364
365							L
366
367							L
368
369							=head1 LICENSE
370
371							Copyright︎ ©︎ 2025 Kazumasa Utashiro.
372
373							This library is free software; you can redistribute it and/or modify
374							it under the same terms as Perl itself.
375
376							=head1 AUTHOR
377
378							Kazumasa Utashiro
379
380							=cut
381
382	1			1		101	use Exporter qw(import);
	1					1
	1					56
383							our @EXPORT_OK = qw(config);
384							our %EXPORT_TAGS = (alias => \@EXPORT_OK);
385
386	1			1		4	use Encode ();
	1					1
	1					12
387	1			1		499	use Getopt::EX::Config;
	1					23323
	1					8
388	1			1		52	use Hash::Util qw(lock_keys);
	1					1
	1					5
389	1			1		46	use Data::Dumper;
	1					2
	1					33
390	1			1		448	use Text::ANSI::Fold::Util qw(ansi_width);
	1					45002
	1					60
391
392	1			1		488	use App::Greple::annotate;
	1					3
	1					297
393
394							our $config = Getopt::EX::Config->new(
395							column => 1,
396							visible => 1,
397							char => 0,
398							width => 0,
399							utf8 => 0,
400							utf16 => 0,
401							nfd => 0,
402							nfc => 0,
403							nfkd => 0,
404							nfkc => 0,
405							code => 0,
406							name => 1,
407							split => \$App::Greple::annotate::config->{split},
408							alignto => \$App::Greple::annotate::config->{alignto},
409							);
410							lock_keys %{$config};
411							my %type = ( '*' => ':1' );
412	0		0	0	0		sub optspec { $_[0] . ( $type{$_[0]} // $type{'*'} // '' ) }
			0
413
414							our %CONFIG_TAGS = (
415							field => [ qw(column visible char width utf8 utf16 code name) ],
416							);
417
418							sub finalize {
419	0			0	0		our($mod, $argv) = @_;
420							$config->deal_with(
421							$argv,
422	0						map(optspec($_), keys %{$config}),
423							'all:1' => sub {
424	0			0			for ($CONFIG_TAGS{field}->@*) {
425	0	0					my $ref = ref $config->{$_} ? $config->{$_} : \$config->{$_};
426	0						$$ref = $_[1];
427							}
428							},
429	0						);
430							}
431
432	1			1		1296	use Unicode::UCD qw(charinfo);
	1					20738
	1					79
433	1			1		8	use Unicode::Normalize;
	1					1
	1					1039
434
435							sub charname {
436	0	0		0	0		local $_ = @_ ? shift : $_;
437	0						s/(.)/name($1)/sger;
	0
438							}
439
440							sub name {
441	0			0	1		my $char = shift;
442	0	0					if (my $info = Unicode::UCD::charinfo(ord($char))) {
443	0						"\\N{" . $info->{name} . "}";
444							} else {
445	0						"[noinfo]";
446							}
447							}
448
449							sub charcode {
450	0	0		0	0		local *_ = @_ ? \$_[0] : \$_;
451	0						s/(.)/code($1)/sger;
	0
452							}
453
454	0			0	1		sub utf8 { encode('UTF-8', @_) }
455	0			0	1		sub utf16 { encode('UTF-16', @_) }
456							sub encode {
457	0			0	0		my $code = shift;
458	0	0					local *_ = @_ ? \$_[0] : \$_;
459	0						Encode::encode($code, $_) =~ s/(.)/code($1)/ger;
	0
460							}
461
462							sub normalize {
463	0			0	0		my $sub = shift;
464	0	0					local *_ = @_ ? \$_[0] : \$_;
465	0						$sub->($_);
466							}
467	0			0	1		sub nfd { charcode normalize \&NFD => @_ }
468	0			0	1		sub nfc { charcode normalize \&NFC => @_ }
469	0			0	1		sub nfkd { charcode normalize \&NFKD => @_ }
470	0			0	1		sub nfkc { charcode normalize \&NFKC => @_ }
471
472							sub code {
473	0			0	1		state $format = [ qw(\x{%02x} \x{%04x}) ];
474	0						my $ord = ord $_[0];
475	0						sprintf $format->[$ord > 0xff], $ord;
476							}
477
478							my %cmap = (
479							"\t" => '\t',
480							"\n" => '\n',
481							"\r" => '\r',
482							"\f" => '\f',
483							"\b" => '\b',
484							"\a" => '\a',
485							"\e" => '\e',
486							);
487
488							sub control {
489	0	0		0	0		local $_ = @_ ? $_[0] : $_;
490	0	0					if (s/\A([\t\n\r\f\b\a\e])/$cmap{$1}/e) {
	0	0
491	0						$_;
492	0						} elsif (s/\A([\x00-\x1f])/sprintf "\\c%c", ord($1)+0x40/e) {
493	0						$_;
494							} else {
495	0						code($_);
496							}
497							}
498
499							my $invisible_re = $ENV{INVISIBLE_RE} = qr/[^\pL\pN\pP\pS]/;
500
501							sub visible {
502	0	0		0	1		local *_ = @_ ? \$_[0] : \$_;
503	0						s{($invisible_re)}{control($1)}ger;
	0
504							}
505
506							sub width {
507	0	0		0	1		local *_ = @_ ? \$_[0] : \$_;
508	0						ansi_width($_);
509							}
510
511							sub describe {
512	0			0	0		(my $column, local $_) = { @_ }->@{ qw(column match) };
513	0						my @s;
514	0	0					push @s, sprintf qw' %3d ' , $column if $config->{column};
515	0	0					push @s, sprintf qw' %s ' , visible if $config->{visible};
516	0	0					push @s, sprintf qw' char="%s" ' , $_ if $config->{char};
517	0	0					push @s, sprintf qw' w=%d ' , width if $config->{width};
518	0	0					push @s, sprintf qw' utf8=%s ' , utf8 if $config->{utf8};
519	0	0					push @s, sprintf qw' utf16=%s ' , utf16 if $config->{utf16};
520	0	0					push @s, sprintf qw' nfd=%s ' , nfd if $config->{nfd};
521	0	0					push @s, sprintf qw' nfc=%s ' , nfc if $config->{nfc};
522	0	0					push @s, sprintf qw' nfkd=%s ' , nfkd if $config->{nfkd};
523	0	0					push @s, sprintf qw' nfkc=%s ' , nfkc if $config->{nfkc};
524	0	0					push @s, sprintf qw' code=%s ' , charcode if $config->{code};
525	0	0					push @s, sprintf qw' name=%s ' , charname if $config->{name};
526	1			1		7	join "\N{NBSP}", @s;
	1					1
	1					8
	0
527							}
528
529							$App::Greple::annotate::ANNOTATE = \&describe;
530
531							1;
532
533							__DATA__