File Coverage

blib/lib/File/FormatIdentification/Regex.pm

Criterion	Covered	Total	%
statement	120	124	96.7
branch	29	34	85.2
condition	13	15	86.6
subroutine	15	15	100.0
pod	0	7	0.0
total	177	195	90.7

line	stmt	bran	cond	sub	pod	time	code
1							package File::FormatIdentification::Regex;
2
3							our $VERSION = '0.07'; # VERSION
4
5							# ABSTRACT helper module to combine and optimize regexes
6
7	2			2		336265	use feature qw(say);
	2					12
	2					265
8	2			2		13	use strict;
	2					18
	2					48
9	2			2		11	use warnings;
	2					4
	2					65
10	2			2		940	use String::LCSS;
	2					890
	2					106
11
12	2			2		1777	use Regexp::Assemble;
	2					43101
	2					75
13	2			2		2353	use Regexp::Optimizer;
	2					2141
	2					65
14	2			2		25	use Carp;
	2					4
	2					158
15	2			2		13	use Exporter 'import'; # gives you Exporter's import() method directly
	2					9
	2					3327
16							our @EXPORT =
17							qw(and_combine or_combine calc_quality simplify_two_or_combined_regex peep_hole_optimizer )
18							; # symbols to export on request
19							our @EXPORT_OK = qw( hex_replace_from_bracket hex_replace_to_bracket );
20
21
22
23
24							sub and_combine (@) {
25							my @rx_groups = map {
26	5			5	0	141	my $rx = $_;
	11					25
27	11					14	my $rxfill = "";
28	11					27	my $ret = '';
29	11	100				39	if ( $rx =~ m#^\^$# ) { $ret = $rx; }
	1	100				6
30	1					2	elsif ( $rx =~ m#^\$$# ) { $ret = $rx; }
31							else {
32	9	100				26	if ( $rx =~ m#\$$# ) {
33	1					3	$rxfill = ".*";
34							}
35	9					20	$ret = "(?=$rxfill$rx)";
36							}
37	11					30	$ret;
38							} @_;
39	5					14	my $combined = join( "", @rx_groups );
40
41							#my $rx = Regexp::Assemble->new;
42							#$rx->add( $combined );
43							#return $rx->as_string;
44							#my $o = Regexp::Optimizer->new;
45							#my $rcomb = qr/$combined/;
46							#return $o->as_string($rcomb);
47	5					45	return $combined;
48							}
49
50							sub or_combine (@) {
51	2			2	0	1345	my $ro = Regexp::Assemble->new;
52	2					166	foreach my $rx (@_) {
53	5					227	$ro->add($rx);
54							}
55	2					212	return $ro->as_string;
56							}
57
58							sub simplify_two_or_combined_regex($$) {
59	8367			8367	0	13484	my $rx1 = $_[0];
60	8367					12761	my $rx2 = $_[1];
61	8367					23636	my $rx = qr#$[A-Za-z0-9]*$#;
62	8367	100	100			97699	return "" if (($rx1 !~ m/$rx/) \|\| ($rx2 !~ m/$rx/));
63							# only left simplify supported yet
64	3828					11049	return String::LCSS::lcss( $rx1, $rx2 );
65							}
66
67							sub hex_replace_to_bracket {
68	1			1	0	1354	my $regex = shift;
69	1					19	$regex =~ s#(?<=\\x)([0-9A-F]{2})#{$1}#g;
70	1					7	return $regex;
71							}
72
73							sub hex_replace_from_bracket {
74	1			1	0	6	my $regex = shift;
75	1					18	$regex =~ s#(?<=\\x)\{([0-9A-F]{2})\}#$1#g;
76	1					6	return $regex;
77							}
78
79							sub peep_hole_optimizer ($) {
80	8182			8182	0	14372	my $regex = $_[0]; # only works if special Regexes within File::FormatIdentification:: used
81
82							#$regex = hex_replace_to_bracket($regex);
83	8182	50				38184	if ($regex =~ m/\\x[0-9]+/) {
84	0					0	confess "regex '$regex' has invalid \\x sequences, use \\x{} instead!";
85							}
86	8182					14274	my $oldregex = $regex;
87							##### first optimize bracket-groups
88	8182					27718	my $subrg =
89							qr#(?:[A-Za-z0-9])\|(?:\\x\{[0-9A-F]{2}\})#; # matches: \x00-\xff or text
90							#my $subrg = qr#(?:$$subra$)#;
91	8182					44591	my $subre = qr#(?:$$subrg(?:\\|$subrg)+$)\|(?:$subrg)#
92							; # matches (…\|…) or (…\|…\|…) ...
93							#$regex =~ s#$\(($subra*)$\)(?!\\|)#(\1\)#g; # matches ((…))
94	8182					38966	$regex =~ s#$\(($subre+)$\)#($1)#g;
95	8182					21723	$regex =~ s#$\((\([^)\|]$(\\|$[^)\|]$)+)\)\)#($1)#g;
96							##### optimize common subsequences
97							##### part1, combine bar\|baz -> ba(r\|z)
98							#say "BEFORE: regex=$regex";
99	8182		100			71928	while (
100							$regex =~ m#$($subrg)$\\|$($subrg)$# \|\|
101							$regex =~ m#($subrg)\\|($subrg)#
102							) {
103	1629					4552	my $rx1 = $1;
104	1629					3332	my $rx2 = $2;
105
106							#say "common subseq: $regex -> rx1=$rx1 rx2=$rx2";
107
108	1629					4769	my $common = String::LCSS::lcss( $rx1, $rx2 );
109	1629	100	66			199279	if ( !defined $common \|\| length($common) == 0 ) { last; }
	641					1709
110	988	100				7135	if ( $common !~ m#^$subrg+$# ) { last; }
	881					2499
111
112							#say "!ok: $regex -> common=$common";
113
114							# common prefix
115	107	100	100			3900	if ( $rx1 =~ m#^(.)$common$# && $rx2 =~ m#^(.)$common$# ) {
		100	66
116
117							#say "suffix found";
118	11					180	$rx1 =~ m#^(.*)$common$#;
119	11					46	my $rx1_prefix = $1;
120	11					154	$rx2 =~ m#^(.*)$common$#;
121	11					39	my $rx2_prefix = $1;
122	11					58	my $subst = "($rx1_prefix\|$rx2_prefix)$common";
123	11	100				304	if ( $regex =~ m#$($subrg)$\\|$($subrg)$# ) {
		50
124	9					857	$regex =~ s#$$subrg$\\|$$subrg$#$subst#g;
125							}
126							elsif ( $regex =~ m#($subrg)\\|($subrg)# ) {
127	2					63	$regex =~ s#$subrg\\|$subrg#$subst#g;
128							}
129							}
130
131							# common suffix
132							elsif ( $rx1 =~ m#^$common(.)$# && $rx2 =~ m#^$common(.)$# ) {
133
134							#say "prefix found";
135	9					110	$rx1 =~ m#^$common(.*)$#;
136	9					33	my $rx1_suffix = $1;
137	9					87	$rx2 =~ m#^$common(.*)$#;
138	9					30	my $rx2_suffix = $1;
139	9					37	my $subst = "$common($rx1_suffix\|$rx2_suffix)";
140
141							#say "subst=$subst";
142	9	100				236	if ( $regex =~ m#$($subrg)$\\|$($subrg)$# ) {
		50
143	7					407	$regex =~ s#$$subrg$\\|$$subrg$#$subst#g;
144							}
145							elsif ( $regex =~ m#($subrg)\\|($subrg)# ) {
146	2					72	$regex =~ s#$subrg\\|$subrg#$subst#g;
147							}
148
149							#say "regex=$regex";
150							}
151							else {
152	87					308	last;
153							}
154							}
155							##### part2, combine barbara -> (bar){2}a
156	8182					514640	while ( $regex =~ m#($subrg{3,}?)(\1+)(?!$subrg*\})# ) {
157	339					1229	my $sub = $1;
158	339	100				2447	if ( $sub =~ m#^($subrg)\1+$# ) {
159	279					750	last;
160							}
161	60					273	my $l1 = length($1);
162	60					170	my $l2 = length($2);
163	60					193	my $matches = 1 + ( $l2 / $l1 );
164
165							#say "Found1 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
166
167	60	50				447	if ( $sub =~ m#^$subrg$# ) {
168	0					0	$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#$sub\{$matches\}#;
169							}
170							else {
171	60					13761	$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#($sub)\{$matches\}#;
172							}
173							}
174							##### part2, combine toooor -> to{4}r
175	8182					143930	while ( $regex =~ m#($subrg)(\1{3,})(?!$subrg*\})# ) {
176	656					1729	my $sub = $1;
177	656					1345	my $l1 = length($1);
178	656					1496	my $l2 = length($2);
179	656					1624	my $matches = 1 + ( $l2 / $l1 );
180
181							#say "Found2 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
182
183	656	50				3269	if ( $sub =~ m#^$subrg$# ) {
184	656					31186	$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#$sub\{$matches\}#;
185							}
186							else {
187	0					0	$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#($sub)\{$matches\}#;
188							}
189							}
190							##### part2, combine foooo -> fo{4}
191							#while ($regex =~ m#($subrg)\1{3,}(?!$subrg*\})#) {
192							# my $sub = $1;
193							# my $matches = $#+; $matches++;
194							# say "Found in regex='$regex' sub='$sub' with matches=$matches";
195							# $regex =~ s#($subrg)\1{3,}(?!$subrg*\}#$sub\{$matches\}#;
196							#}
197
198	8182					43136	return $regex;
199							}
200
201							# calc regex quality, if more specific the quality is higher
202							sub calc_quality ($) {
203	7138			7138	0	13030	my $regex = shift;
204
205							# replace all \xff with #
206							# replace all . with ( \| \| \| \| )
207							# replace all [abc] with (a\|b\|c)
208							# replace all [^abc] with (d\|e\|f\|..\|)
209							# then $len = count of # and $or = count of \|
210							# divide it with $len / (1+$or)
211	7138					11211	my $len = 0;
212	7138					10597	my $alt = 0;
213	7138					42612	while ( $regex =~ s/\\x[0-9a-f]{2}// ) {
214	0					0	$len++;
215							}
216	7138					20222	while ( $regex =~ s/\[\^(.*?)\]// ) {
217	64					202	$alt += ( 256 - length($1) );
218	64					258	$len++;
219							}
220	7138					20457	while ( $regex =~ s/\[(.*?)\]// ) {
221	1297					3212	$alt += length($1);
222	1297					6567	$len++;
223							}
224	7138					26546	while ( $regex =~ s/\.// ) {
225	9877					17968	$alt += 256;
226	9877					35754	$len++;
227							}
228	7138					34670	while ( $regex =~ s/[A-Za-z0-9 ]// ) {
229	267360					1219232	$len++;
230							}
231	7138					15130	my $tmp = $len / ( 1 + $alt );
232
233	7138	100				25929	my $quality = ( $tmp == 0 ) ? 0 : int( 1000 * log($tmp) ) / 1000;
234
235							#say "rest: $regex len=$len alt=$alt quality=$quality ($tmp)";
236	7138					25825	return $quality;
237							}
238
239							# see https://stackoverflow.com/questions/869809/combine-regexp#870506
240
241							1;
242
243							__END__
244
245							=pod
246
247							=encoding UTF-8
248
249							=head1 NAME
250
251							File::FormatIdentification::Regex
252
253							=head1 VERSION
254
255							version 0.07
256
257							=head1 AUTHOR
258
259							Andreas Romeyke <pause@andreas-romeyke.de>
260
261							=head1 COPYRIGHT AND LICENSE
262
263							This software is copyright (c) 2018 by Andreas Romeyke.
264
265							This is free software; you can redistribute it and/or modify it under
266							the same terms as the Perl 5 programming language system itself.
267
268							=cut