File Coverage

blib/lib/File/FormatIdentification/Regex.pm

Criterion	Covered	Total	%
statement	120	124	96.7
branch	29	34	85.2
condition	13	15	86.6
subroutine	15	15	100.0
pod	0	7	0.0
total	177	195	90.7

line	stmt	bran	cond	sub	pod	time	code
1							package File::FormatIdentification::Regex;
2
3							our $VERSION = '0.06'; # VERSION
4
5							# ABSTRACT helper module to combine and optimize regexes
6
7	2			2		355532	use feature qw(say);
	2					11
	2					264
8	2			2		31	use strict;
	2					6
	2					58
9	2			2		11	use warnings;
	2					4
	2					69
10	2			2		1035	use String::LCSS;
	2					886
	2					117
11
12	2			2		1862	use Regexp::Assemble;
	2					45989
	2					85
13	2			2		1172	use Regexp::Optimizer;
	2					2421
	2					71
14	2			2		15	use Carp;
	2					5
	2					134
15	2			2		16	use Exporter 'import'; # gives you Exporter's import() method directly
	2					4
	2					3337
16							our @EXPORT =
17							qw(and_combine or_combine calc_quality simplify_two_or_combined_regex peep_hole_optimizer )
18							; # symbols to export on request
19							our @EXPORT_OK = qw( hex_replace_from_bracket hex_replace_to_bracket );
20
21
22
23
24							sub and_combine (@) {
25							my @rx_groups = map {
26	5			5	0	147	my $rx = $_;
	11					23
27	11					19	my $rxfill = "";
28	11					31	my $ret = '';
29	11	100				40	if ( $rx =~ m#^\^$# ) { $ret = $rx; }
	1	100				4
30	1					3	elsif ( $rx =~ m#^\$$# ) { $ret = $rx; }
31							else {
32	9	100				26	if ( $rx =~ m#\$$# ) {
33	1					3	$rxfill = ".*";
34							}
35	9					20	$ret = "(?=$rxfill$rx)";
36							}
37	11					29	$ret;
38							} @_;
39	5					17	my $combined = join( "", @rx_groups );
40
41							#my $rx = Regexp::Assemble->new;
42							#$rx->add( $combined );
43							#return $rx->as_string;
44							#my $o = Regexp::Optimizer->new;
45							#my $rcomb = qr/$combined/;
46							#return $o->as_string($rcomb);
47	5					35	return $combined;
48							}
49
50							sub or_combine (@) {
51	2			2	0	1489	my $ro = Regexp::Assemble->new;
52	2					173	foreach my $rx (@_) {
53	5					220	$ro->add($rx);
54							}
55	2					197	return $ro->as_string;
56							}
57
58							sub simplify_two_or_combined_regex($$) {
59	5757			5757	0	9756	my $rx1 = $_[0];
60	5757					8941	my $rx2 = $_[1];
61	5757					16131	my $rx = qr#$[A-Za-z0-9]*$#;
62	5757	100	100			68156	return "" if (($rx1 !~ m/$rx/) \|\| ($rx2 !~ m/$rx/));
63							# only left simplify supported yet
64	3396					10546	return String::LCSS::lcss( $rx1, $rx2 );
65							}
66
67							sub hex_replace_to_bracket {
68	1			1	0	1371	my $regex = shift;
69	1					18	$regex =~ s#(?<=\\x)([0-9A-F]{2})#{$1}#g;
70	1					7	return $regex;
71							}
72
73							sub hex_replace_from_bracket {
74	1			1	0	4	my $regex = shift;
75	1					12	$regex =~ s#(?<=\\x)\{([0-9A-F]{2})\}#$1#g;
76	1					7	return $regex;
77							}
78
79							sub peep_hole_optimizer ($) {
80	5836			5836	0	10047	my $regex = $_[0]; # only works if special Regexes within File::FormatIdentification:: used
81
82							#$regex = hex_replace_to_bracket($regex);
83	5836	50				26663	if ($regex =~ m/\\x[0-9]+/) {
84	0					0	confess "regex '$regex' has invalid \\x sequences, use \\x{} instead!";
85							}
86	5836					10560	my $oldregex = $regex;
87							##### first optimize bracket-groups
88	5836					19750	my $subrg =
89							qr#(?:[A-Za-z0-9])\|(?:\\x\{[0-9A-F]{2}\})#; # matches: \x00-\xff or text
90							#my $subrg = qr#(?:$$subra$)#;
91	5836					32666	my $subre = qr#(?:$$subrg(?:\\|$subrg)+$)\|(?:$subrg)#
92							; # matches (…\|…) or (…\|…\|…) ...
93							#$regex =~ s#$\(($subra*)$\)(?!\\|)#(\1\)#g; # matches ((…))
94	5836					30108	$regex =~ s#$\(($subre+)$\)#($1)#g;
95	5836					18687	$regex =~ s#$\((\([^)\|]$(\\|$[^)\|]$)+)\)\)#($1)#g;
96							##### optimize common subsequences
97							##### part1, combine bar\|baz -> ba(r\|z)
98							#say "BEFORE: regex=$regex";
99	5836		100			51467	while (
100							$regex =~ m#$($subrg)$\\|$($subrg)$# \|\|
101							$regex =~ m#($subrg)\\|($subrg)#
102							) {
103	1101					3144	my $rx1 = $1;
104	1101					3046	my $rx2 = $2;
105
106							#say "common subseq: $regex -> rx1=$rx1 rx2=$rx2";
107
108	1101					3879	my $common = String::LCSS::lcss( $rx1, $rx2 );
109	1101	100	66			131301	if ( !defined $common \|\| length($common) == 0 ) { last; }
	527					1358
110	574	100				4253	if ( $common !~ m#^$subrg+$# ) { last; }
	464					1581
111
112							#say "!ok: $regex -> common=$common";
113
114							# common prefix
115	110	100	100			3823	if ( $rx1 =~ m#^(.)$common$# && $rx2 =~ m#^(.)$common$# ) {
		100	66
116
117							#say "suffix found";
118	11					184	$rx1 =~ m#^(.*)$common$#;
119	11					38	my $rx1_prefix = $1;
120	11					157	$rx2 =~ m#^(.*)$common$#;
121	11					47	my $rx2_prefix = $1;
122	11					54	my $subst = "($rx1_prefix\|$rx2_prefix)$common";
123	11	100				302	if ( $regex =~ m#$($subrg)$\\|$($subrg)$# ) {
		50
124	9					860	$regex =~ s#$$subrg$\\|$$subrg$#$subst#g;
125							}
126							elsif ( $regex =~ m#($subrg)\\|($subrg)# ) {
127	2					63	$regex =~ s#$subrg\\|$subrg#$subst#g;
128							}
129							}
130
131							# common suffix
132							elsif ( $rx1 =~ m#^$common(.)$# && $rx2 =~ m#^$common(.)$# ) {
133
134							#say "prefix found";
135	9					116	$rx1 =~ m#^$common(.*)$#;
136	9					32	my $rx1_suffix = $1;
137	9					98	$rx2 =~ m#^$common(.*)$#;
138	9					26	my $rx2_suffix = $1;
139	9					39	my $subst = "$common($rx1_suffix\|$rx2_suffix)";
140
141							#say "subst=$subst";
142	9	100				227	if ( $regex =~ m#$($subrg)$\\|$($subrg)$# ) {
		50
143	7					472	$regex =~ s#$$subrg$\\|$$subrg$#$subst#g;
144							}
145							elsif ( $regex =~ m#($subrg)\\|($subrg)# ) {
146	2					89	$regex =~ s#$subrg\\|$subrg#$subst#g;
147							}
148
149							#say "regex=$regex";
150							}
151							else {
152	90					318	last;
153							}
154							}
155							##### part2, combine barbara -> (bar){2}a
156	5836					370830	while ( $regex =~ m#($subrg{3,}?)(\1+)(?!$subrg*\})# ) {
157	300					1000	my $sub = $1;
158	300	100				2081	if ( $sub =~ m#^($subrg)\1+$# ) {
159	264					731	last;
160							}
161	36					115	my $l1 = length($1);
162	36					99	my $l2 = length($2);
163	36					126	my $matches = 1 + ( $l2 / $l1 );
164
165							#say "Found1 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
166
167	36	50				258	if ( $sub =~ m#^$subrg$# ) {
168	0					0	$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#$sub\{$matches\}#;
169							}
170							else {
171	36					6046	$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#($sub)\{$matches\}#;
172							}
173							}
174							##### part2, combine toooor -> to{4}r
175	5836					99946	while ( $regex =~ m#($subrg)(\1{3,})(?!$subrg*\})# ) {
176	581					1614	my $sub = $1;
177	581					1229	my $l1 = length($1);
178	581					1469	my $l2 = length($2);
179	581					1421	my $matches = 1 + ( $l2 / $l1 );
180
181							#say "Found2 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
182
183	581	50				3167	if ( $sub =~ m#^$subrg$# ) {
184	581					29056	$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#$sub\{$matches\}#;
185							}
186							else {
187	0					0	$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#($sub)\{$matches\}#;
188							}
189							}
190							##### part2, combine foooo -> fo{4}
191							#while ($regex =~ m#($subrg)\1{3,}(?!$subrg*\})#) {
192							# my $sub = $1;
193							# my $matches = $#+; $matches++;
194							# say "Found in regex='$regex' sub='$sub' with matches=$matches";
195							# $regex =~ s#($subrg)\1{3,}(?!$subrg*\}#$sub\{$matches\}#;
196							#}
197
198	5836					30086	return $regex;
199							}
200
201							# calc regex quality, if more specific the quality is higher
202							sub calc_quality ($) {
203	4939			4939	0	9398	my $regex = shift;
204
205							# replace all \xff with #
206							# replace all . with ( \| \| \| \| )
207							# replace all [abc] with (a\|b\|c)
208							# replace all [^abc] with (d\|e\|f\|..\|)
209							# then $len = count of # and $or = count of \|
210							# divide it with $len / (1+$or)
211	4939					8152	my $len = 0;
212	4939					8550	my $alt = 0;
213	4939					30536	while ( $regex =~ s/\\x[0-9a-f]{2}// ) {
214	0					0	$len++;
215							}
216	4939					14693	while ( $regex =~ s/\[\^(.*?)\]// ) {
217	67					205	$alt += ( 256 - length($1) );
218	67					281	$len++;
219							}
220	4939					14987	while ( $regex =~ s/\[(.*?)\]// ) {
221	1060					2738	$alt += length($1);
222	1060					5093	$len++;
223							}
224	4939					18655	while ( $regex =~ s/\.// ) {
225	7537					13515	$alt += 256;
226	7537					28633	$len++;
227							}
228	4939					23333	while ( $regex =~ s/[A-Za-z0-9 ]// ) {
229	189579					832874	$len++;
230							}
231	4939					11097	my $tmp = $len / ( 1 + $alt );
232
233	4939	100				19291	my $quality = ( $tmp == 0 ) ? 0 : int( 1000 * log($tmp) ) / 1000;
234
235							#say "rest: $regex len=$len alt=$alt quality=$quality ($tmp)";
236	4939					17863	return $quality;
237							}
238
239							# see https://stackoverflow.com/questions/869809/combine-regexp#870506
240
241							1;
242
243							__END__
244
245							=pod
246
247							=encoding UTF-8
248
249							=head1 NAME
250
251							File::FormatIdentification::Regex
252
253							=head1 VERSION
254
255							version 0.06
256
257							=head1 AUTHOR
258
259							Andreas Romeyke <pause@andreas-romeyke.de>
260
261							=head1 COPYRIGHT AND LICENSE
262
263							This software is copyright (c) 2018 by Andreas Romeyke.
264
265							This is free software; you can redistribute it and/or modify it under
266							the same terms as the Perl 5 programming language system itself.
267
268							=cut