File Coverage

blib/lib/File/FormatIdentification/Regex.pm

Criterion	Covered	Total	%
statement	120	124	96.7
branch	29	34	85.2
condition	13	15	86.6
subroutine	15	15	100.0
pod	0	7	0.0
total	177	195	90.7

line	stmt	bran	cond	sub	pod	time	code
1							package File::FormatIdentification::Regex;
2
3							our $VERSION = '0.05'; # TRIAL VERSION
4
5							# ABSTRACT helper module to combine and optimize regexes
6
7	2			2		330743	use feature qw(say);
	2					12
	2					244
8	2			2		14	use strict;
	2					4
	2					50
9	2			2		12	use warnings;
	2					4
	2					60
10	2			2		876	use String::LCSS;
	2					845
	2					97
11
12	2			2		1677	use Regexp::Assemble;
	2					42987
	2					85
13	2			2		933	use Regexp::Optimizer;
	2					2253
	2					69
14	2			2		16	use Carp;
	2					6
	2					120
15	2			2		13	use Exporter 'import'; # gives you Exporter's import() method directly
	2					4
	2					3189
16							our @EXPORT =
17							qw(and_combine or_combine calc_quality simplify_two_or_combined_regex peep_hole_optimizer )
18							; # symbols to export on request
19							our @EXPORT_OK = qw( hex_replace_from_bracket hex_replace_to_bracket );
20
21
22
23
24							sub and_combine (@) {
25							my @rx_groups = map {
26	5			5	0	138	my $rx = $_;
	11					23
27	11					28	my $rxfill = "";
28	11					32	my $ret = '';
29	11	100				38	if ( $rx =~ m#^\^$# ) { $ret = $rx; }
	1	100				3
30	1					2	elsif ( $rx =~ m#^\$$# ) { $ret = $rx; }
31							else {
32	9	100				27	if ( $rx =~ m#\$$# ) {
33	1					2	$rxfill = ".*";
34							}
35	9					23	$ret = "(?=$rxfill$rx)";
36							}
37	11					30	$ret;
38							} @_;
39	5					14	my $combined = join( "", @rx_groups );
40
41							#my $rx = Regexp::Assemble->new;
42							#$rx->add( $combined );
43							#return $rx->as_string;
44							#my $o = Regexp::Optimizer->new;
45							#my $rcomb = qr/$combined/;
46							#return $o->as_string($rcomb);
47	5					34	return $combined;
48							}
49
50							sub or_combine (@) {
51	2			2	0	1241	my $ro = Regexp::Assemble->new;
52	2					173	foreach my $rx (@_) {
53	5					224	$ro->add($rx);
54							}
55	2					202	return $ro->as_string;
56							}
57
58							sub simplify_two_or_combined_regex($$) {
59	5757			5757	0	9492	my $rx1 = $_[0];
60	5757					8965	my $rx2 = $_[1];
61	5757					15470	my $rx = qr#$[A-Za-z0-9]*$#;
62	5757	100	100			66511	return "" if (($rx1 !~ m/$rx/) \|\| ($rx2 !~ m/$rx/));
63							# only left simplify supported yet
64	3396					9760	return String::LCSS::lcss( $rx1, $rx2 );
65							}
66
67							sub hex_replace_to_bracket {
68	1			1	0	1350	my $regex = shift;
69	1					18	$regex =~ s#(?<=\\x)([0-9A-F]{2})#{$1}#g;
70	1					7	return $regex;
71							}
72
73							sub hex_replace_from_bracket {
74	1			1	0	3	my $regex = shift;
75	1					12	$regex =~ s#(?<=\\x)\{([0-9A-F]{2})\}#$1#g;
76	1					6	return $regex;
77							}
78
79							sub peep_hole_optimizer ($) {
80	5836			5836	0	11344	my $regex = $_[0]; # only works if special Regexes within File::FormatIdentification:: used
81
82							#$regex = hex_replace_to_bracket($regex);
83	5836	50				27342	if ($regex =~ m/\\x[0-9]+/) {
84	0					0	confess "regex '$regex' has invalid \\x sequences, use \\x{} instead!";
85							}
86	5836					10320	my $oldregex = $regex;
87							##### first optimize bracket-groups
88	5836					19816	my $subrg =
89							qr#(?:[A-Za-z0-9])\|(?:\\x\{[0-9A-F]{2}\})#; # matches: \x00-\xff or text
90							#my $subrg = qr#(?:$$subra$)#;
91	5836					29985	my $subre = qr#(?:$$subrg(?:\\|$subrg)+$)\|(?:$subrg)#
92							; # matches (…\|…) or (…\|…\|…) ...
93							#$regex =~ s#$\(($subra*)$\)(?!\\|)#(\1\)#g; # matches ((…))
94	5836					27438	$regex =~ s#$\(($subre+)$\)#($1)#g;
95	5836					16532	$regex =~ s#$\((\([^)\|]$(\\|$[^)\|]$)+)\)\)#($1)#g;
96							##### optimize common subsequences
97							##### part1, combine bar\|baz -> ba(r\|z)
98							#say "BEFORE: regex=$regex";
99	5836		100			51047	while (
100							$regex =~ m#$($subrg)$\\|$($subrg)$# \|\|
101							$regex =~ m#($subrg)\\|($subrg)#
102							) {
103	1101					2973	my $rx1 = $1;
104	1101					2595	my $rx2 = $2;
105
106							#say "common subseq: $regex -> rx1=$rx1 rx2=$rx2";
107
108	1101					3538	my $common = String::LCSS::lcss( $rx1, $rx2 );
109	1101	100	66			131450	if ( !defined $common \|\| length($common) == 0 ) { last; }
	527					1394
110	574	100				4466	if ( $common !~ m#^$subrg+$# ) { last; }
	464					1556
111
112							#say "!ok: $regex -> common=$common";
113
114							# common prefix
115	110	100	100			3892	if ( $rx1 =~ m#^(.)$common$# && $rx2 =~ m#^(.)$common$# ) {
		100	66
116
117							#say "suffix found";
118	11					169	$rx1 =~ m#^(.*)$common$#;
119	11					39	my $rx1_prefix = $1;
120	11					159	$rx2 =~ m#^(.*)$common$#;
121	11					36	my $rx2_prefix = $1;
122	11					56	my $subst = "($rx1_prefix\|$rx2_prefix)$common";
123	11	100				294	if ( $regex =~ m#$($subrg)$\\|$($subrg)$# ) {
		50
124	9					890	$regex =~ s#$$subrg$\\|$$subrg$#$subst#g;
125							}
126							elsif ( $regex =~ m#($subrg)\\|($subrg)# ) {
127	2					76	$regex =~ s#$subrg\\|$subrg#$subst#g;
128							}
129							}
130
131							# common suffix
132							elsif ( $rx1 =~ m#^$common(.)$# && $rx2 =~ m#^$common(.)$# ) {
133
134							#say "prefix found";
135	9					106	$rx1 =~ m#^$common(.*)$#;
136	9					33	my $rx1_suffix = $1;
137	9					93	$rx2 =~ m#^$common(.*)$#;
138	9					26	my $rx2_suffix = $1;
139	9					36	my $subst = "$common($rx1_suffix\|$rx2_suffix)";
140
141							#say "subst=$subst";
142	9	100				211	if ( $regex =~ m#$($subrg)$\\|$($subrg)$# ) {
		50
143	7					449	$regex =~ s#$$subrg$\\|$$subrg$#$subst#g;
144							}
145							elsif ( $regex =~ m#($subrg)\\|($subrg)# ) {
146	2					82	$regex =~ s#$subrg\\|$subrg#$subst#g;
147							}
148
149							#say "regex=$regex";
150							}
151							else {
152	90					289	last;
153							}
154							}
155							##### part2, combine barbara -> (bar){2}a
156	5836					372979	while ( $regex =~ m#($subrg{3,}?)(\1+)(?!$subrg*\})# ) {
157	300					1173	my $sub = $1;
158	300	100				2070	if ( $sub =~ m#^($subrg)\1+$# ) {
159	264					709	last;
160							}
161	36					117	my $l1 = length($1);
162	36					80	my $l2 = length($2);
163	36					110	my $matches = 1 + ( $l2 / $l1 );
164
165							#say "Found1 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
166
167	36	50				256	if ( $sub =~ m#^$subrg$# ) {
168	0					0	$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#$sub\{$matches\}#;
169							}
170							else {
171	36					5998	$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#($sub)\{$matches\}#;
172							}
173							}
174							##### part2, combine toooor -> to{4}r
175	5836					99038	while ( $regex =~ m#($subrg)(\1{3,})(?!$subrg*\})# ) {
176	581					1529	my $sub = $1;
177	581					1285	my $l1 = length($1);
178	581					1382	my $l2 = length($2);
179	581					1519	my $matches = 1 + ( $l2 / $l1 );
180
181							#say "Found2 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
182
183	581	50				2802	if ( $sub =~ m#^$subrg$# ) {
184	581					29703	$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#$sub\{$matches\}#;
185							}
186							else {
187	0					0	$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#($sub)\{$matches\}#;
188							}
189							}
190							##### part2, combine foooo -> fo{4}
191							#while ($regex =~ m#($subrg)\1{3,}(?!$subrg*\})#) {
192							# my $sub = $1;
193							# my $matches = $#+; $matches++;
194							# say "Found in regex='$regex' sub='$sub' with matches=$matches";
195							# $regex =~ s#($subrg)\1{3,}(?!$subrg*\}#$sub\{$matches\}#;
196							#}
197
198	5836					31610	return $regex;
199							}
200
201							# calc regex quality, if more specific the quality is higher
202							sub calc_quality ($) {
203	4939			4939	0	8893	my $regex = shift;
204
205							# replace all \xff with #
206							# replace all . with ( \| \| \| \| )
207							# replace all [abc] with (a\|b\|c)
208							# replace all [^abc] with (d\|e\|f\|..\|)
209							# then $len = count of # and $or = count of \|
210							# divide it with $len / (1+$or)
211	4939					8632	my $len = 0;
212	4939					8014	my $alt = 0;
213	4939					28848	while ( $regex =~ s/\\x[0-9a-f]{2}// ) {
214	0					0	$len++;
215							}
216	4939					14704	while ( $regex =~ s/\[\^(.*?)\]// ) {
217	67					210	$alt += ( 256 - length($1) );
218	67					275	$len++;
219							}
220	4939					14422	while ( $regex =~ s/\[(.*?)\]// ) {
221	1060					2678	$alt += length($1);
222	1060					5100	$len++;
223							}
224	4939					18073	while ( $regex =~ s/\.// ) {
225	7537					13793	$alt += 256;
226	7537					28227	$len++;
227							}
228	4939					23715	while ( $regex =~ s/[A-Za-z0-9 ]// ) {
229	189579					823088	$len++;
230							}
231	4939					10993	my $tmp = $len / ( 1 + $alt );
232
233	4939	100				18368	my $quality = ( $tmp == 0 ) ? 0 : int( 1000 * log($tmp) ) / 1000;
234
235							#say "rest: $regex len=$len alt=$alt quality=$quality ($tmp)";
236	4939					17403	return $quality;
237							}
238
239							# see https://stackoverflow.com/questions/869809/combine-regexp#870506
240
241							1;
242
243							__END__
244
245							=pod
246
247							=encoding UTF-8
248
249							=head1 NAME
250
251							File::FormatIdentification::Regex
252
253							=head1 VERSION
254
255							version 0.05
256
257							=head1 AUTHOR
258
259							Andreas Romeyke <pause@andreas-romeyke.de>
260
261							=head1 COPYRIGHT AND LICENSE
262
263							This software is copyright (c) 2018 by Andreas Romeyke.
264
265							This is free software; you can redistribute it and/or modify it under
266							the same terms as the Perl 5 programming language system itself.
267
268							=cut