File Coverage

blib/lib/File/FormatIdentification/Pronom.pm

Criterion	Covered	Total	%
statement	246	537	45.8
branch	47	106	44.3
condition	29	48	60.4
subroutine	25	44	56.8
pod	0	20	0.0
total	347	755	45.9

line	stmt	bran	cond	sub	pod	time	code
1							package File::FormatIdentification::Pronom;
2
3	1			1		355502	use feature qw(say);
	1					2
	1					133
4	1			1		8	use strict;
	1					2
	1					31
5	1			1		7	use warnings;
	1					2
	1					28
6	1			1		689	use XML::LibXML;
	1					56149
	1					6
7	1			1		160	use Carp;
	1					3
	1					53
8	1			1		6	use List::Util qw( none first );
	1					2
	1					67
9	1			1		5	use Scalar::Util;
	1					2
	1					53
10	1			1		486	use YAML::XS;
	1					2920
	1					57
11	1			1		486	use File::FormatIdentification::Regex;
	1					4
	1					74
12	1			1		642	use Moose;
	1					497022
	1					7
13
14							our $VERSION = '0.05'; # TRIAL VERSION
15
16							# ABSTRACT Perl extension for parsing PRONOM-Signatures using DROID-Signature file
17
18							# Preloaded methods go here.
19							# flattens a regex-structure to a regex-string, expects a signature-pattern and a list of regex-structures
20							# returns regex
21							#
22	1			1		7876	no warnings 'recursion';
	1					3
	1					815
23
24							sub _flatten_rx_recursive ($$$@) {
25	35355			35355		56976	my $regex = shift;
26	35355					52542	my $lastpos = shift;
27	35355					49381	my $open_brackets = shift;
28	35355					69606	my @rx_groups = @_;
29	35355					53553	my $rx = shift @rx_groups;
30
31							#use Data::Printer;
32							#say "_flatten_rx_recursive";
33							#p( @rx_groups );
34							#p( $rx );
35	35355					53718	my $bracket_symbol = "(";
36	35355	50				68854	if ( !defined $regex ) { confess; }
	0					0
37
38	35355	100				68695	if ( !defined $rx ) { # do nothing
39	16551					35602	while ( $open_brackets > 0 ) {
40	2661					5572	$regex .= ")";
41	2661					5605	$open_brackets--;
42							}
43							}
44							else {
45	18804					40628	my $pos_diff = $rx->{position} - $lastpos;
46	18804					31792	my $local_regex = $rx->{regex};
47	18804	50				36127	if ( !defined $local_regex ) {
48	0					0	$local_regex = '';
49							}
50	18804	100				39525	if ( 0 == $pos_diff ) {
		50
		0
51
52							# TODO:
53	5757					16613	File::FormatIdentification::Regex::simplify_two_or_combined_regex(
54							$regex, $local_regex );
55	5757					1038818	$regex =
56							&_flatten_rx_recursive( "$regex\|$local_regex", $lastpos,
57							$open_brackets, @rx_groups );
58							}
59							elsif ( $pos_diff > 0 ) { # is deeper
60							# look a head, if same pos found, then use bracket, otherwise not
61	13047	100	100			54044	if (
			66
62							(
63							scalar @rx_groups > 0
64							&& ( $rx_groups[0]->{position} == $rx->{position} )
65							)
66							\|\| $pos_diff > 1
67							)
68							{ # use (
69							$regex = &_flatten_rx_recursive(
70							"$regex" . ( $bracket_symbol x $pos_diff ) . $local_regex,
71	2661					11004	$rx->{position}, $open_brackets += $pos_diff, @rx_groups );
72							}
73							else {
74							$regex = &_flatten_rx_recursive(
75							"$regex$local_regex", $rx->{position},
76	10386					32282	$open_brackets, @rx_groups
77							);
78							} ## end else [ if ( scalar @rx_groups...)]
79							}
80							elsif ( $pos_diff < 0 ) { # is higher
81							$regex = &_flatten_rx_recursive(
82							"$regex)$local_regex",
83							$rx->{position},
84	0					0	$open_brackets - 1, #($rx->{position} - $lastpos),
85							@rx_groups
86							);
87							}
88							else {
89	0					0	confess
90							"FL: pos=$rx->{position} lastpos=$lastpos regex='$regex' open=$open_brackets\n";
91							}
92							}
93	35355					81048	return $regex;
94							} ## end sub _flatten_rx_recursive ($$$@)
95	1			1		9	use warnings 'recursion';
	1					2
	1					1843
96
97							sub _flatten_rx ($@) {
98	16551			16551		27882	my $regex = shift;
99	16551					31675	my @rx_groups = @_;
100
101							#say "calling flatten_rx with regex=$regex quality=$quality";
102							#use Data::Printer;
103							#p( @rx_groups );
104	16551					34660	$regex = _flatten_rx_recursive( $regex, 0, 0, @rx_groups );
105	16551					33025	return $regex;
106							} ## end sub _flatten_rx ($@)
107
108							# expands pattern of form "FFFB[10:EB]" to FFFB10, FFFB11, ... FFFBEB
109							sub _expand_pattern ($) {
110	18804			18804		30737	my $pattern = $_[0];
111	18804					41373	$pattern =~ s/(?<=\[)!/^/g;
112	18804					31192	$pattern =~ s/(?<=[0-9A-F]{2}):(?=[0-9A-F]{2})\]/-]/g;
113	18804					195572	$pattern =~ s/([0-9A-F]{2})/\\x{$1}/g;
114
115							# substitute hex with printable ASCII-Output
116	18804					86412	$pattern =~ s#\\x\{(3[0-9]\|[46][1-9A-F]\|[57][0-9A])\}#chr( hex($1) );#egs;
	62487					239166
117	18804					49397	return $pattern;
118							} ## end sub _expand_pattern ($)
119
120							# expands offsets min,max to regex ".{$min,$max}" and uses workarounds if $min or $max exceeds 32766
121							sub _expand_offsets($$) {
122	18804			18804		30099	my $minoffset = shift;
123	18804					27401	my $maxoffset = shift;
124	18804					27276	my $byte =
125							'.'; # HINT: needs the character set modifier "aa" in $foo=~m/$regex/aa
126							#my $byte = '[\x00-\xff]';
127	18804					28067	my $offset_expanded = "";
128	18804	100	66			173751	if ( ( ( not defined $minoffset ) \|\| ( length($minoffset) == 0 ) )
		100	33
			66
			33
			66
			66
			100
129							&& ( ( not defined $maxoffset ) \|\| ( length($maxoffset) == 0 ) ) )
130							{
131	111					216	$offset_expanded = "";
132							}
133							elsif (( defined $minoffset )
134							&& ( length($minoffset) > 0 )
135							&& ( defined $maxoffset )
136							&& ( length($maxoffset) > 0 )
137							&& ( $minoffset == $maxoffset ) )
138							{
139	15243	100				33997	if ( $minoffset > 0 ) {
140	3507					7631	my $maxloops = int( $maxoffset / 32766 );
141	3507					6120	my $maxresidual = $maxoffset % 32766;
142	3507					8460	for ( my $i = 0 ; $i < $maxloops ; $i++ ) {
143	3					18	$offset_expanded .= $byte . "{32766}";
144							}
145	3507					8823	$offset_expanded .= $byte . "{$maxresidual}";
146							} ## end if ( $minoffset > 0 )
147							}
148							else {
149
150							# workaround, because perl quantifier limits,
151							# calc How many repetitions we need! Both offsets should be less than 32766
152							#TODO: check if this comes from Droid or is calculated
153
154	3450					6253	my $mintmp = 0;
155	3450					5054	my $maxtmp = 0;
156	3450	50	33			11550	if ( defined $minoffset && ( length($minoffset) > 0 ) ) {
157	3450					6025	$mintmp = $minoffset;
158							}
159	3450	100	66			10189	if ( defined $maxoffset && ( length($maxoffset) > 0 ) ) {
160	2313					3686	$maxtmp = $maxoffset;
161							}
162
163	3450					5488	my $maxloops;
164	3450	100				6689	if ( $maxtmp >= $mintmp ) {
165	3408					7508	$maxloops = int( $maxtmp / 32766 );
166							}
167							else {
168	42					95	$maxloops = int( $mintmp / 32766 );
169							}
170	3450					6320	my $maxresidual = $maxtmp % 32766;
171	3450					5732	my $minresidual = $mintmp % 32766;
172
173							#say "\tMaxloops=$maxloops maxres = $maxresidual minres=$minresidual";
174	3450					5511	my @offsets;
175	3450					5415	my $minstr = "";
176	3450					5322	my $maxstr = "";
177	3450	50	33			11417	if ( defined $minoffset && length($minoffset) > 0 ) {
178	3450					5436	$minstr = $minresidual;
179	3450					5743	$mintmp = $mintmp - $minresidual;
180							}
181
182	3450					8035	for ( my $i = 0 ; $i <= $maxloops ; $i++ ) {
183
184							# loop, so we assure the special handling of residuals
185	4437	100				9674	if ( $maxtmp > $maxresidual ) {
		50
186	1134					1710	$maxstr = 32766;
187							}
188							elsif ( $maxtmp < 0 ) {
189	0					0	$maxstr = 0;
190							}
191							else {
192	3303					5176	$maxstr = $maxresidual;
193							}
194	4437	50				8801	if ( $mintmp > $minresidual ) {
		100
195	0					0	$minstr = 32766;
196							}
197							elsif ( $mintmp < 0 ) {
198	279					428	$minstr = 0;
199							}
200							else {
201	4158					6119	$minstr = $minresidual;
202							}
203							#### handle residuals
204	4437	100				9057	if ( $i == 0 ) {
		100
205	3450					5312	$minstr = $minresidual;
206	3450					5058	$mintmp = $mintmp - $minresidual;
207							}
208							elsif ( $i == $maxloops ) {
209	147					224	$maxstr = $maxresidual;
210	147					292	$maxtmp = $maxtmp - $maxresidual;
211							}
212
213							# mark offsets
214	4437					6281	my $tmp;
215	4437					10100	$tmp->{minoffset} = $minstr;
216	4437					7794	$tmp->{maxoffset} = $maxstr;
217	4437					11527	push @offsets, $tmp;
218							} ## end for ( my $i = 0 ; $i <=...)
219							my @filtered = map {
220	3450	100	66			6862	if ( !defined $maxoffset \|\| length($maxoffset) == 0 ) {
	4437					12944
221	1137					2130	$_->{maxoffset} = "";
222							}
223	4437	50	33			13956	if ( !defined $minoffset \|\| length($minoffset) == 0 ) {
224	0					0	$_->{minoffset} = "";
225							}
226	4437					12145	$_;
227							} @offsets;
228	3450					7615	foreach my $tmp (@filtered) {
229
230							# ? at the end - means non-greedy
231							#$offset_expanded .= $byte."{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}?";
232							$offset_expanded .=
233	4437					18143	$byte . "{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}";
234							} ## end foreach my $tmp (@filtered)
235							} ## end else [ if ( ( ( not defined $minoffset...)))]
236
237							#say "DEBUG: minoffset='$minoffset' maxoffset='$maxoffset' --> offset_expanded='$offset_expanded'";
238
239							# minimization steps
240	18804					35366	$offset_expanded =~ s#{0,}#*#g;
241	18804					29362	$offset_expanded =~ s#{1,}#+#g;
242	18804					28324	$offset_expanded =~ s#{0,1}#?#g;
243	18804					48073	return $offset_expanded;
244							} ## end sub _expand_offsets($$)
245
246							# got XPath-object and returns a regex-structure as hashref
247							sub _parse_fragments ($) {
248	12993			12993		21497	my $fq = shift;
249	12993					29729	my $position = $fq->getAttribute('Position');
250	12993					133192	my $minoffset = $fq->getAttribute('MinOffset');
251	12993					107144	my $maxoffset = $fq->getAttribute('MaxOffset');
252	12993					123005	my $rx = $fq->textContent;
253	12993					27842	my $expanded = _expand_pattern($rx);
254	12993					21431	my $ret;
255	12993					30999	$ret->{position} = $position;
256	12993					22424	$ret->{direction} = "left";
257	12993					21513	$ret->{regex} = "";
258
259	12993					23109	my ($offset_expanded) = _expand_offsets( $minoffset, $maxoffset );
260
261	12993	100				58847	if ( $fq->localname eq "LeftFragment" ) {
		50
262	4788					8970	$ret->{direction} = "left";
263	4788					12903	$ret->{regex} = "($expanded)$offset_expanded";
264							}
265							elsif ( $fq->localname eq "RightFragment" ) {
266	8205					15899	$ret->{direction} = "right";
267	8205					19854	$ret->{regex} = "$offset_expanded($expanded)";
268							}
269
270							#say "pF: rx=$rx expanded=$expanded offset=$offset_expanded";
271	12993					34975	return $ret;
272							} ## end sub _parse_fragments ($)
273
274							# got XPath-object and search direction and returns a regex-structure as hashref
275							sub _parse_subsequence ($$) {
276	5811			5811		9944	my $ssq = shift;
277	5811					10624	my $dir = shift;
278	5811					14917	my $position = $ssq->getAttribute('Position');
279	5811					62113	my $minoffset = $ssq->getAttribute('SubSeqMinOffset');
280	5811					50995	my $maxoffset = $ssq->getAttribute('SubSeqMaxOffset');
281
282	5811					48821	my $rx = $ssq->getElementsByTagName('Sequence')->get_node(1)->textContent;
283
284	5811					302634	my @lnodes = $ssq->getElementsByTagName('LeftFragment');
285	5811					267240	my @rnodes = $ssq->getElementsByTagName('RightFragment');
286	5811					191552	my @lrx_fragments = map { _parse_fragments($_) } @lnodes;
	4788					9312
287	5811					10531	my @rrx_fragments = map { _parse_fragments($_) } @rnodes;
	8205					14891
288	5811					14854	my $lregex = _flatten_rx( "", @lrx_fragments );
289	5811					11716	my $rregex = _flatten_rx( "", @rrx_fragments );
290	5811					11613	my $expanded = _expand_pattern($rx);
291
292							#if ( length($minoffset) > 0
293							# && length($maxoffset) > 0
294							# && $minoffset > $maxoffset ) {
295							# confess(
296							#"parse_subsequence: Maxoffset=$maxoffset < Minoffset=$minoffset! regex= '$rx'"
297							# );
298							# } ## end if ( length($minoffset...))
299
300	5811					12518	my $offset_expanded = _expand_offsets( $minoffset, $maxoffset );
301	5811					18707	my $prefix;
302							my $suffix;
303	5811					0	my $ret;
304	5811					0	my $regex;
305	5811	100	66			22329	if ( !defined $dir \|\| length($dir) == 0 ) {
		100
		50
306	420					1254	$regex = join( "", $lregex, $expanded, $rregex );
307							}
308							elsif ( $dir eq "BOFoffset" ) {
309	4782					14816	$regex =
310							join( "", $offset_expanded, "(", $lregex, $expanded, $rregex, ")" );
311							}
312							elsif ( $dir eq "EOFoffset" ) {
313	609					2130	$regex =
314							join( "", "(", $lregex, $expanded, $rregex, ")", $offset_expanded );
315							}
316							else {
317	0					0	warn "unknown reference '$dir' found\n";
318	0					0	$regex = join( "", $lregex, $expanded, $rregex );
319							}
320							$ret->{regex} =
321	5811					17492	File::FormatIdentification::Regex::peep_hole_optimizer($regex);
322	5811					13557	$ret->{position} = $position;
323
324	5811					32176	return $ret;
325							} ## end sub _parse_subsequence ($$)
326
327							# got XPath-object and returns regex-string
328							sub _parse_bytesequence ($) {
329	4929			4929		9780	my $bsq = shift;
330
331							#say "rx_groups in parse_byte_sequence:";
332	4929					12288	my $reference = $bsq->getAttribute('Reference');
333							; # if BOFoffset -> anchored begin of file, EOFofset -> end of file
334	4929					54221	my @nodes = $bsq->getElementsByTagName('SubSequence');
335	4929					193091	my @rx_groups = map { _parse_subsequence( $_, $reference ) } @nodes;
	5811					27811
336	4929					108712	my $expanded = "";
337	4929					10654	my $regex_flattened = _flatten_rx( $expanded, @rx_groups );
338
339							#my $ro = Regexp::Optimizer->new;
340							#my $ro = Regexp::Assemble->new;
341							#$ro->add( $regex_flattened);
342							#$regex_flattened = $ro->as_string($regex_flattened);
343							#$regex_flattened = $ro->re;
344	4929					8699	my $regex;
345	4929	100	66			20686	if ( !defined $reference \|\| 0 == length($reference) ) {
		100
		50
346	255					515	$regex = "$regex_flattened";
347							}
348							elsif ( $reference eq "BOFoffset" ) {
349	4065					9677	$regex = "\\A$regex_flattened";
350							}
351							elsif ( $reference eq "EOFoffset" ) {
352	609					1561	$regex = "$regex_flattened\\Z";
353							}
354							else {
355	0					0	warn "unknown reference '$reference' found\n";
356	0					0	$regex = "$regex_flattened";
357							}
358
359	1			1		11	use Regexp::Optimizer;
	1					2
	1					2897
360	4929					17514	my $ro = Regexp::Optimizer->new;
361
362							#say "regex='$regex'";
363							#$regex = $ro->as_string( $regex );
364	4929					41560	return $regex;
365							} ## end sub _parse_bytesequence ($)
366
367							# ($%signatures, $%internal) = parse_signaturefile( $file )
368							sub _parse_signaturefile($) {
369	3			3		8	my $pronomfile = shift;
370	3					9	my %signatures;
371
372							# hash{internalid}->{regex} = $regex
373							# ->{signature} = $signature
374							my %internal_signatures;
375
376	3					28	my $dom = XML::LibXML->load_xml( location => $pronomfile );
377	3					212868	$dom->indexElements();
378	3					205	my $xp = XML::LibXML::XPathContext->new($dom);
379	3					53	$xp->registerNs( 'droid',
380							'http://www.nationalarchives.gov.uk/pronom/SignatureFile' );
381
382							# find Fileformats
383							#my $tmp = $xp->find('/*[local-name() = "FFSignatureFile"]')->get_node(1);
384							#say "E:", $tmp->nodeName;
385							#say "EXISTS:", $xp->exists('/droid:FFSignatureFile');
386							#say "EXISTS2", $xp->exists('/droid:FFSignatureFile/droid:FileFormatCollection/droid:FileFormat');
387
388	3					26	my $fmts = $xp->find(
389							'/[local-name() = "FFSignatureFile"]/[local-name() = "FileFormatCollection"]/*[local-name() = "FileFormat"]'
390							);
391	3					7429	foreach my $fmt ( $fmts->get_nodelist() ) {
392	4752					12545	my $id = $fmt->getAttribute('ID');
393	4752					49427	my $mimetype = $fmt->getAttribute('MIMEtype');
394	4752					41159	my $name = $fmt->getAttribute('Name');
395	4752					39923	my $puid = $fmt->getAttribute('PUID');
396	4752					38432	my $version = $fmt->getAttribute('Version');
397							#
398
399							##
400							my @extensions =
401	4752					39472	map { $_->textContent() } $fmt->getElementsByTagName('Extension');
	5679					155898
402							my @internalsignatures =
403	4752					20916	map { $_->textContent() }
	4029					144848
404							$fmt->getElementsByTagName('InternalSignatureID');
405	4752					61233	my @haspriorityover = map { $_->textContent() }
	2229					43558
406							$fmt->getElementsByTagName('HasPriorityOverFileFormatID');
407	4752					135962	$signatures{$id}->{mimetype} = $mimetype;
408	4752					30390	$signatures{$id}->{name} = $name;
409	4752					9792	$signatures{$id}->{puid} = $puid;
410	4752					9584	$signatures{$id}->{version} = $version; # optional
411	4752					10477	$signatures{$id}->{extensions} = \@extensions;
412	4752					9744	$signatures{$id}->{internal_signatures} = \@internalsignatures;
413
414	4752					10242	foreach my $prio (@haspriorityover) {
415	2229					5590	$signatures{$id}->{priorityover}->{$prio} = 1;
416							}
417
418	4752					10189	foreach my $internal (@internalsignatures) {
419	4029					17682	$internal_signatures{$internal}->{signature} = $id;
420							}
421							} ## end foreach my $fmt ( $fmts->get_nodelist...)
422
423							# find InternalSignatures
424	3					438	my $sigs =
425							$xp->find(
426							'/[local-name() = "FFSignatureFile"]/[local-name() = "InternalSignatureCollection"]/*[local-name() = "InternalSignature"]'
427							);
428
429	3					8455	foreach my $sig ( $sigs->get_nodelist() ) {
430
431	4083					59922	my $id = $sig->getAttribute('ID');
432	4083					50951	my $specificity = $sig->getAttribute('Specificity');
433	4083					46817	$internal_signatures{$id}->{specificity} = $specificity;
434
435							#p( $sig->toString() );
436	4083					11118	my @nodes = $sig->getElementsByTagName('ByteSequence');
437
438							#p( @nodes );
439	4083					233931	my @rx_groups = map { _parse_bytesequence($_) } @nodes;
	4929					23312
440							my @rx_quality =
441	4083					61301	map { File::FormatIdentification::Regex::calc_quality($_); }
	4929					11370
442							@rx_groups;
443
444	4083					15031	$internal_signatures{$id}->{regex} = \@rx_groups;
445	4083					18297	$internal_signatures{$id}->{quality} = \@rx_quality;
446							} ## end foreach my $sig ( $sigs->get_nodelist...)
447
448	3					611	return ( \%signatures, \%internal_signatures );
449							} ## end sub _parse_signaturefile($)
450
451							sub uniq_signature_ids_by_priority {
452	0			0	0	0	my $self = shift;
453	0					0	my @signatures = @_;
454	0					0	my %found_signature_ids;
455
456							# which PUIDs are in list?
457	0					0	foreach my $signatureid (@signatures) {
458	0	0				0	if ( defined $signatureid ) {
459	0					0	$found_signature_ids{$signatureid} = 1;
460							}
461							}
462
463							# remove all signatures when actual signature has priority over
464	0					0	foreach my $signatureid ( keys %found_signature_ids ) {
465	0					0	foreach my $priority_over_sid (
466	0					0	keys %{ $self->{signatures}->{$signatureid}->{priorityover} } )
467							{
468	0	0				0	if ( exists $found_signature_ids{$priority_over_sid} ) {
469	0					0	delete $found_signature_ids{$priority_over_sid};
470							}
471							} ## end foreach my $priority_over_sid...
472							} ## end foreach my $signatureid ( keys...)
473
474							# reduce list to all signatures with correct priority
475							my @result =
476	0					0	grep { defined $found_signature_ids{ $_->{signature} } } @signatures;
	0					0
477	0					0	return @result;
478							} ## end sub uniq_signature_ids_by_priority
479
480							has 'droid_signature_filename' => (
481							is => 'ro',
482							required => 1,
483							reader => 'get_droid_signature_filename',
484							trigger => sub {
485							my $self = shift;
486
487							#say "TRIGGER";
488							my $yaml_file = $self->get_droid_signature_filename() . ".yaml";
489							if ( $self->{auto_load} && -e $yaml_file ) {
490							$self->load_from_yamlfile($yaml_file);
491							}
492							else {
493							my ( $signatures, $internal_signatures ) =
494							_parse_signaturefile( $self->{droid_signature_filename} );
495							$self->{signatures} = $signatures;
496							$self->{internal_signatures} = $internal_signatures;
497
498							#die;
499							if ( $self->{auto_store} ) {
500							$self->save_as_yamlfile($yaml_file);
501							}
502							} ## end else [ if ( $self->{auto_load...})]
503							foreach my $s ( keys %{ $self->{signatures} } ) {
504							my $puid = $self->{signatures}->{$s}->{puid};
505							if ( defined $puid && length($puid) > 0 ) {
506							$self->{puids}->{$puid} = $s;
507							}
508							}
509							}
510							);
511
512							sub save_as_yamlfile {
513	2			2	0	9	my $self = shift;
514	2					6	my $filename = shift;
515	2					6	my @res;
516	2					23	push @res, $self->{signatures};
517	2					6	push @res, $self->{internal_signatures};
518	2					27	YAML::XS::DumpFile( "$filename", @res );
519	2					176253	return;
520							} ## end sub save_as_yamlfile
521
522							sub load_from_yamlfile {
523	1			1	0	3	my $self = shift;
524	1					3	my $filename = shift;
525	1					8	my ( $sig, $int ) = YAML::XS::LoadFile($filename);
526	1					50270	$self->{signatures} = $sig;
527	1					4	$self->{internal_signatures} = $int;
528	1					5	return;
529							} ## end sub load_from_yamlfile
530
531							has 'auto_store' => (
532							is => 'ro',
533							default => 1,
534							);
535
536							has 'auto_load' => (
537							is => 'ro',
538							default => 1,
539							);
540
541							sub get_all_signature_ids {
542	0			0	0		my $self = shift;
543	0						my @sigs = sort { $a <=> $b } keys %{ $self->{signatures} };
	0
	0
544	0						return @sigs;
545							}
546
547							sub get_signature_id_by_puid {
548	0			0	0		my $self = shift;
549	0						my $puid = shift;
550	0						my $sig = $self->{puids}->{$puid};
551	0						return $sig;
552							}
553
554							sub get_internal_ids_by_puid {
555	0			0	0		my $self = shift;
556	0						my $puid = shift;
557	0						my $sig = $self->get_signature_id_by_puid($puid);
558	0						my @ids = ();
559	0	0					if ( defined $sig ) {
560	0						@ids = grep { defined $_ }
561	0						@{ $self->{signatures}->{$sig}->{internal_signatures} };
	0
562							}
563	0						return @ids;
564							}
565
566							sub get_file_endings_by_puid {
567	0			0	0		my $self = shift;
568	0						my $puid = shift;
569	0						my $sig = $self->get_signature_id_by_puid($puid);
570	0						my @endings = ();
571	0	0					if ( defined $sig ) {
572	0						@endings = $self->{signatures}->{$sig}->{extensions};
573							}
574	0						return @endings;
575							}
576
577							sub get_all_internal_ids {
578	0			0	0		my $self = shift;
579	0						my @ids = sort { $a <=> $b } keys %{ $self->{internal_signatures} };
	0
	0
580	0						foreach my $id (@ids) {
581	0	0					if ( !defined $id ) { confess("$id not defined") }
	0
582							}
583	0						return @ids;
584							}
585
586							sub get_all_puids {
587	0			0	0		my $self = shift;
588							my @ids =
589	0						sort grep { defined $_ }
590	0						map { $self->{signatures}->{$_}->{puid}; }
591	0						grep { defined $_ } $self->get_all_signature_ids();
	0
592	0						return @ids;
593							}
594
595							sub get_regular_expressions_by_internal_id {
596	0			0	0		my $self = shift;
597	0						my $internalid = shift;
598	0	0					if ( !defined $internalid ) { confess("internalid must exists!"); }
	0
599	0						my @rx = @{ $self->{internal_signatures}->{$internalid}->{regex} };
	0
600	0						return @rx;
601							}
602
603							sub get_all_regular_expressions {
604	0			0	0		my $self = shift;
605	0						my @ids = $self->get_all_internal_ids();
606	0						my @regexes = ();
607	0						foreach my $id (@ids) {
608	0						my @rx = $self->get_regular_expressions_by_internal_id($id);
609	0						push @regexes, @rx;
610							}
611	0						my @ret = sort @regexes;
612	0						return @ret;
613							}
614
615							sub get_qualities_by_internal_id {
616	0			0	0		my $self = shift;
617	0						my $internalid = shift;
618	0	0					if ( !defined $internalid ) { confess("internalid must exists!"); }
	0
619	0						my $value = $self->{internal_signatures}->{$internalid}->{quality};
620	0	0					if ( defined $value ) {
621	0						return @{$value};
	0
622							}
623	0						return;
624							}
625
626							sub get_signature_id_by_internal_id {
627	0			0	0		my $self = shift;
628	0						my $internalid = shift;
629	0	0					if ( !defined $internalid ) { confess("internalid must exists!"); }
	0
630	0						return $self->{internal_signatures}->{$internalid}->{signature};
631							}
632
633							sub get_name_by_signature_id {
634	0			0	0		my $self = shift;
635	0						my $signature = shift;
636	0						return $self->{signatures}->{$signature}->{name};
637							}
638
639							sub get_puid_by_signature_id {
640	0			0	0		my $self = shift;
641	0						my $signature = shift;
642	0						return $self->{signatures}->{$signature}->{puid};
643							}
644
645							sub get_puid_by_internal_id {
646	0			0	0		my $self = shift;
647	0						my $internalid = shift;
648	0	0					if ( !defined $internalid ) { confess("internalid must exists!"); }
	0
649	0						my $signature = $self->get_signature_id_by_internal_id($internalid);
650	0						return $self->get_puid_by_signature_id($signature);
651							}
652
653							sub get_quality_sorted_internal_ids {
654	0			0	0		my $self = shift;
655							my @ids = sort {
656
657							# sort by regexes
658	0						my @a_rxq = @{ $self->{internal_signatures}->{$a}->{quality} };
	0
	0
659	0						my @b_rxq = @{ $self->{internal_signatures}->{$b}->{quality} };
	0
660	0						my $aq = 0;
661	0						foreach my $as (@a_rxq) { $aq += $as; }
	0
662	0						my $bq = 0;
663	0						foreach my $bs (@b_rxq) { $bq += $bs; }
	0
664
665							#use Data::Printer;
666							#p( $a );
667							#p( $aq );
668	0						$aq <=> $bq;
669							} $self->get_all_internal_ids();
670	0						return @ids;
671							}
672
673							sub get_combined_regex_by_puid {
674	0			0	0		my $self = shift;
675	0						my $puid = shift;
676	0						my @internals = $self->get_internal_ids_by_puid($puid);
677
678							#use Data::Printer;
679							#p( $puid );
680							#p( @internals );
681							my @regexes = map {
682	0						my @regexes_per_internal =
	0
683							$self->get_regular_expressions_by_internal_id($_);
684	0						my $combined =
685							File::FormatIdentification::Regex::and_combine(@regexes_per_internal);
686
687							#p( $combined );
688	0						$combined;
689							} @internals;
690	0						my $result = File::FormatIdentification::Regex::or_combine(@regexes);
691
692							#p( $result );
693	0						return $result;
694							}
695
696							sub _prepare_statistics {
697	0			0			my $self = shift;
698	0						my $results;
699
700							# count of PUIDs
701							# count of internal ids (IDs per PUID)
702							# count of regexes
703							# count of file endings only
704							# count of internal ids without PUID
705							# larges and shortest regex
706							# complex and simple regex
707							# common regexes
708							#say "stat";
709	0						my @puids = $self->get_all_puids();
710	0						my $puids = scalar(@puids);
711	0						my @internals = $self->get_all_internal_ids();
712	0						my $internals = scalar(@internals);
713	0						my $regexes = 0;
714	0						my $fileendingsonly = 0;
715	0						my @fileendingsonly = ();
716	0						my $fileendings = 0;
717	0						my $int_per_puid = 0;
718	0						my $internal_without_puid = 0;
719	0						my @internal_without_puid = ();
720	0						my @quality_sorted_internal_ids = $self->get_quality_sorted_internal_ids();
721	0						my %uniq_regexes;
722
723	0						foreach my $internalid (@internals) {
724	0						my @regexes =
725							$self->get_regular_expressions_by_internal_id($internalid);
726	0						foreach my $rx (@regexes) {
727	0						my @tmp = ();
728	0	0					if ( exists $uniq_regexes{$rx} ) {
729	0						@tmp = @{ $uniq_regexes{$rx} };
	0
730							}
731	0						push @tmp, $internalid;
732	0						$uniq_regexes{$rx} = \@tmp;
733							}
734
735	0						$regexes += scalar(@regexes);
736	0						my $sigid = $self->get_signature_id_by_internal_id($internalid);
737	0	0					if ( !defined $sigid ) {
738	0						$internal_without_puid++;
739	0						push @internal_without_puid, $internalid;
740							}
741							}
742	0						foreach my $puid (@puids) {
743	0						my @ints = $self->get_internal_ids_by_puid($puid);
744	0						my @fileendings = $self->get_file_endings_by_puid($puid);
745	0	0					if ( 0 == scalar(@ints) ) {
746	0						$fileendingsonly++;
747	0						push @fileendingsonly, $puid;
748							}
749							else {
750	0						$fileendings += scalar(@fileendings);
751	0						$int_per_puid += scalar(@ints);
752							}
753							}
754	0						foreach my $i (@quality_sorted_internal_ids) {
755	0						my $regex =
756							join( "#", $self->get_regular_expressions_by_internal_id($i) );
757	0						my $quality = join( " ", $self->get_qualities_by_internal_id($i) );
758
759							}
760
761	0						$results->{filename} = $self->get_droid_signature_filename();
762	0						$results->{count_of_puids} = $puids;
763	0						$results->{count_of_internal_ids} = $internals;
764	0						$results->{count_of_regular_expressions} = $regexes;
765	0						$results->{count_of_fileendings} = $fileendings;
766	0						$results->{count_of_puid_with_fileendings_only} = $fileendingsonly;
767	0						$results->{puids_with_fileendings_only} = \@fileendingsonly;
768	0						$results->{count_of_orphaned_internal_ids} = $internal_without_puid;
769	0						$results->{internal_ids_without_puids} = \@internal_without_puid;
770	1			1		9	no warnings;
	1					3
	1					1469
771
772	0						for ( my $i = 0 ; $i <= 4 ; $i++ ) {
773	0						my $best_quality_internal = pop @quality_sorted_internal_ids;
774	0	0					if ( defined $best_quality_internal ) {
775	0						my $best_quality = join( ";",
776							$self->get_qualities_by_internal_id($best_quality_internal) );
777	0						my $best_puid =
778							$self->get_puid_by_internal_id($best_quality_internal);
779	0						my $best_name =
780							$self->get_name_by_signature_id(
781							$self->get_signature_id_by_internal_id($best_quality_internal)
782							);
783	0						my $best_regex = $self->get_combined_regex_by_puid($best_puid);
784							$results->{nth_best_quality}->[$i]->{internal_id} =
785	0						$best_quality_internal;
786	0						$results->{nth_best_quality}->[$i]->{puid} = $best_puid;
787	0						$results->{nth_best_quality}->[$i]->{name} = $best_name;
788	0						$results->{nth_best_quality}->[$i]->{quality} = $best_quality;
789	0						$results->{nth_best_quality}->[$i]->{combined_regex} = $best_regex;
790							}
791							}
792	0						for ( my $i = 0 ; $i <= 4 ; $i++ ) {
793	0						my $worst_quality_internal = shift @quality_sorted_internal_ids;
794	0	0					if ( defined $worst_quality_internal ) {
795	0						my $worst_quality = join( ";",
796							$self->get_qualities_by_internal_id($worst_quality_internal) );
797	0						my $worst_puid =
798							$self->get_puid_by_internal_id($worst_quality_internal);
799	0						my $worst_name =
800							$self->get_name_by_signature_id(
801							$self->get_signature_id_by_internal_id($worst_quality_internal)
802							);
803	0						my $worst_regex = $self->get_combined_regex_by_puid($worst_puid);
804							$results->{nth_worst_quality}->[$i]->{internal_id} =
805	0						$worst_quality_internal;
806	0						$results->{nth_worst_quality}->[$i]->{puid} = $worst_puid;
807	0						$results->{nth_worst_quality}->[$i]->{name} = $worst_name;
808	0						$results->{nth_worst_quality}->[$i]->{quality} = $worst_quality;
809							$results->{nth_worst_quality}->[$i]->{combined_regex} =
810	0						$worst_regex;
811							}
812							}
813							my @multiple_used_regex = grep {
814	0						my $tmp = $uniq_regexes{$_};
	0
815	0						my @tmp = @{$tmp};
	0
816	0						scalar(@tmp) > 1
817							} sort keys %uniq_regexes;
818	0						$results->{count_of_multiple_used_regex} = scalar(@multiple_used_regex);
819	0						for ( my $i = 0 ; $i <= $#multiple_used_regex ; $i++ ) {
820							$results->{multiple_used_regex}->[$i]->{regex} =
821	0						$multiple_used_regex[$i];
822	0						my @ids = join( ",", @{ $uniq_regexes{ $multiple_used_regex[$i] } } );
	0
823	0						$results->{multiple_used_regex}->[$i]->{internal_ids} = \@ids;
824							}
825	0						return $results;
826							}
827
828							sub print_csv_statistics {
829	0			0	0		my $self = shift;
830	0						my $csv_file = shift;
831	0						my $results = $self->_prepare_statistics();
832	0						my $version = $results->{filename};
833	0						$version =~ s/DROID_SignatureFile_V(\d+)\.xml/$1/;
834	0						$results->{version} = $version;
835	0						$results->{best_quality_puid} = $results->{nth_best_quality}->[0]->{puid};
836							$results->{best_quality_internal_id} =
837	0						$results->{nth_best_quality}->[0]->{internal_id};
838							$results->{best_quality_quality} =
839	0						$results->{nth_best_quality}->[0]->{quality};
840							$results->{best_quality_combined_regex} =
841	0						$results->{nth_best_quality}->[0]->{combined_regex};
842	0						$results->{worst_quality_puid} = $results->{nth_worst_quality}->[0]->{puid};
843							$results->{worst_quality_internal_id} =
844	0						$results->{nth_worst_quality}->[0]->{internal_id};
845							$results->{worst_quality_quality} =
846	0						$results->{nth_worst_quality}->[0]->{quality};
847							$results->{worst_quality_combined_regex} =
848	0						$results->{nth_worst_quality}->[0]->{combined_regex};
849
850	0						my @headers =
851							qw(version filename count_of_puids count_of_internal_ids count_of_regular_expressions count_of_fileendings count_of_puid_with_fileendings_only count_of_orphaned_internal_ids count_of_multiple_used_regex best_quality_puid best_quality_internal_id best_quality_quality best_quality_combined_regex worst_quality_puid worst_quality_internal_id worst_quality_quality worst_quality_combined_regex);
852	0						my $file_exists = (-e $csv_file);
853	0	0					open (my $FH, ">>", "$csv_file") or croak "Can't open file '$csv_file', $0";
854	0	0					if (not $file_exists) {
855	0						say $FH "#", join( ",", @headers );
856							}
857							say $FH join(
858							",",
859							map {
860	0						my $result = $results->{$_};
	0
861	0	0					if ( !defined $result ) { $result = ""; }
	0
862	0						"\"$result\"";
863							} @headers
864							);
865	0						close ($FH);
866	0						return;
867							}
868
869							sub print_statistics {
870	0			0	0		my $self = shift;
871	0						my $verbose = shift;
872	0						my $results = $self->_prepare_statistics();
873
874	0						say "Statistics of file $results->{filename}";
875	0						say "=======================================";
876	0						say "";
877	0						say "Countings";
878	0						say "---------------------------------------";
879	0						say "Count of PUIDs: $results->{count_of_puids}";
880	0						say
881							" internal IDs: $results->{count_of_internal_ids}";
882	0						say
883							" regular expressions: $results->{count_of_regular_expressions}";
884	0						say
885							" file endings: $results->{count_of_fileendings}";
886	0						say
887							" PUIDs with file endings only: $results->{count_of_puid_with_fileendings_only}";
888
889	0	0					if ( defined $verbose ) {
890							say " (",
891	0						join( ", ", sort @{ $results->{puids_with_fileendings_only} } ), ")";
	0
892							}
893							say
894	0						" orphaned internal IDs: $results->{count_of_orphaned_internal_ids}";
895	0	0					if ( defined $verbose ) {
896							say " (",
897	0						join( ", ", sort {$a <=> $b} @{ $results->{internal_ids_without_puids} } ), ")";
	0
	0
898							}
899	0						say "";
900	0						say "Quality of internal IDs";
901	0						say "---------------------------------------";
902
903	0						my $nth = 1;
904	0						foreach my $n ( @{ $results->{nth_best_quality} } ) {
	0
905	0						say
906							"$nth-best quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
907	0	0					if ( defined $verbose ) {
908	0						say " combined regex: ", $n->{combined_regex};
909							}
910	0						$nth++;
911							}
912	0						say "";
913	0						$nth = 1;
914	0						foreach my $n ( @{ $results->{nth_worst_quality} } ) {
	0
915	0						say
916							"$nth-worst quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
917	0	0					if ( defined $verbose ) {
918	0						say " combined regex: ", $n->{combined_regex};
919							}
920	0						$nth++;
921							}
922	0						say "";
923
924	0						say "";
925	0						say "Regular expressions";
926	0						say "---------------------------------------";
927	0						say
928							"Count of multiple used regular expressions: $results->{count_of_multiple_used_regex}";
929	0	0					if ( defined $verbose ) {
930	0						for ( my $i = 0 ; $i < $results->{count_of_multiple_used_regex} ; $i++ )
931							{
932	0						say " common regex group no $i:";
933							say " regex='"
934	0						. $results->{multiple_used_regex}->[$i]->{regex} . "'";
935							say " internal IDs: ",
936	0						join( ",", @{ $results->{multiple_used_regex}->[$i]->{internal_ids} } );
	0
937							}
938							}
939	0						say "";
940
941							#my @rx = $self->get_all_regular_expressions();
942							#use Data::Printer;
943							#p( %uniq_regexes );
944	0						return;
945							}
946
947							1;
948
949	1			1		10	no Moose;
	1					3
	1					7
950							__PACKAGE__->meta->make_immutable;
951							1;
952
953							__END__
954
955							=pod
956
957							=encoding UTF-8
958
959							=head1 NAME
960
961							File::FormatIdentification::Pronom
962
963							=head1 VERSION
964
965							version 0.05
966
967							=head1 SYNOPSIS
968
969							use File::FormatIdentification::Pronom;
970							my $pronomfile = "Droid-Signature.xml";
971							my ( $signatures, $internals ) = parse_signaturefile($pronomfile);
972
973							=head1 DESCRIPTION
974
975							The module allows to handle Droid signatures. Droid is a utility which
976							uses the PRONOM database to identify file formats.
977
978							See https://www.nationalarchives.gov.uk/PRONOM/ for details.
979
980							With this module you could:
981
982							=over
983
984							=item convert Droid signatures to Perl regular expressions
985
986							=item analyze files and display which/where pattern of Droid signature matches via tag-files for wxHexEditor
987
988							=item calc statistics about Droid signatures
989
990							=back
991
992							The module is in early alpha state and should not be used in production.
993
994							=head2 Examples
995
996							=head3 Colorize wxHexeditor fields
997
998							See example file F<bin/pronom2wxhexeditor.pl>. This colorizes the hex-blob to check PRONOM pattern matches for a given file.
999
1000							=head3 Identify file
1001
1002							There are better tools for the job, but as a proof of concept certainly not bad: Identifying the file type of a file.
1003
1004							my $pronom = File::FormatIdentification::Pronom->new(
1005							"droid_signature_filename" => $pronomfile
1006							);
1007							# .. $filestream is a scalar representing a file
1008							foreach my $internalid ( $pronom->get_all_internal_ids() ) {
1009							my $sig = $pronom->get_signature_id_by_internal_id($internalid);
1010							my $puid = $pronom->get_puid_by_signature_id($sig);
1011							my $name = $pronom->get_name_by_signature_id($sig);
1012							my $quality = $pronom->get_qualities_by_internal_id($internalid);
1013							my @regexes = $pronom->get_regular_expressions_by_internal_id($internalid);
1014							if ( all {$filestream =~ m/$_/saa} @regexes ) {
1015							say "$binaryfile identified as $name with PUID $puid (regex quality $quality)";
1016							}
1017							}
1018
1019							See example file F<bin/pronomidentify.pl> for a full working script.
1020
1021							=head3 Get PRONOM Statistics
1022
1023							To get a feeling for which signatures need to be revised in PRONOM, or why which file formats are difficult to recognize,
1024							you can get detailed statistics for given signature files.
1025
1026							In the blog entry under L<https://kulturreste.blogspot.com/2018/10/heres-tool-make-it-work.html> the statistic report is presented in more detail.
1027
1028							=head2 EXPORT
1029
1030							None by default.
1031
1032							=head1 NAME
1033
1034							File::FormatIdentification::Pronom - Perl extension for parsing PRONOM-Signatures using DROID-Signature file
1035
1036							=head1 SEE ALSO
1037
1038							L<File::FormatIdentification::Regex>
1039
1040							=head1 AUTHOR
1041
1042							Andreas Romeyke L<pause@andreas-romeyke.de>
1043
1044							=head1 COPYRIGHT AND LICENSE
1045
1046							Copyright (C) 2018/19/20 by Andreas Romeyke
1047
1048							This library is free software; you can redistribute it and/or modify
1049							it under the same terms as Perl itself, either Perl version 5.24.1 or,
1050							at your option, any later version of Perl 5 you may have available.
1051
1052							The droid-signature file in t/ is from L<https://www.nationalarchives.gov.uk/PRONOM/Default.aspx>
1053							and without guarantee, it does not look like it is legally protected. If there are any legal claims,
1054							please let me know that I can remove them from the distribution.
1055
1056							=head1 BUGS
1057
1058							=over
1059
1060							=item Some droid recipes results in PCREs which are greedy and therefore the running
1061							time could be exponential with size of binary file.
1062
1063							=back
1064
1065							=head1 CONTRIBUTING
1066
1067							Please feel free to send me comments and patches to my email address. You can clone the modules
1068							from L<https://art1pirat.spdns.org/art1/File-FormatIdentification-Pronom> and send me merge requests.
1069
1070							=head1 AUTHOR
1071
1072							Andreas Romeyke <pause@andreas-romeyke.de>
1073
1074							=head1 COPYRIGHT AND LICENSE
1075
1076							This software is copyright (c) 2018 by Andreas Romeyke.
1077
1078							This is free software; you can redistribute it and/or modify it under
1079							the same terms as the Perl 5 programming language system itself.
1080
1081							=cut