| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package File::FormatIdentification::Pronom; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 362485 | use feature qw(say); | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 146 |  | 
| 4 | 1 |  |  | 1 |  | 8 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 22 |  | 
| 5 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 11 |  | 
|  | 1 |  |  |  |  | 40 |  | 
| 6 | 1 |  |  | 1 |  | 774 | use XML::LibXML; | 
|  | 1 |  |  |  |  | 59618 |  | 
|  | 1 |  |  |  |  | 7 |  | 
| 7 | 1 |  |  | 1 |  | 166 | use Carp; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 51 |  | 
| 8 | 1 |  |  | 1 |  | 6 | use List::Util qw( none first ); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 64 |  | 
| 9 | 1 |  |  | 1 |  | 6 | use Scalar::Util; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 10 | 1 |  |  | 1 |  | 614 | use YAML::XS; | 
|  | 1 |  |  |  |  | 2962 |  | 
|  | 1 |  |  |  |  | 56 |  | 
| 11 | 1 |  |  | 1 |  | 577 | use File::FormatIdentification::Regex; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 70 |  | 
| 12 | 1 |  |  | 1 |  | 12 | use v5.21; # special regex syntax introduced with 5.21 needed! | 
|  | 1 |  |  |  |  | 3 |  | 
| 13 | 1 |  |  | 1 |  | 668 | use Moose; | 
|  | 1 |  |  |  |  | 504666 |  | 
|  | 1 |  |  |  |  | 5 |  | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | our $VERSION = '0.06'; # VERSION | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | # ABSTRACT Perl extension for parsing PRONOM-Signatures using DROID-Signature file | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | # Preloaded methods go here. | 
| 20 |  |  |  |  |  |  | # flattens a regex-structure to a regex-string, expects a signature-pattern and a list of regex-structures | 
| 21 |  |  |  |  |  |  | # returns regex | 
| 22 |  |  |  |  |  |  | # | 
| 23 | 1 |  |  | 1 |  | 7790 | no warnings 'recursion'; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 502 |  | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | sub _flatten_rx_recursive ($$$@) { | 
| 26 | 35355 |  |  | 35355 |  | 58641 | my $regex         = shift; | 
| 27 | 35355 |  |  |  |  | 56567 | my $lastpos       = shift; | 
| 28 | 35355 |  |  |  |  | 50427 | my $open_brackets = shift; | 
| 29 | 35355 |  |  |  |  | 66272 | my @rx_groups     = @_; | 
| 30 | 35355 |  |  |  |  | 53374 | my $rx            = shift @rx_groups; | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | #use Data::Printer; | 
| 33 |  |  |  |  |  |  | #say "_flatten_rx_recursive"; | 
| 34 |  |  |  |  |  |  | #p( @rx_groups ); | 
| 35 |  |  |  |  |  |  | #p( $rx ); | 
| 36 | 35355 |  |  |  |  | 56834 | my $bracket_symbol = "("; | 
| 37 | 35355 | 50 |  |  |  | 68545 | if ( !defined $regex ) { confess; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 38 |  |  |  |  |  |  |  | 
| 39 | 35355 | 100 |  |  |  | 68258 | if ( !defined $rx ) {    # do nothing | 
| 40 | 16551 |  |  |  |  | 35090 | while ( $open_brackets > 0 ) { | 
| 41 | 2661 |  |  |  |  | 6207 | $regex .= ")"; | 
| 42 | 2661 |  |  |  |  | 5749 | $open_brackets--; | 
| 43 |  |  |  |  |  |  | } | 
| 44 |  |  |  |  |  |  | } | 
| 45 |  |  |  |  |  |  | else { | 
| 46 | 18804 |  |  |  |  | 41162 | my $pos_diff    = $rx->{position} - $lastpos; | 
| 47 | 18804 |  |  |  |  | 30421 | my $local_regex = $rx->{regex}; | 
| 48 | 18804 | 50 |  |  |  | 35210 | if ( !defined $local_regex ) { | 
| 49 | 0 |  |  |  |  | 0 | $local_regex = ''; | 
| 50 |  |  |  |  |  |  | } | 
| 51 | 18804 | 100 |  |  |  | 42520 | if ( 0 == $pos_diff ) { | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | # TODO: | 
| 54 | 5757 |  |  |  |  | 17198 | File::FormatIdentification::Regex::simplify_two_or_combined_regex( | 
| 55 |  |  |  |  |  |  | $regex, $local_regex ); | 
| 56 | 5757 |  |  |  |  | 1023384 | $regex = | 
| 57 |  |  |  |  |  |  | &_flatten_rx_recursive( "$regex|$local_regex", $lastpos, | 
| 58 |  |  |  |  |  |  | $open_brackets, @rx_groups ); | 
| 59 |  |  |  |  |  |  | } | 
| 60 |  |  |  |  |  |  | elsif ( $pos_diff > 0 ) {    # is deeper | 
| 61 |  |  |  |  |  |  | # look a head, if same pos found, then use bracket, otherwise not | 
| 62 | 13047 | 100 | 100 |  |  | 56252 | if ( | 
|  |  |  | 66 |  |  |  |  | 
| 63 |  |  |  |  |  |  | ( | 
| 64 |  |  |  |  |  |  | scalar @rx_groups > 0 | 
| 65 |  |  |  |  |  |  | && ( $rx_groups[0]->{position} == $rx->{position} ) | 
| 66 |  |  |  |  |  |  | ) | 
| 67 |  |  |  |  |  |  | || $pos_diff > 1 | 
| 68 |  |  |  |  |  |  | ) | 
| 69 |  |  |  |  |  |  | {    # use ( | 
| 70 |  |  |  |  |  |  | $regex = &_flatten_rx_recursive( | 
| 71 |  |  |  |  |  |  | "$regex" . ( $bracket_symbol x $pos_diff ) . $local_regex, | 
| 72 | 2661 |  |  |  |  | 10882 | $rx->{position}, $open_brackets += $pos_diff, @rx_groups ); | 
| 73 |  |  |  |  |  |  | } | 
| 74 |  |  |  |  |  |  | else { | 
| 75 |  |  |  |  |  |  | $regex = &_flatten_rx_recursive( | 
| 76 |  |  |  |  |  |  | "$regex$local_regex", $rx->{position}, | 
| 77 | 10386 |  |  |  |  | 33866 | $open_brackets,       @rx_groups | 
| 78 |  |  |  |  |  |  | ); | 
| 79 |  |  |  |  |  |  | } ## end else [ if ( scalar @rx_groups...)] | 
| 80 |  |  |  |  |  |  | } | 
| 81 |  |  |  |  |  |  | elsif ( $pos_diff < 0 ) {    # is higher | 
| 82 |  |  |  |  |  |  | $regex = &_flatten_rx_recursive( | 
| 83 |  |  |  |  |  |  | "$regex)$local_regex", | 
| 84 |  |  |  |  |  |  | $rx->{position}, | 
| 85 | 0 |  |  |  |  | 0 | $open_brackets - 1,    #($rx->{position} - $lastpos), | 
| 86 |  |  |  |  |  |  | @rx_groups | 
| 87 |  |  |  |  |  |  | ); | 
| 88 |  |  |  |  |  |  | } | 
| 89 |  |  |  |  |  |  | else { | 
| 90 | 0 |  |  |  |  | 0 | confess | 
| 91 |  |  |  |  |  |  | "FL: pos=$rx->{position} lastpos=$lastpos regex='$regex' open=$open_brackets\n"; | 
| 92 |  |  |  |  |  |  | } | 
| 93 |  |  |  |  |  |  | } | 
| 94 | 35355 |  |  |  |  | 82539 | return $regex; | 
| 95 |  |  |  |  |  |  | } ## end sub _flatten_rx_recursive ($$$@) | 
| 96 | 1 |  |  | 1 |  | 8 | use warnings 'recursion'; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 1880 |  | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | sub _flatten_rx ($@) { | 
| 99 | 16551 |  |  | 16551 |  | 27214 | my $regex     = shift; | 
| 100 | 16551 |  |  |  |  | 32072 | my @rx_groups = @_; | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | #say "calling flatten_rx with regex=$regex quality=$quality"; | 
| 103 |  |  |  |  |  |  | #use Data::Printer; | 
| 104 |  |  |  |  |  |  | #p( @rx_groups ); | 
| 105 | 16551 |  |  |  |  | 33685 | $regex = _flatten_rx_recursive( $regex, 0, 0, @rx_groups ); | 
| 106 | 16551 |  |  |  |  | 33558 | return $regex; | 
| 107 |  |  |  |  |  |  | } ## end sub _flatten_rx ($@) | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | # expands pattern of form "FFFB[10:EB]" to FFFB10, FFFB11, ... FFFBEB | 
| 110 |  |  |  |  |  |  | sub _expand_pattern ($) { | 
| 111 | 18804 |  |  | 18804 |  | 32367 | my $pattern = $_[0]; | 
| 112 | 18804 |  |  |  |  | 43258 | $pattern =~ s/(?<=\[)!/^/g; | 
| 113 | 18804 |  |  |  |  | 33310 | $pattern =~ s/(?<=[0-9A-F]{2}):(?=[0-9A-F]{2})\]/-]/g; | 
| 114 | 18804 |  |  |  |  | 198046 | $pattern =~ s/([0-9A-F]{2})/\\x{$1}/g; | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | # substitute hex with printable ASCII-Output | 
| 117 | 18804 |  |  |  |  | 89084 | $pattern =~ s#\\x\{(3[0-9]|[46][1-9A-F]|[57][0-9A])\}#chr( hex($1) );#egs; | 
|  | 62487 |  |  |  |  | 241220 |  | 
| 118 | 18804 |  |  |  |  | 48378 | return $pattern; | 
| 119 |  |  |  |  |  |  | } ## end sub _expand_pattern ($) | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | # expands offsets min,max to regex ".{$min,$max}" and uses workarounds if $min or $max exceeds 32766 | 
| 122 |  |  |  |  |  |  | sub _expand_offsets($$) { | 
| 123 | 18804 |  |  | 18804 |  | 33470 | my $minoffset = shift; | 
| 124 | 18804 |  |  |  |  | 28345 | my $maxoffset = shift; | 
| 125 | 18804 |  |  |  |  | 28028 | my $byte = | 
| 126 |  |  |  |  |  |  | '.';    # HINT: needs the character set modifier "aa" in $foo=~m/$regex/aa | 
| 127 |  |  |  |  |  |  | #my $byte = '[\x00-\xff]'; | 
| 128 | 18804 |  |  |  |  | 28948 | my $offset_expanded = ""; | 
| 129 | 18804 | 100 | 66 |  |  | 176665 | if (   ( ( not defined $minoffset ) || ( length($minoffset) == 0 ) ) | 
|  |  | 100 | 33 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
|  |  |  | 100 |  |  |  |  | 
| 130 |  |  |  |  |  |  | && ( ( not defined $maxoffset ) || ( length($maxoffset) == 0 ) ) ) | 
| 131 |  |  |  |  |  |  | { | 
| 132 | 111 |  |  |  |  | 227 | $offset_expanded = ""; | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  | elsif (( defined $minoffset ) | 
| 135 |  |  |  |  |  |  | && ( length($minoffset) > 0 ) | 
| 136 |  |  |  |  |  |  | && ( defined $maxoffset ) | 
| 137 |  |  |  |  |  |  | && ( length($maxoffset) > 0 ) | 
| 138 |  |  |  |  |  |  | && ( $minoffset == $maxoffset ) ) | 
| 139 |  |  |  |  |  |  | { | 
| 140 | 15243 | 100 |  |  |  | 34363 | if ( $minoffset > 0 ) { | 
| 141 | 3507 |  |  |  |  | 7765 | my $maxloops    = int( $maxoffset / 32766 ); | 
| 142 | 3507 |  |  |  |  | 6500 | my $maxresidual = $maxoffset % 32766; | 
| 143 | 3507 |  |  |  |  | 9162 | for ( my $i = 0 ; $i < $maxloops ; $i++ ) { | 
| 144 | 3 |  |  |  |  | 13 | $offset_expanded .= $byte . "{32766}"; | 
| 145 |  |  |  |  |  |  | } | 
| 146 | 3507 |  |  |  |  | 8639 | $offset_expanded .= $byte . "{$maxresidual}"; | 
| 147 |  |  |  |  |  |  | } ## end if ( $minoffset > 0 ) | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  | else { | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | # workaround, because perl quantifier limits, | 
| 152 |  |  |  |  |  |  | #  calc How many repetitions we need! Both offsets should be less than 32766 | 
| 153 |  |  |  |  |  |  | #TODO: check if this comes from Droid or is calculated | 
| 154 |  |  |  |  |  |  |  | 
| 155 | 3450 |  |  |  |  | 6153 | my $mintmp = 0; | 
| 156 | 3450 |  |  |  |  | 4921 | my $maxtmp = 0; | 
| 157 | 3450 | 50 | 33 |  |  | 11824 | if ( defined $minoffset && ( length($minoffset) > 0 ) ) { | 
| 158 | 3450 |  |  |  |  | 5987 | $mintmp = $minoffset; | 
| 159 |  |  |  |  |  |  | } | 
| 160 | 3450 | 100 | 66 |  |  | 10667 | if ( defined $maxoffset && ( length($maxoffset) > 0 ) ) { | 
| 161 | 2313 |  |  |  |  | 3943 | $maxtmp = $maxoffset; | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  |  | 
| 164 | 3450 |  |  |  |  | 5280 | my $maxloops; | 
| 165 | 3450 | 100 |  |  |  | 7156 | if ( $maxtmp >= $mintmp ) { | 
| 166 | 3408 |  |  |  |  | 7067 | $maxloops = int( $maxtmp / 32766 ); | 
| 167 |  |  |  |  |  |  | } | 
| 168 |  |  |  |  |  |  | else { | 
| 169 | 42 |  |  |  |  | 86 | $maxloops = int( $mintmp / 32766 ); | 
| 170 |  |  |  |  |  |  | } | 
| 171 | 3450 |  |  |  |  | 6136 | my $maxresidual = $maxtmp % 32766; | 
| 172 | 3450 |  |  |  |  | 5368 | my $minresidual = $mintmp % 32766; | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | #say "\tMaxloops=$maxloops maxres = $maxresidual minres=$minresidual"; | 
| 175 | 3450 |  |  |  |  | 5367 | my @offsets; | 
| 176 | 3450 |  |  |  |  | 5671 | my $minstr = ""; | 
| 177 | 3450 |  |  |  |  | 5462 | my $maxstr = ""; | 
| 178 | 3450 | 50 | 33 |  |  | 11217 | if ( defined $minoffset && length($minoffset) > 0 ) { | 
| 179 | 3450 |  |  |  |  | 5413 | $minstr = $minresidual; | 
| 180 | 3450 |  |  |  |  | 6312 | $mintmp = $mintmp - $minresidual; | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  |  | 
| 183 | 3450 |  |  |  |  | 7605 | for ( my $i = 0 ; $i <= $maxloops ; $i++ ) { | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | # loop, so we assure the special handling of residuals | 
| 186 | 4437 | 100 |  |  |  | 10146 | if ( $maxtmp > $maxresidual ) { | 
|  |  | 50 |  |  |  |  |  | 
| 187 | 1134 |  |  |  |  | 1714 | $maxstr = 32766; | 
| 188 |  |  |  |  |  |  | } | 
| 189 |  |  |  |  |  |  | elsif ( $maxtmp < 0 ) { | 
| 190 | 0 |  |  |  |  | 0 | $maxstr = 0; | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  | else { | 
| 193 | 3303 |  |  |  |  | 5493 | $maxstr = $maxresidual; | 
| 194 |  |  |  |  |  |  | } | 
| 195 | 4437 | 50 |  |  |  | 9285 | if ( $mintmp > $minresidual ) { | 
|  |  | 100 |  |  |  |  |  | 
| 196 | 0 |  |  |  |  | 0 | $minstr = 32766; | 
| 197 |  |  |  |  |  |  | } | 
| 198 |  |  |  |  |  |  | elsif ( $mintmp < 0 ) { | 
| 199 | 279 |  |  |  |  | 435 | $minstr = 0; | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  | else { | 
| 202 | 4158 |  |  |  |  | 6143 | $minstr = $minresidual; | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  | #### handle residuals | 
| 205 | 4437 | 100 |  |  |  | 8380 | if ( $i == 0 ) { | 
|  |  | 100 |  |  |  |  |  | 
| 206 | 3450 |  |  |  |  | 5135 | $minstr = $minresidual; | 
| 207 | 3450 |  |  |  |  | 4968 | $mintmp = $mintmp - $minresidual; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  | elsif ( $i == $maxloops ) { | 
| 210 | 147 |  |  |  |  | 229 | $maxstr = $maxresidual; | 
| 211 | 147 |  |  |  |  | 240 | $maxtmp = $maxtmp - $maxresidual; | 
| 212 |  |  |  |  |  |  | } | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | # mark offsets | 
| 215 | 4437 |  |  |  |  | 6264 | my $tmp; | 
| 216 | 4437 |  |  |  |  | 10528 | $tmp->{minoffset} = $minstr; | 
| 217 | 4437 |  |  |  |  | 7443 | $tmp->{maxoffset} = $maxstr; | 
| 218 | 4437 |  |  |  |  | 11308 | push @offsets, $tmp; | 
| 219 |  |  |  |  |  |  | } ## end for ( my $i = 0 ; $i <=...) | 
| 220 |  |  |  |  |  |  | my @filtered = map { | 
| 221 | 3450 | 100 | 66 |  |  | 7103 | if ( !defined $maxoffset || length($maxoffset) == 0 ) { | 
|  | 4437 |  |  |  |  | 13721 |  | 
| 222 | 1137 |  |  |  |  | 2805 | $_->{maxoffset} = ""; | 
| 223 |  |  |  |  |  |  | } | 
| 224 | 4437 | 50 | 33 |  |  | 13954 | if ( !defined $minoffset || length($minoffset) == 0 ) { | 
| 225 | 0 |  |  |  |  | 0 | $_->{minoffset} = ""; | 
| 226 |  |  |  |  |  |  | } | 
| 227 | 4437 |  |  |  |  | 11689 | $_; | 
| 228 |  |  |  |  |  |  | } @offsets; | 
| 229 | 3450 |  |  |  |  | 7563 | foreach my $tmp (@filtered) { | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | # ? at the end - means non-greedy | 
| 232 |  |  |  |  |  |  | #$offset_expanded .= $byte."{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}?"; | 
| 233 |  |  |  |  |  |  | $offset_expanded .= | 
| 234 | 4437 |  |  |  |  | 18767 | $byte . "{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}"; | 
| 235 |  |  |  |  |  |  | } ## end foreach my $tmp (@filtered) | 
| 236 |  |  |  |  |  |  | } ## end else [ if ( ( ( not defined $minoffset...)))] | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | #say "DEBUG: minoffset='$minoffset' maxoffset='$maxoffset' --> offset_expanded='$offset_expanded'"; | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | # minimization steps | 
| 241 | 18804 |  |  |  |  | 35640 | $offset_expanded =~ s#{0,}#*#g; | 
| 242 | 18804 |  |  |  |  | 28364 | $offset_expanded =~ s#{1,}#+#g; | 
| 243 | 18804 |  |  |  |  | 29668 | $offset_expanded =~ s#{0,1}#?#g; | 
| 244 | 18804 |  |  |  |  | 48127 | return $offset_expanded; | 
| 245 |  |  |  |  |  |  | } ## end sub _expand_offsets($$) | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | # got XPath-object and returns a regex-structure as hashref | 
| 248 |  |  |  |  |  |  | sub _parse_fragments ($) { | 
| 249 | 12993 |  |  | 12993 |  | 21614 | my $fq        = shift; | 
| 250 | 12993 |  |  |  |  | 30591 | my $position  = $fq->getAttribute('Position'); | 
| 251 | 12993 |  |  |  |  | 128254 | my $minoffset = $fq->getAttribute('MinOffset'); | 
| 252 | 12993 |  |  |  |  | 109410 | my $maxoffset = $fq->getAttribute('MaxOffset'); | 
| 253 | 12993 |  |  |  |  | 125100 | my $rx        = $fq->textContent; | 
| 254 | 12993 |  |  |  |  | 26622 | my $expanded  = _expand_pattern($rx); | 
| 255 | 12993 |  |  |  |  | 21883 | my $ret; | 
| 256 | 12993 |  |  |  |  | 31298 | $ret->{position}  = $position; | 
| 257 | 12993 |  |  |  |  | 22405 | $ret->{direction} = "left"; | 
| 258 | 12993 |  |  |  |  | 21340 | $ret->{regex}     = ""; | 
| 259 |  |  |  |  |  |  |  | 
| 260 | 12993 |  |  |  |  | 26306 | my ($offset_expanded) = _expand_offsets( $minoffset, $maxoffset ); | 
| 261 |  |  |  |  |  |  |  | 
| 262 | 12993 | 100 |  |  |  | 58118 | if ( $fq->localname eq "LeftFragment" ) { | 
|  |  | 50 |  |  |  |  |  | 
| 263 | 4788 |  |  |  |  | 8815 | $ret->{direction} = "left"; | 
| 264 | 4788 |  |  |  |  | 12631 | $ret->{regex}     = "($expanded)$offset_expanded"; | 
| 265 |  |  |  |  |  |  | } | 
| 266 |  |  |  |  |  |  | elsif ( $fq->localname eq "RightFragment" ) { | 
| 267 | 8205 |  |  |  |  | 15295 | $ret->{direction} = "right"; | 
| 268 | 8205 |  |  |  |  | 20333 | $ret->{regex}     = "$offset_expanded($expanded)"; | 
| 269 |  |  |  |  |  |  | } | 
| 270 |  |  |  |  |  |  |  | 
| 271 |  |  |  |  |  |  | #say "pF: rx=$rx expanded=$expanded offset=$offset_expanded"; | 
| 272 | 12993 |  |  |  |  | 34926 | return $ret; | 
| 273 |  |  |  |  |  |  | } ## end sub _parse_fragments ($) | 
| 274 |  |  |  |  |  |  |  | 
| 275 |  |  |  |  |  |  | # got XPath-object and search direction and returns a regex-structure as hashref | 
| 276 |  |  |  |  |  |  | sub _parse_subsequence ($$) { | 
| 277 | 5811 |  |  | 5811 |  | 10537 | my $ssq       = shift; | 
| 278 | 5811 |  |  |  |  | 9359 | my $dir       = shift; | 
| 279 | 5811 |  |  |  |  | 13828 | my $position  = $ssq->getAttribute('Position'); | 
| 280 | 5811 |  |  |  |  | 58908 | my $minoffset = $ssq->getAttribute('SubSeqMinOffset'); | 
| 281 | 5811 |  |  |  |  | 50509 | my $maxoffset = $ssq->getAttribute('SubSeqMaxOffset'); | 
| 282 |  |  |  |  |  |  |  | 
| 283 | 5811 |  |  |  |  | 50232 | my $rx = $ssq->getElementsByTagName('Sequence')->get_node(1)->textContent; | 
| 284 |  |  |  |  |  |  |  | 
| 285 | 5811 |  |  |  |  | 301011 | my @lnodes        = $ssq->getElementsByTagName('LeftFragment'); | 
| 286 | 5811 |  |  |  |  | 268232 | my @rnodes        = $ssq->getElementsByTagName('RightFragment'); | 
| 287 | 5811 |  |  |  |  | 191289 | my @lrx_fragments = map { _parse_fragments($_) } @lnodes; | 
|  | 4788 |  |  |  |  | 9798 |  | 
| 288 | 5811 |  |  |  |  | 10940 | my @rrx_fragments = map { _parse_fragments($_) } @rnodes; | 
|  | 8205 |  |  |  |  | 16475 |  | 
| 289 | 5811 |  |  |  |  | 13726 | my $lregex        = _flatten_rx( "", @lrx_fragments ); | 
| 290 | 5811 |  |  |  |  | 11848 | my $rregex        = _flatten_rx( "", @rrx_fragments ); | 
| 291 | 5811 |  |  |  |  | 11892 | my $expanded      = _expand_pattern($rx); | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | #if (   length($minoffset) > 0 | 
| 294 |  |  |  |  |  |  | #    && length($maxoffset) > 0 | 
| 295 |  |  |  |  |  |  | #    && $minoffset > $maxoffset ) { | 
| 296 |  |  |  |  |  |  | #    confess( | 
| 297 |  |  |  |  |  |  | #"parse_subsequence: Maxoffset=$maxoffset < Minoffset=$minoffset! regex= '$rx'" | 
| 298 |  |  |  |  |  |  | #            ); | 
| 299 |  |  |  |  |  |  | #    } ## end if ( length($minoffset...)) | 
| 300 |  |  |  |  |  |  |  | 
| 301 | 5811 |  |  |  |  | 15261 | my $offset_expanded = _expand_offsets( $minoffset, $maxoffset ); | 
| 302 | 5811 |  |  |  |  | 20022 | my $prefix; | 
| 303 |  |  |  |  |  |  | my $suffix; | 
| 304 | 5811 |  |  |  |  | 0 | my $ret; | 
| 305 | 5811 |  |  |  |  | 0 | my $regex; | 
| 306 | 5811 | 100 | 66 |  |  | 24598 | if ( !defined $dir || length($dir) == 0 ) { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 307 | 420 |  |  |  |  | 1314 | $regex = join( "", $lregex, $expanded, $rregex ); | 
| 308 |  |  |  |  |  |  | } | 
| 309 |  |  |  |  |  |  | elsif ( $dir eq "BOFoffset" ) { | 
| 310 | 4782 |  |  |  |  | 15389 | $regex = | 
| 311 |  |  |  |  |  |  | join( "", $offset_expanded, "(", $lregex, $expanded, $rregex, ")" ); | 
| 312 |  |  |  |  |  |  | } | 
| 313 |  |  |  |  |  |  | elsif ( $dir eq "EOFoffset" ) { | 
| 314 | 609 |  |  |  |  | 1894 | $regex = | 
| 315 |  |  |  |  |  |  | join( "", "(", $lregex, $expanded, $rregex, ")", $offset_expanded ); | 
| 316 |  |  |  |  |  |  | } | 
| 317 |  |  |  |  |  |  | else { | 
| 318 | 0 |  |  |  |  | 0 | warn "unknown reference '$dir' found\n"; | 
| 319 | 0 |  |  |  |  | 0 | $regex = join( "", $lregex, $expanded, $rregex ); | 
| 320 |  |  |  |  |  |  | } | 
| 321 |  |  |  |  |  |  | $ret->{regex} = | 
| 322 | 5811 |  |  |  |  | 17869 | File::FormatIdentification::Regex::peep_hole_optimizer($regex); | 
| 323 | 5811 |  |  |  |  | 13466 | $ret->{position} = $position; | 
| 324 |  |  |  |  |  |  |  | 
| 325 | 5811 |  |  |  |  | 31606 | return $ret; | 
| 326 |  |  |  |  |  |  | } ## end sub _parse_subsequence ($$) | 
| 327 |  |  |  |  |  |  |  | 
| 328 |  |  |  |  |  |  | # got XPath-object and returns regex-string | 
| 329 |  |  |  |  |  |  | sub _parse_bytesequence ($) { | 
| 330 | 4929 |  |  | 4929 |  | 8406 | my $bsq = shift; | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | #say "rx_groups in parse_byte_sequence:"; | 
| 333 | 4929 |  |  |  |  | 12512 | my $reference = $bsq->getAttribute('Reference'); | 
| 334 |  |  |  |  |  |  | ;    # if BOFoffset -> anchored begin of file, EOFofset -> end of file | 
| 335 | 4929 |  |  |  |  | 53090 | my @nodes           = $bsq->getElementsByTagName('SubSequence'); | 
| 336 | 4929 |  |  |  |  | 191885 | my @rx_groups       = map { _parse_subsequence( $_, $reference ) } @nodes; | 
|  | 5811 |  |  |  |  | 27861 |  | 
| 337 | 4929 |  |  |  |  | 105800 | my $expanded        = ""; | 
| 338 | 4929 |  |  |  |  | 11051 | my $regex_flattened = _flatten_rx( $expanded, @rx_groups ); | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  | #my $ro = Regexp::Optimizer->new; | 
| 341 |  |  |  |  |  |  | #my $ro = Regexp::Assemble->new; | 
| 342 |  |  |  |  |  |  | #$ro->add( $regex_flattened); | 
| 343 |  |  |  |  |  |  | #$regex_flattened = $ro->as_string($regex_flattened); | 
| 344 |  |  |  |  |  |  | #$regex_flattened = $ro->re; | 
| 345 | 4929 |  |  |  |  | 8527 | my $regex; | 
| 346 | 4929 | 100 | 66 |  |  | 21786 | if ( !defined $reference || 0 == length($reference) ) { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 347 | 255 |  |  |  |  | 489 | $regex = "$regex_flattened"; | 
| 348 |  |  |  |  |  |  | } | 
| 349 |  |  |  |  |  |  | elsif ( $reference eq "BOFoffset" ) { | 
| 350 | 4065 |  |  |  |  | 9638 | $regex = "\\A$regex_flattened"; | 
| 351 |  |  |  |  |  |  | } | 
| 352 |  |  |  |  |  |  | elsif ( $reference eq "EOFoffset" ) { | 
| 353 | 609 |  |  |  |  | 1656 | $regex = "$regex_flattened\\Z"; | 
| 354 |  |  |  |  |  |  | } | 
| 355 |  |  |  |  |  |  | else { | 
| 356 | 0 |  |  |  |  | 0 | warn "unknown reference '$reference' found\n"; | 
| 357 | 0 |  |  |  |  | 0 | $regex = "$regex_flattened"; | 
| 358 |  |  |  |  |  |  | } | 
| 359 |  |  |  |  |  |  |  | 
| 360 | 1 |  |  | 1 |  | 11 | use Regexp::Optimizer; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 3047 |  | 
| 361 | 4929 |  |  |  |  | 17102 | my $ro = Regexp::Optimizer->new; | 
| 362 |  |  |  |  |  |  |  | 
| 363 |  |  |  |  |  |  | #say "regex='$regex'"; | 
| 364 |  |  |  |  |  |  | #$regex = $ro->as_string( $regex ); | 
| 365 | 4929 |  |  |  |  | 41730 | return $regex; | 
| 366 |  |  |  |  |  |  | } ## end sub _parse_bytesequence ($) | 
| 367 |  |  |  |  |  |  |  | 
| 368 |  |  |  |  |  |  | # ($%signatures, $%internal) = parse_signaturefile( $file ) | 
| 369 |  |  |  |  |  |  | sub _parse_signaturefile($) { | 
| 370 | 3 |  |  | 3 |  | 9 | my $pronomfile = shift; | 
| 371 | 3 |  |  |  |  | 9 | my %signatures; | 
| 372 |  |  |  |  |  |  |  | 
| 373 |  |  |  |  |  |  | # hash{internalid}->{regex} = $regex | 
| 374 |  |  |  |  |  |  | #                 ->{signature} = $signature | 
| 375 |  |  |  |  |  |  | my %internal_signatures; | 
| 376 |  |  |  |  |  |  |  | 
| 377 | 3 |  |  |  |  | 29 | my $dom = XML::LibXML->load_xml( location => $pronomfile ); | 
| 378 | 3 |  |  |  |  | 216571 | $dom->indexElements(); | 
| 379 | 3 |  |  |  |  | 221 | my $xp = XML::LibXML::XPathContext->new($dom); | 
| 380 | 3 |  |  |  |  | 47 | $xp->registerNs( 'droid', | 
| 381 |  |  |  |  |  |  | 'http://www.nationalarchives.gov.uk/pronom/SignatureFile' ); | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  | # find Fileformats | 
| 384 |  |  |  |  |  |  | #my $tmp = $xp->find('/*[local-name() = "FFSignatureFile"]')->get_node(1); | 
| 385 |  |  |  |  |  |  | #say "E:", $tmp->nodeName; | 
| 386 |  |  |  |  |  |  | #say "EXISTS:", $xp->exists('/droid:FFSignatureFile'); | 
| 387 |  |  |  |  |  |  | #say "EXISTS2", $xp->exists('/droid:FFSignatureFile/droid:FileFormatCollection/droid:FileFormat'); | 
| 388 |  |  |  |  |  |  |  | 
| 389 | 3 |  |  |  |  | 28 | my $fmts = $xp->find( | 
| 390 |  |  |  |  |  |  | '/*[local-name() = "FFSignatureFile"]/*[local-name() = "FileFormatCollection"]/*[local-name() = "FileFormat"]' | 
| 391 |  |  |  |  |  |  | ); | 
| 392 | 3 |  |  |  |  | 8137 | foreach my $fmt ( $fmts->get_nodelist() ) { | 
| 393 | 4752 |  |  |  |  | 12758 | my $id       = $fmt->getAttribute('ID'); | 
| 394 | 4752 |  |  |  |  | 47914 | my $mimetype = $fmt->getAttribute('MIMEtype'); | 
| 395 | 4752 |  |  |  |  | 40908 | my $name     = $fmt->getAttribute('Name'); | 
| 396 | 4752 |  |  |  |  | 39550 | my $puid     = $fmt->getAttribute('PUID'); | 
| 397 | 4752 |  |  |  |  | 37565 | my $version  = $fmt->getAttribute('Version'); | 
| 398 |  |  |  |  |  |  | # | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | ## | 
| 401 |  |  |  |  |  |  | my @extensions = | 
| 402 | 4752 |  |  |  |  | 39321 | map { $_->textContent() } $fmt->getElementsByTagName('Extension'); | 
|  | 5679 |  |  |  |  | 151537 |  | 
| 403 |  |  |  |  |  |  | my @internalsignatures = | 
| 404 | 4752 |  |  |  |  | 20874 | map { $_->textContent() } | 
|  | 4029 |  |  |  |  | 142304 |  | 
| 405 |  |  |  |  |  |  | $fmt->getElementsByTagName('InternalSignatureID'); | 
| 406 | 4752 |  |  |  |  | 60392 | my @haspriorityover = map { $_->textContent() } | 
|  | 2229 |  |  |  |  | 43264 |  | 
| 407 |  |  |  |  |  |  | $fmt->getElementsByTagName('HasPriorityOverFileFormatID'); | 
| 408 | 4752 |  |  |  |  | 134438 | $signatures{$id}->{mimetype}   = $mimetype; | 
| 409 | 4752 |  |  |  |  | 29317 | $signatures{$id}->{name}       = $name; | 
| 410 | 4752 |  |  |  |  | 9197 | $signatures{$id}->{puid}       = $puid; | 
| 411 | 4752 |  |  |  |  | 9606 | $signatures{$id}->{version}    = $version;       # optional | 
| 412 | 4752 |  |  |  |  | 9960 | $signatures{$id}->{extensions} = \@extensions; | 
| 413 | 4752 |  |  |  |  | 9418 | $signatures{$id}->{internal_signatures} = \@internalsignatures; | 
| 414 |  |  |  |  |  |  |  | 
| 415 | 4752 |  |  |  |  | 10306 | foreach my $prio (@haspriorityover) { | 
| 416 | 2229 |  |  |  |  | 5440 | $signatures{$id}->{priorityover}->{$prio} = 1; | 
| 417 |  |  |  |  |  |  | } | 
| 418 |  |  |  |  |  |  |  | 
| 419 | 4752 |  |  |  |  | 8962 | foreach my $internal (@internalsignatures) { | 
| 420 | 4029 |  |  |  |  | 16065 | $internal_signatures{$internal}->{signature} = $id; | 
| 421 |  |  |  |  |  |  | } | 
| 422 |  |  |  |  |  |  | } ## end foreach my $fmt ( $fmts->get_nodelist...) | 
| 423 |  |  |  |  |  |  |  | 
| 424 |  |  |  |  |  |  | # find InternalSignatures | 
| 425 | 3 |  |  |  |  | 458 | my $sigs = | 
| 426 |  |  |  |  |  |  | $xp->find( | 
| 427 |  |  |  |  |  |  | '/*[local-name() = "FFSignatureFile"]/*[local-name() = "InternalSignatureCollection"]/*[local-name() = "InternalSignature"]' | 
| 428 |  |  |  |  |  |  | ); | 
| 429 |  |  |  |  |  |  |  | 
| 430 | 3 |  |  |  |  | 8092 | foreach my $sig ( $sigs->get_nodelist() ) { | 
| 431 |  |  |  |  |  |  |  | 
| 432 | 4083 |  |  |  |  | 60370 | my $id          = $sig->getAttribute('ID'); | 
| 433 | 4083 |  |  |  |  | 49465 | my $specificity = $sig->getAttribute('Specificity'); | 
| 434 | 4083 |  |  |  |  | 48250 | $internal_signatures{$id}->{specificity} = $specificity; | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  | #p( $sig->toString() ); | 
| 437 | 4083 |  |  |  |  | 12026 | my @nodes = $sig->getElementsByTagName('ByteSequence'); | 
| 438 |  |  |  |  |  |  |  | 
| 439 |  |  |  |  |  |  | #p( @nodes ); | 
| 440 | 4083 |  |  |  |  | 234663 | my @rx_groups = map { _parse_bytesequence($_) } @nodes; | 
|  | 4929 |  |  |  |  | 22556 |  | 
| 441 |  |  |  |  |  |  | my @rx_quality = | 
| 442 | 4083 |  |  |  |  | 62337 | map { File::FormatIdentification::Regex::calc_quality($_); } | 
|  | 4929 |  |  |  |  | 12636 |  | 
| 443 |  |  |  |  |  |  | @rx_groups; | 
| 444 |  |  |  |  |  |  |  | 
| 445 | 4083 |  |  |  |  | 14972 | $internal_signatures{$id}->{regex}   = \@rx_groups; | 
| 446 | 4083 |  |  |  |  | 16224 | $internal_signatures{$id}->{quality} = \@rx_quality; | 
| 447 |  |  |  |  |  |  | } ## end foreach my $sig ( $sigs->get_nodelist...) | 
| 448 |  |  |  |  |  |  |  | 
| 449 | 3 |  |  |  |  | 511 | return ( \%signatures, \%internal_signatures ); | 
| 450 |  |  |  |  |  |  | } ## end sub _parse_signaturefile($) | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | sub uniq_signature_ids_by_priority { | 
| 453 | 0 |  |  | 0 | 0 | 0 | my $self       = shift; | 
| 454 | 0 |  |  |  |  | 0 | my @signatures = @_; | 
| 455 | 0 |  |  |  |  | 0 | my %found_signature_ids; | 
| 456 |  |  |  |  |  |  |  | 
| 457 |  |  |  |  |  |  | # which PUIDs are in list? | 
| 458 | 0 |  |  |  |  | 0 | foreach my $signatureid (@signatures) { | 
| 459 | 0 | 0 |  |  |  | 0 | if ( defined $signatureid ) { | 
| 460 | 0 |  |  |  |  | 0 | $found_signature_ids{$signatureid} = 1; | 
| 461 |  |  |  |  |  |  | } | 
| 462 |  |  |  |  |  |  | } | 
| 463 |  |  |  |  |  |  |  | 
| 464 |  |  |  |  |  |  | # remove all signatures when actual signature has priority over | 
| 465 | 0 |  |  |  |  | 0 | foreach my $signatureid ( keys %found_signature_ids ) { | 
| 466 | 0 |  |  |  |  | 0 | foreach my $priority_over_sid ( | 
| 467 | 0 |  |  |  |  | 0 | keys %{ $self->{signatures}->{$signatureid}->{priorityover} } ) | 
| 468 |  |  |  |  |  |  | { | 
| 469 | 0 | 0 |  |  |  | 0 | if ( exists $found_signature_ids{$priority_over_sid} ) { | 
| 470 | 0 |  |  |  |  | 0 | delete $found_signature_ids{$priority_over_sid}; | 
| 471 |  |  |  |  |  |  | } | 
| 472 |  |  |  |  |  |  | } ## end foreach my $priority_over_sid... | 
| 473 |  |  |  |  |  |  | } ## end foreach my $signatureid ( keys...) | 
| 474 |  |  |  |  |  |  |  | 
| 475 |  |  |  |  |  |  | # reduce list to all signatures with correct priority | 
| 476 |  |  |  |  |  |  | my @result = | 
| 477 | 0 |  |  |  |  | 0 | grep { defined $found_signature_ids{ $_->{signature} } } @signatures; | 
|  | 0 |  |  |  |  | 0 |  | 
| 478 | 0 |  |  |  |  | 0 | return @result; | 
| 479 |  |  |  |  |  |  | } ## end sub uniq_signature_ids_by_priority | 
| 480 |  |  |  |  |  |  |  | 
| 481 |  |  |  |  |  |  | has 'droid_signature_filename' => ( | 
| 482 |  |  |  |  |  |  | is       => 'ro', | 
| 483 |  |  |  |  |  |  | required => 1, | 
| 484 |  |  |  |  |  |  | reader   => 'get_droid_signature_filename', | 
| 485 |  |  |  |  |  |  | trigger  => sub { | 
| 486 |  |  |  |  |  |  | my $self = shift; | 
| 487 |  |  |  |  |  |  |  | 
| 488 |  |  |  |  |  |  | #say "TRIGGER"; | 
| 489 |  |  |  |  |  |  | my $yaml_file = $self->get_droid_signature_filename() . ".yaml"; | 
| 490 |  |  |  |  |  |  | if ( $self->{auto_load} && -e $yaml_file ) { | 
| 491 |  |  |  |  |  |  | $self->load_from_yamlfile($yaml_file); | 
| 492 |  |  |  |  |  |  | } | 
| 493 |  |  |  |  |  |  | else { | 
| 494 |  |  |  |  |  |  | my ( $signatures, $internal_signatures ) = | 
| 495 |  |  |  |  |  |  | _parse_signaturefile( $self->{droid_signature_filename} ); | 
| 496 |  |  |  |  |  |  | $self->{signatures}          = $signatures; | 
| 497 |  |  |  |  |  |  | $self->{internal_signatures} = $internal_signatures; | 
| 498 |  |  |  |  |  |  |  | 
| 499 |  |  |  |  |  |  | #die; | 
| 500 |  |  |  |  |  |  | if ( $self->{auto_store} ) { | 
| 501 |  |  |  |  |  |  | $self->save_as_yamlfile($yaml_file); | 
| 502 |  |  |  |  |  |  | } | 
| 503 |  |  |  |  |  |  | } ## end else [ if ( $self->{auto_load...})] | 
| 504 |  |  |  |  |  |  | foreach my $s ( keys %{ $self->{signatures} } ) { | 
| 505 |  |  |  |  |  |  | my $puid = $self->{signatures}->{$s}->{puid}; | 
| 506 |  |  |  |  |  |  | if ( defined $puid && length($puid) > 0 ) { | 
| 507 |  |  |  |  |  |  | $self->{puids}->{$puid} = $s; | 
| 508 |  |  |  |  |  |  | } | 
| 509 |  |  |  |  |  |  | } | 
| 510 |  |  |  |  |  |  | } | 
| 511 |  |  |  |  |  |  | ); | 
| 512 |  |  |  |  |  |  |  | 
| 513 |  |  |  |  |  |  | sub save_as_yamlfile { | 
| 514 | 2 |  |  | 2 | 0 | 10 | my $self     = shift; | 
| 515 | 2 |  |  |  |  | 9 | my $filename = shift; | 
| 516 | 2 |  |  |  |  | 5 | my @res; | 
| 517 | 2 |  |  |  |  | 10 | push @res, $self->{signatures}; | 
| 518 | 2 |  |  |  |  | 7 | push @res, $self->{internal_signatures}; | 
| 519 | 2 |  |  |  |  | 25 | YAML::XS::DumpFile( "$filename", @res ); | 
| 520 | 2 |  |  |  |  | 194135 | return; | 
| 521 |  |  |  |  |  |  | } ## end sub save_as_yamlfile | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  | sub load_from_yamlfile { | 
| 524 | 1 |  |  | 1 | 0 | 3 | my $self     = shift; | 
| 525 | 1 |  |  |  |  | 4 | my $filename = shift; | 
| 526 | 1 |  |  |  |  | 9 | my ( $sig, $int ) = YAML::XS::LoadFile($filename); | 
| 527 | 1 |  |  |  |  | 55188 | $self->{signatures}          = $sig; | 
| 528 | 1 |  |  |  |  | 6 | $self->{internal_signatures} = $int; | 
| 529 | 1 |  |  |  |  | 5 | return; | 
| 530 |  |  |  |  |  |  | } ## end sub load_from_yamlfile | 
| 531 |  |  |  |  |  |  |  | 
| 532 |  |  |  |  |  |  | has 'auto_store' => ( | 
| 533 |  |  |  |  |  |  | is      => 'ro', | 
| 534 |  |  |  |  |  |  | default => 1, | 
| 535 |  |  |  |  |  |  | ); | 
| 536 |  |  |  |  |  |  |  | 
| 537 |  |  |  |  |  |  | has 'auto_load' => ( | 
| 538 |  |  |  |  |  |  | is      => 'ro', | 
| 539 |  |  |  |  |  |  | default => 1, | 
| 540 |  |  |  |  |  |  | ); | 
| 541 |  |  |  |  |  |  |  | 
| 542 |  |  |  |  |  |  | sub get_all_signature_ids { | 
| 543 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 544 | 0 |  |  |  |  |  | my @sigs = sort { $a <=> $b } keys %{ $self->{signatures} }; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 545 | 0 |  |  |  |  |  | return @sigs; | 
| 546 |  |  |  |  |  |  | } | 
| 547 |  |  |  |  |  |  |  | 
| 548 |  |  |  |  |  |  | sub get_signature_id_by_puid { | 
| 549 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 550 | 0 |  |  |  |  |  | my $puid = shift; | 
| 551 | 0 |  |  |  |  |  | my $sig  = $self->{puids}->{$puid}; | 
| 552 | 0 |  |  |  |  |  | return $sig; | 
| 553 |  |  |  |  |  |  | } | 
| 554 |  |  |  |  |  |  |  | 
| 555 |  |  |  |  |  |  | sub get_internal_ids_by_puid { | 
| 556 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 557 | 0 |  |  |  |  |  | my $puid = shift; | 
| 558 | 0 |  |  |  |  |  | my $sig  = $self->get_signature_id_by_puid($puid); | 
| 559 | 0 |  |  |  |  |  | my @ids  = (); | 
| 560 | 0 | 0 |  |  |  |  | if ( defined $sig ) { | 
| 561 | 0 |  |  |  |  |  | @ids = grep { defined $_ } | 
| 562 | 0 |  |  |  |  |  | @{ $self->{signatures}->{$sig}->{internal_signatures} }; | 
|  | 0 |  |  |  |  |  |  | 
| 563 |  |  |  |  |  |  | } | 
| 564 | 0 |  |  |  |  |  | return @ids; | 
| 565 |  |  |  |  |  |  | } | 
| 566 |  |  |  |  |  |  |  | 
| 567 |  |  |  |  |  |  | sub get_file_endings_by_puid { | 
| 568 | 0 |  |  | 0 | 0 |  | my $self    = shift; | 
| 569 | 0 |  |  |  |  |  | my $puid    = shift; | 
| 570 | 0 |  |  |  |  |  | my $sig     = $self->get_signature_id_by_puid($puid); | 
| 571 | 0 |  |  |  |  |  | my @endings = (); | 
| 572 | 0 | 0 |  |  |  |  | if ( defined $sig ) { | 
| 573 | 0 |  |  |  |  |  | @endings = $self->{signatures}->{$sig}->{extensions}; | 
| 574 |  |  |  |  |  |  | } | 
| 575 | 0 |  |  |  |  |  | return @endings; | 
| 576 |  |  |  |  |  |  | } | 
| 577 |  |  |  |  |  |  |  | 
| 578 |  |  |  |  |  |  | sub get_all_internal_ids { | 
| 579 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 580 | 0 |  |  |  |  |  | my @ids = sort { $a <=> $b } keys %{ $self->{internal_signatures} }; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 581 | 0 |  |  |  |  |  | foreach my $id (@ids) { | 
| 582 | 0 | 0 |  |  |  |  | if ( !defined $id ) { confess("$id not defined") } | 
|  | 0 |  |  |  |  |  |  | 
| 583 |  |  |  |  |  |  | } | 
| 584 | 0 |  |  |  |  |  | return @ids; | 
| 585 |  |  |  |  |  |  | } | 
| 586 |  |  |  |  |  |  |  | 
| 587 |  |  |  |  |  |  | sub get_all_puids { | 
| 588 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 589 |  |  |  |  |  |  | my @ids = | 
| 590 | 0 |  |  |  |  |  | sort grep { defined $_ } | 
| 591 | 0 |  |  |  |  |  | map       { $self->{signatures}->{$_}->{puid}; } | 
| 592 | 0 |  |  |  |  |  | grep      { defined $_ } $self->get_all_signature_ids(); | 
|  | 0 |  |  |  |  |  |  | 
| 593 | 0 |  |  |  |  |  | return @ids; | 
| 594 |  |  |  |  |  |  | } | 
| 595 |  |  |  |  |  |  |  | 
| 596 |  |  |  |  |  |  | sub get_regular_expressions_by_internal_id { | 
| 597 | 0 |  |  | 0 | 0 |  | my $self       = shift; | 
| 598 | 0 |  |  |  |  |  | my $internalid = shift; | 
| 599 | 0 | 0 |  |  |  |  | if ( !defined $internalid ) { confess("internalid must exists!"); } | 
|  | 0 |  |  |  |  |  |  | 
| 600 | 0 |  |  |  |  |  | return @{ $self->{internal_signatures}->{$internalid}->{regex} }; | 
|  | 0 |  |  |  |  |  |  | 
| 601 |  |  |  |  |  |  | } | 
| 602 |  |  |  |  |  |  |  | 
| 603 |  |  |  |  |  |  | sub get_all_regular_expressions { | 
| 604 | 0 |  |  | 0 | 0 |  | my $self    = shift; | 
| 605 | 0 |  |  |  |  |  | my @ids     = $self->get_all_internal_ids(); | 
| 606 | 0 |  |  |  |  |  | my @regexes = (); | 
| 607 | 0 |  |  |  |  |  | foreach my $id (@ids) { | 
| 608 | 0 |  |  |  |  |  | my @rx = $self->get_regular_expressions_by_internal_id($id); | 
| 609 | 0 |  |  |  |  |  | push @regexes, @rx; | 
| 610 |  |  |  |  |  |  | } | 
| 611 | 0 |  |  |  |  |  | my @ret = sort @regexes; | 
| 612 | 0 |  |  |  |  |  | return @ret; | 
| 613 |  |  |  |  |  |  | } | 
| 614 |  |  |  |  |  |  |  | 
| 615 |  |  |  |  |  |  | sub get_qualities_by_internal_id { | 
| 616 | 0 |  |  | 0 | 0 |  | my $self       = shift; | 
| 617 | 0 |  |  |  |  |  | my $internalid = shift; | 
| 618 | 0 | 0 |  |  |  |  | if ( !defined $internalid ) { confess("internalid must exists!"); } | 
|  | 0 |  |  |  |  |  |  | 
| 619 | 0 |  |  |  |  |  | my $value = $self->{internal_signatures}->{$internalid}->{quality}; | 
| 620 | 0 | 0 |  |  |  |  | if ( defined $value ) { | 
| 621 | 0 |  |  |  |  |  | return @{$value}; | 
|  | 0 |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | } | 
| 623 | 0 |  |  |  |  |  | return; | 
| 624 |  |  |  |  |  |  | } | 
| 625 |  |  |  |  |  |  |  | 
| 626 |  |  |  |  |  |  | sub get_signature_id_by_internal_id { | 
| 627 | 0 |  |  | 0 | 0 |  | my $self       = shift; | 
| 628 | 0 |  |  |  |  |  | my $internalid = shift; | 
| 629 | 0 | 0 |  |  |  |  | if ( !defined $internalid ) { confess("internalid must exists!"); } | 
|  | 0 |  |  |  |  |  |  | 
| 630 | 0 |  |  |  |  |  | return $self->{internal_signatures}->{$internalid}->{signature}; | 
| 631 |  |  |  |  |  |  | } | 
| 632 |  |  |  |  |  |  |  | 
| 633 |  |  |  |  |  |  | sub get_name_by_signature_id { | 
| 634 | 0 |  |  | 0 | 0 |  | my $self      = shift; | 
| 635 | 0 |  |  |  |  |  | my $signature = shift; | 
| 636 | 0 |  |  |  |  |  | return $self->{signatures}->{$signature}->{name}; | 
| 637 |  |  |  |  |  |  | } | 
| 638 |  |  |  |  |  |  |  | 
| 639 |  |  |  |  |  |  | sub get_puid_by_signature_id { | 
| 640 | 0 |  |  | 0 | 0 |  | my $self      = shift; | 
| 641 | 0 |  |  |  |  |  | my $signature = shift; | 
| 642 | 0 |  |  |  |  |  | return $self->{signatures}->{$signature}->{puid}; | 
| 643 |  |  |  |  |  |  | } | 
| 644 |  |  |  |  |  |  |  | 
| 645 |  |  |  |  |  |  | sub get_puid_by_internal_id { | 
| 646 | 0 |  |  | 0 | 0 |  | my $self       = shift; | 
| 647 | 0 |  |  |  |  |  | my $internalid = shift; | 
| 648 | 0 | 0 |  |  |  |  | if ( !defined $internalid ) { confess("internalid must exists!"); } | 
|  | 0 |  |  |  |  |  |  | 
| 649 | 0 |  |  |  |  |  | my $signature = $self->get_signature_id_by_internal_id($internalid); | 
| 650 | 0 |  |  |  |  |  | return $self->get_puid_by_signature_id($signature); | 
| 651 |  |  |  |  |  |  | } | 
| 652 |  |  |  |  |  |  |  | 
| 653 |  |  |  |  |  |  | sub get_quality_sorted_internal_ids { | 
| 654 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 655 |  |  |  |  |  |  | my @ids  = sort { | 
| 656 |  |  |  |  |  |  |  | 
| 657 |  |  |  |  |  |  | # sort by regexes | 
| 658 | 0 |  |  |  |  |  | my @a_rxq = @{ $self->{internal_signatures}->{$a}->{quality} }; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 659 | 0 |  |  |  |  |  | my @b_rxq = @{ $self->{internal_signatures}->{$b}->{quality} }; | 
|  | 0 |  |  |  |  |  |  | 
| 660 | 0 |  |  |  |  |  | my $aq    = 0; | 
| 661 | 0 |  |  |  |  |  | foreach my $as (@a_rxq) { $aq += $as; } | 
|  | 0 |  |  |  |  |  |  | 
| 662 | 0 |  |  |  |  |  | my $bq = 0; | 
| 663 | 0 |  |  |  |  |  | foreach my $bs (@b_rxq) { $bq += $bs; } | 
|  | 0 |  |  |  |  |  |  | 
| 664 |  |  |  |  |  |  |  | 
| 665 |  |  |  |  |  |  | #use Data::Printer; | 
| 666 |  |  |  |  |  |  | #p( $a ); | 
| 667 |  |  |  |  |  |  | #p( $aq ); | 
| 668 | 0 |  |  |  |  |  | $aq <=> $bq; | 
| 669 |  |  |  |  |  |  | } $self->get_all_internal_ids(); | 
| 670 | 0 |  |  |  |  |  | return @ids; | 
| 671 |  |  |  |  |  |  | } | 
| 672 |  |  |  |  |  |  |  | 
| 673 |  |  |  |  |  |  | sub get_combined_regex_by_puid { | 
| 674 | 0 |  |  | 0 | 0 |  | my $self      = shift; | 
| 675 | 0 |  |  |  |  |  | my $puid      = shift; | 
| 676 | 0 |  |  |  |  |  | my @internals = $self->get_internal_ids_by_puid($puid); | 
| 677 |  |  |  |  |  |  |  | 
| 678 |  |  |  |  |  |  | #use Data::Printer; | 
| 679 |  |  |  |  |  |  | #p( $puid ); | 
| 680 |  |  |  |  |  |  | #p( @internals ); | 
| 681 |  |  |  |  |  |  | my @regexes = map { | 
| 682 | 0 |  |  |  |  |  | my @regexes_per_internal = | 
|  | 0 |  |  |  |  |  |  | 
| 683 |  |  |  |  |  |  | $self->get_regular_expressions_by_internal_id($_); | 
| 684 | 0 |  |  |  |  |  | my $combined = | 
| 685 |  |  |  |  |  |  | File::FormatIdentification::Regex::and_combine(@regexes_per_internal); | 
| 686 |  |  |  |  |  |  |  | 
| 687 |  |  |  |  |  |  | #p( $combined ); | 
| 688 | 0 |  |  |  |  |  | $combined; | 
| 689 |  |  |  |  |  |  | } @internals; | 
| 690 | 0 |  |  |  |  |  | my $result = File::FormatIdentification::Regex::or_combine(@regexes); | 
| 691 |  |  |  |  |  |  |  | 
| 692 |  |  |  |  |  |  | #p( $result ); | 
| 693 | 0 |  |  |  |  |  | return $result; | 
| 694 |  |  |  |  |  |  | } | 
| 695 |  |  |  |  |  |  |  | 
| 696 |  |  |  |  |  |  | sub _prepare_statistics { | 
| 697 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 698 | 0 |  |  |  |  |  | my $results; | 
| 699 |  |  |  |  |  |  |  | 
| 700 |  |  |  |  |  |  | # count of PUIDs | 
| 701 |  |  |  |  |  |  | # count of internal ids (IDs per PUID) | 
| 702 |  |  |  |  |  |  | # count of regexes | 
| 703 |  |  |  |  |  |  | # count of file endings only | 
| 704 |  |  |  |  |  |  | # count of internal ids without PUID | 
| 705 |  |  |  |  |  |  | # larges and shortest regex | 
| 706 |  |  |  |  |  |  | # complex and simple regex | 
| 707 |  |  |  |  |  |  | # common regexes | 
| 708 |  |  |  |  |  |  | #say "stat"; | 
| 709 | 0 |  |  |  |  |  | my @puids                       = $self->get_all_puids(); | 
| 710 | 0 |  |  |  |  |  | my $puids                       = scalar(@puids); | 
| 711 | 0 |  |  |  |  |  | my @internals                   = $self->get_all_internal_ids(); | 
| 712 | 0 |  |  |  |  |  | my $internals                   = scalar(@internals); | 
| 713 | 0 |  |  |  |  |  | my $regexes                     = 0; | 
| 714 | 0 |  |  |  |  |  | my $fileendingsonly             = 0; | 
| 715 | 0 |  |  |  |  |  | my @fileendingsonly             = (); | 
| 716 | 0 |  |  |  |  |  | my $fileendings                 = 0; | 
| 717 | 0 |  |  |  |  |  | my $int_per_puid                = 0; | 
| 718 | 0 |  |  |  |  |  | my $internal_without_puid       = 0; | 
| 719 | 0 |  |  |  |  |  | my @internal_without_puid       = (); | 
| 720 | 0 |  |  |  |  |  | my @quality_sorted_internal_ids = $self->get_quality_sorted_internal_ids(); | 
| 721 | 0 |  |  |  |  |  | my %uniq_regexes; | 
| 722 |  |  |  |  |  |  |  | 
| 723 | 0 |  |  |  |  |  | foreach my $internalid (@internals) { | 
| 724 | 0 |  |  |  |  |  | my @regexes = | 
| 725 |  |  |  |  |  |  | $self->get_regular_expressions_by_internal_id($internalid); | 
| 726 | 0 |  |  |  |  |  | foreach my $rx (@regexes) { | 
| 727 | 0 |  |  |  |  |  | my @tmp = (); | 
| 728 | 0 | 0 |  |  |  |  | if ( exists $uniq_regexes{$rx} ) { | 
| 729 | 0 |  |  |  |  |  | @tmp = @{ $uniq_regexes{$rx} }; | 
|  | 0 |  |  |  |  |  |  | 
| 730 |  |  |  |  |  |  | } | 
| 731 | 0 |  |  |  |  |  | push @tmp, $internalid; | 
| 732 | 0 |  |  |  |  |  | $uniq_regexes{$rx} = \@tmp; | 
| 733 |  |  |  |  |  |  | } | 
| 734 |  |  |  |  |  |  |  | 
| 735 | 0 |  |  |  |  |  | $regexes += scalar(@regexes); | 
| 736 | 0 |  |  |  |  |  | my $sigid = $self->get_signature_id_by_internal_id($internalid); | 
| 737 | 0 | 0 |  |  |  |  | if ( !defined $sigid ) { | 
| 738 | 0 |  |  |  |  |  | $internal_without_puid++; | 
| 739 | 0 |  |  |  |  |  | push @internal_without_puid, $internalid; | 
| 740 |  |  |  |  |  |  | } | 
| 741 |  |  |  |  |  |  | } | 
| 742 | 0 |  |  |  |  |  | foreach my $puid (@puids) { | 
| 743 | 0 |  |  |  |  |  | my @ints        = $self->get_internal_ids_by_puid($puid); | 
| 744 | 0 |  |  |  |  |  | my @fileendings = $self->get_file_endings_by_puid($puid); | 
| 745 | 0 | 0 |  |  |  |  | if ( 0 == scalar(@ints) ) { | 
| 746 | 0 |  |  |  |  |  | $fileendingsonly++; | 
| 747 | 0 |  |  |  |  |  | push @fileendingsonly, $puid; | 
| 748 |  |  |  |  |  |  | } | 
| 749 |  |  |  |  |  |  | else { | 
| 750 | 0 |  |  |  |  |  | $fileendings  += scalar(@fileendings); | 
| 751 | 0 |  |  |  |  |  | $int_per_puid += scalar(@ints); | 
| 752 |  |  |  |  |  |  | } | 
| 753 |  |  |  |  |  |  | } | 
| 754 | 0 |  |  |  |  |  | foreach my $i (@quality_sorted_internal_ids) { | 
| 755 | 0 |  |  |  |  |  | my $regex = | 
| 756 |  |  |  |  |  |  | join( "#", $self->get_regular_expressions_by_internal_id($i) ); | 
| 757 | 0 |  |  |  |  |  | my $quality = join( " ", $self->get_qualities_by_internal_id($i) ); | 
| 758 |  |  |  |  |  |  |  | 
| 759 |  |  |  |  |  |  | } | 
| 760 |  |  |  |  |  |  |  | 
| 761 | 0 |  |  |  |  |  | $results->{filename}              = $self->get_droid_signature_filename(); | 
| 762 | 0 |  |  |  |  |  | $results->{count_of_puids}        = $puids; | 
| 763 | 0 |  |  |  |  |  | $results->{count_of_internal_ids} = $internals; | 
| 764 | 0 |  |  |  |  |  | $results->{count_of_regular_expressions}        = $regexes; | 
| 765 | 0 |  |  |  |  |  | $results->{count_of_fileendings}                = $fileendings; | 
| 766 | 0 |  |  |  |  |  | $results->{count_of_puid_with_fileendings_only} = $fileendingsonly; | 
| 767 | 0 |  |  |  |  |  | $results->{puids_with_fileendings_only}         = \@fileendingsonly; | 
| 768 | 0 |  |  |  |  |  | $results->{count_of_orphaned_internal_ids}      = $internal_without_puid; | 
| 769 | 0 |  |  |  |  |  | $results->{internal_ids_without_puids}          = \@internal_without_puid; | 
| 770 | 1 |  |  | 1 |  | 13 | no warnings; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 1497 |  | 
| 771 |  |  |  |  |  |  |  | 
| 772 | 0 |  |  |  |  |  | for ( my $i = 0 ; $i <= 4 ; $i++ ) { | 
| 773 | 0 |  |  |  |  |  | my $best_quality_internal = pop @quality_sorted_internal_ids; | 
| 774 | 0 | 0 |  |  |  |  | if ( defined $best_quality_internal ) { | 
| 775 | 0 |  |  |  |  |  | my $best_quality = join( ";", | 
| 776 |  |  |  |  |  |  | $self->get_qualities_by_internal_id($best_quality_internal) ); | 
| 777 | 0 |  |  |  |  |  | my $best_puid = | 
| 778 |  |  |  |  |  |  | $self->get_puid_by_internal_id($best_quality_internal); | 
| 779 | 0 |  |  |  |  |  | my $best_name = | 
| 780 |  |  |  |  |  |  | $self->get_name_by_signature_id( | 
| 781 |  |  |  |  |  |  | $self->get_signature_id_by_internal_id($best_quality_internal) | 
| 782 |  |  |  |  |  |  | ); | 
| 783 | 0 |  |  |  |  |  | my $best_regex = $self->get_combined_regex_by_puid($best_puid); | 
| 784 |  |  |  |  |  |  | $results->{nth_best_quality}->[$i]->{internal_id} = | 
| 785 | 0 |  |  |  |  |  | $best_quality_internal; | 
| 786 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[$i]->{puid}    = $best_puid; | 
| 787 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[$i]->{name}    = $best_name; | 
| 788 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[$i]->{quality} = $best_quality; | 
| 789 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[$i]->{combined_regex} = $best_regex; | 
| 790 |  |  |  |  |  |  | } | 
| 791 |  |  |  |  |  |  | } | 
| 792 | 0 |  |  |  |  |  | for ( my $i = 0 ; $i <= 4 ; $i++ ) { | 
| 793 | 0 |  |  |  |  |  | my $worst_quality_internal = shift @quality_sorted_internal_ids; | 
| 794 | 0 | 0 |  |  |  |  | if ( defined $worst_quality_internal ) { | 
| 795 | 0 |  |  |  |  |  | my $worst_quality = join( ";", | 
| 796 |  |  |  |  |  |  | $self->get_qualities_by_internal_id($worst_quality_internal) ); | 
| 797 | 0 |  |  |  |  |  | my $worst_puid = | 
| 798 |  |  |  |  |  |  | $self->get_puid_by_internal_id($worst_quality_internal); | 
| 799 | 0 |  |  |  |  |  | my $worst_name = | 
| 800 |  |  |  |  |  |  | $self->get_name_by_signature_id( | 
| 801 |  |  |  |  |  |  | $self->get_signature_id_by_internal_id($worst_quality_internal) | 
| 802 |  |  |  |  |  |  | ); | 
| 803 | 0 |  |  |  |  |  | my $worst_regex = $self->get_combined_regex_by_puid($worst_puid); | 
| 804 |  |  |  |  |  |  | $results->{nth_worst_quality}->[$i]->{internal_id} = | 
| 805 | 0 |  |  |  |  |  | $worst_quality_internal; | 
| 806 | 0 |  |  |  |  |  | $results->{nth_worst_quality}->[$i]->{puid}    = $worst_puid; | 
| 807 | 0 |  |  |  |  |  | $results->{nth_worst_quality}->[$i]->{name}    = $worst_name; | 
| 808 | 0 |  |  |  |  |  | $results->{nth_worst_quality}->[$i]->{quality} = $worst_quality; | 
| 809 |  |  |  |  |  |  | $results->{nth_worst_quality}->[$i]->{combined_regex} = | 
| 810 | 0 |  |  |  |  |  | $worst_regex; | 
| 811 |  |  |  |  |  |  | } | 
| 812 |  |  |  |  |  |  | } | 
| 813 |  |  |  |  |  |  | my @multiple_used_regex = grep { | 
| 814 | 0 |  |  |  |  |  | my $tmp = $uniq_regexes{$_}; | 
|  | 0 |  |  |  |  |  |  | 
| 815 | 0 |  |  |  |  |  | my @tmp = @{$tmp}; | 
|  | 0 |  |  |  |  |  |  | 
| 816 | 0 |  |  |  |  |  | scalar(@tmp) > 1 | 
| 817 |  |  |  |  |  |  | } sort keys %uniq_regexes; | 
| 818 | 0 |  |  |  |  |  | $results->{count_of_multiple_used_regex} = scalar(@multiple_used_regex); | 
| 819 | 0 |  |  |  |  |  | for ( my $i = 0 ; $i <= $#multiple_used_regex ; $i++ ) { | 
| 820 |  |  |  |  |  |  | $results->{multiple_used_regex}->[$i]->{regex} = | 
| 821 | 0 |  |  |  |  |  | $multiple_used_regex[$i]; | 
| 822 | 0 |  |  |  |  |  | my @ids = join( ",", @{ $uniq_regexes{ $multiple_used_regex[$i] } } ); | 
|  | 0 |  |  |  |  |  |  | 
| 823 | 0 |  |  |  |  |  | $results->{multiple_used_regex}->[$i]->{internal_ids} = \@ids; | 
| 824 |  |  |  |  |  |  | } | 
| 825 | 0 |  |  |  |  |  | return $results; | 
| 826 |  |  |  |  |  |  | } | 
| 827 |  |  |  |  |  |  |  | 
| 828 |  |  |  |  |  |  | sub print_csv_statistics { | 
| 829 | 0 |  |  | 0 | 0 |  | my $self    = shift; | 
| 830 | 0 |  |  |  |  |  | my $csv_file = shift; | 
| 831 | 0 |  |  |  |  |  | my $results = $self->_prepare_statistics(); | 
| 832 | 0 |  |  |  |  |  | my $version = $results->{filename}; | 
| 833 | 0 |  |  |  |  |  | $version =~ s/DROID_SignatureFile_V(\d+)\.xml/$1/; | 
| 834 | 0 |  |  |  |  |  | $results->{version}           = $version; | 
| 835 | 0 |  |  |  |  |  | $results->{best_quality_puid} = $results->{nth_best_quality}->[0]->{puid}; | 
| 836 |  |  |  |  |  |  | $results->{best_quality_internal_id} = | 
| 837 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[0]->{internal_id}; | 
| 838 |  |  |  |  |  |  | $results->{best_quality_quality} = | 
| 839 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[0]->{quality}; | 
| 840 |  |  |  |  |  |  | $results->{best_quality_combined_regex} = | 
| 841 | 0 |  |  |  |  |  | $results->{nth_best_quality}->[0]->{combined_regex}; | 
| 842 | 0 |  |  |  |  |  | $results->{worst_quality_puid} = $results->{nth_worst_quality}->[0]->{puid}; | 
| 843 |  |  |  |  |  |  | $results->{worst_quality_internal_id} = | 
| 844 | 0 |  |  |  |  |  | $results->{nth_worst_quality}->[0]->{internal_id}; | 
| 845 |  |  |  |  |  |  | $results->{worst_quality_quality} = | 
| 846 | 0 |  |  |  |  |  | $results->{nth_worst_quality}->[0]->{quality}; | 
| 847 |  |  |  |  |  |  | $results->{worst_quality_combined_regex} = | 
| 848 | 0 |  |  |  |  |  | $results->{nth_worst_quality}->[0]->{combined_regex}; | 
| 849 |  |  |  |  |  |  |  | 
| 850 | 0 |  |  |  |  |  | my @headers = | 
| 851 |  |  |  |  |  |  | qw(version filename count_of_puids count_of_internal_ids count_of_regular_expressions count_of_fileendings count_of_puid_with_fileendings_only count_of_orphaned_internal_ids count_of_multiple_used_regex best_quality_puid best_quality_internal_id best_quality_quality best_quality_combined_regex worst_quality_puid worst_quality_internal_id worst_quality_quality worst_quality_combined_regex); | 
| 852 | 0 |  |  |  |  |  | my $file_exists = (-e $csv_file); | 
| 853 | 0 | 0 |  |  |  |  | open (my $FH, ">>", "$csv_file") or croak "Can't open file '$csv_file', $0"; | 
| 854 | 0 | 0 |  |  |  |  | if (not $file_exists) { | 
| 855 | 0 |  |  |  |  |  | say $FH "#", join( ",", @headers ); | 
| 856 |  |  |  |  |  |  | } | 
| 857 |  |  |  |  |  |  | say $FH join( | 
| 858 |  |  |  |  |  |  | ",", | 
| 859 |  |  |  |  |  |  | map { | 
| 860 | 0 |  |  |  |  |  | my $result = $results->{$_}; | 
|  | 0 |  |  |  |  |  |  | 
| 861 | 0 | 0 |  |  |  |  | if ( !defined $result ) { $result = ""; } | 
|  | 0 |  |  |  |  |  |  | 
| 862 | 0 |  |  |  |  |  | "\"$result\""; | 
| 863 |  |  |  |  |  |  | } @headers | 
| 864 |  |  |  |  |  |  | ); | 
| 865 | 0 |  |  |  |  |  | close ($FH); | 
| 866 | 0 |  |  |  |  |  | return; | 
| 867 |  |  |  |  |  |  | } | 
| 868 |  |  |  |  |  |  |  | 
| 869 |  |  |  |  |  |  | sub print_statistics { | 
| 870 | 0 |  |  | 0 | 0 |  | my $self    = shift; | 
| 871 | 0 |  |  |  |  |  | my $verbose = shift; | 
| 872 | 0 |  |  |  |  |  | my $results = $self->_prepare_statistics(); | 
| 873 |  |  |  |  |  |  |  | 
| 874 | 0 |  |  |  |  |  | say "Statistics of file $results->{filename}"; | 
| 875 | 0 |  |  |  |  |  | say "======================================="; | 
| 876 | 0 |  |  |  |  |  | say ""; | 
| 877 | 0 |  |  |  |  |  | say "Countings"; | 
| 878 | 0 |  |  |  |  |  | say "---------------------------------------"; | 
| 879 | 0 |  |  |  |  |  | say "Count of PUIDs:                        $results->{count_of_puids}"; | 
| 880 | 0 |  |  |  |  |  | say | 
| 881 |  |  |  |  |  |  | "         internal IDs:                 $results->{count_of_internal_ids}"; | 
| 882 | 0 |  |  |  |  |  | say | 
| 883 |  |  |  |  |  |  | "         regular expressions:          $results->{count_of_regular_expressions}"; | 
| 884 | 0 |  |  |  |  |  | say | 
| 885 |  |  |  |  |  |  | "         file endings:                 $results->{count_of_fileendings}"; | 
| 886 | 0 |  |  |  |  |  | say | 
| 887 |  |  |  |  |  |  | "         PUIDs with file endings only: $results->{count_of_puid_with_fileendings_only}"; | 
| 888 |  |  |  |  |  |  |  | 
| 889 | 0 | 0 |  |  |  |  | if ( defined $verbose ) { | 
| 890 |  |  |  |  |  |  | say "         (", | 
| 891 | 0 |  |  |  |  |  | join( ", ", sort @{ $results->{puids_with_fileendings_only} } ), ")"; | 
|  | 0 |  |  |  |  |  |  | 
| 892 |  |  |  |  |  |  | } | 
| 893 |  |  |  |  |  |  | say | 
| 894 | 0 |  |  |  |  |  | "         orphaned internal IDs:        $results->{count_of_orphaned_internal_ids}"; | 
| 895 | 0 | 0 |  |  |  |  | if ( defined $verbose ) { | 
| 896 |  |  |  |  |  |  | say "         (", | 
| 897 | 0 |  |  |  |  |  | join( ", ", sort {$a <=> $b} @{ $results->{internal_ids_without_puids} } ), ")"; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 898 |  |  |  |  |  |  | } | 
| 899 | 0 |  |  |  |  |  | say ""; | 
| 900 | 0 |  |  |  |  |  | say "Quality of internal IDs"; | 
| 901 | 0 |  |  |  |  |  | say "---------------------------------------"; | 
| 902 |  |  |  |  |  |  |  | 
| 903 | 0 |  |  |  |  |  | my $nth = 1; | 
| 904 | 0 |  |  |  |  |  | foreach my $n ( @{ $results->{nth_best_quality} } ) { | 
|  | 0 |  |  |  |  |  |  | 
| 905 | 0 |  |  |  |  |  | say | 
| 906 |  |  |  |  |  |  | "$nth-best quality internal ID (PUID, name):       $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}"; | 
| 907 | 0 | 0 |  |  |  |  | if ( defined $verbose ) { | 
| 908 | 0 |  |  |  |  |  | say "        combined regex: ", $n->{combined_regex}; | 
| 909 |  |  |  |  |  |  | } | 
| 910 | 0 |  |  |  |  |  | $nth++; | 
| 911 |  |  |  |  |  |  | } | 
| 912 | 0 |  |  |  |  |  | say ""; | 
| 913 | 0 |  |  |  |  |  | $nth = 1; | 
| 914 | 0 |  |  |  |  |  | foreach my $n ( @{ $results->{nth_worst_quality} } ) { | 
|  | 0 |  |  |  |  |  |  | 
| 915 | 0 |  |  |  |  |  | say | 
| 916 |  |  |  |  |  |  | "$nth-worst quality internal ID (PUID, name):       $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}"; | 
| 917 | 0 | 0 |  |  |  |  | if ( defined $verbose ) { | 
| 918 | 0 |  |  |  |  |  | say "        combined regex: ", $n->{combined_regex}; | 
| 919 |  |  |  |  |  |  | } | 
| 920 | 0 |  |  |  |  |  | $nth++; | 
| 921 |  |  |  |  |  |  | } | 
| 922 | 0 |  |  |  |  |  | say ""; | 
| 923 |  |  |  |  |  |  |  | 
| 924 | 0 |  |  |  |  |  | say ""; | 
| 925 | 0 |  |  |  |  |  | say "Regular expressions"; | 
| 926 | 0 |  |  |  |  |  | say "---------------------------------------"; | 
| 927 | 0 |  |  |  |  |  | say | 
| 928 |  |  |  |  |  |  | "Count of multiple used regular expressions: $results->{count_of_multiple_used_regex}"; | 
| 929 | 0 | 0 |  |  |  |  | if ( defined $verbose ) { | 
| 930 | 0 |  |  |  |  |  | for ( my $i = 0 ; $i < $results->{count_of_multiple_used_regex} ; $i++ ) | 
| 931 |  |  |  |  |  |  | { | 
| 932 | 0 |  |  |  |  |  | say "         common regex group no $i:"; | 
| 933 |  |  |  |  |  |  | say "            regex='" | 
| 934 | 0 |  |  |  |  |  | . $results->{multiple_used_regex}->[$i]->{regex} . "'"; | 
| 935 |  |  |  |  |  |  | say "            internal IDs: ", | 
| 936 | 0 |  |  |  |  |  | join( ",", @{ $results->{multiple_used_regex}->[$i]->{internal_ids} } ); | 
|  | 0 |  |  |  |  |  |  | 
| 937 |  |  |  |  |  |  | } | 
| 938 |  |  |  |  |  |  | } | 
| 939 | 0 |  |  |  |  |  | say ""; | 
| 940 |  |  |  |  |  |  |  | 
| 941 |  |  |  |  |  |  | #my @rx = $self->get_all_regular_expressions(); | 
| 942 |  |  |  |  |  |  | #use Data::Printer; | 
| 943 |  |  |  |  |  |  | #p( %uniq_regexes ); | 
| 944 | 0 |  |  |  |  |  | return; | 
| 945 |  |  |  |  |  |  | } | 
| 946 |  |  |  |  |  |  |  | 
| 947 |  |  |  |  |  |  | 1; | 
| 948 |  |  |  |  |  |  |  | 
| 949 | 1 |  |  | 1 |  | 12 | no Moose; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 7 |  | 
| 950 |  |  |  |  |  |  | __PACKAGE__->meta->make_immutable; | 
| 951 |  |  |  |  |  |  | 1; | 
| 952 |  |  |  |  |  |  |  | 
| 953 |  |  |  |  |  |  | __END__ | 
| 954 |  |  |  |  |  |  |  | 
| 955 |  |  |  |  |  |  | =pod | 
| 956 |  |  |  |  |  |  |  | 
| 957 |  |  |  |  |  |  | =encoding UTF-8 | 
| 958 |  |  |  |  |  |  |  | 
| 959 |  |  |  |  |  |  | =head1 NAME | 
| 960 |  |  |  |  |  |  |  | 
| 961 |  |  |  |  |  |  | File::FormatIdentification::Pronom | 
| 962 |  |  |  |  |  |  |  | 
| 963 |  |  |  |  |  |  | =head1 VERSION | 
| 964 |  |  |  |  |  |  |  | 
| 965 |  |  |  |  |  |  | version 0.06 | 
| 966 |  |  |  |  |  |  |  | 
| 967 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 968 |  |  |  |  |  |  |  | 
| 969 |  |  |  |  |  |  | use File::FormatIdentification::Pronom; | 
| 970 |  |  |  |  |  |  | my $pronomfile = "Droid-Signature.xml"; | 
| 971 |  |  |  |  |  |  | my ( $signatures, $internals ) = parse_signaturefile($pronomfile); | 
| 972 |  |  |  |  |  |  |  | 
| 973 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 974 |  |  |  |  |  |  |  | 
| 975 |  |  |  |  |  |  | The module allows to handle Droid signatures. Droid is a utility which | 
| 976 |  |  |  |  |  |  | uses the PRONOM database to identify file formats. | 
| 977 |  |  |  |  |  |  |  | 
| 978 |  |  |  |  |  |  | See https://www.nationalarchives.gov.uk/PRONOM/ for details. | 
| 979 |  |  |  |  |  |  |  | 
| 980 |  |  |  |  |  |  | With this module you could: | 
| 981 |  |  |  |  |  |  |  | 
| 982 |  |  |  |  |  |  | =over | 
| 983 |  |  |  |  |  |  |  | 
| 984 |  |  |  |  |  |  | =item convert Droid signatures to Perl regular expressions | 
| 985 |  |  |  |  |  |  |  | 
| 986 |  |  |  |  |  |  | =item analyze files and display which/where pattern of Droid signature matches  via tag-files for wxHexEditor | 
| 987 |  |  |  |  |  |  |  | 
| 988 |  |  |  |  |  |  | =item calc statistics about Droid signatures | 
| 989 |  |  |  |  |  |  |  | 
| 990 |  |  |  |  |  |  | =back | 
| 991 |  |  |  |  |  |  |  | 
| 992 |  |  |  |  |  |  | The module is in early alpha state and should not be used in production. | 
| 993 |  |  |  |  |  |  |  | 
| 994 |  |  |  |  |  |  | =head2 Examples | 
| 995 |  |  |  |  |  |  |  | 
| 996 |  |  |  |  |  |  | =head3 Colorize wxHexeditor fields | 
| 997 |  |  |  |  |  |  |  | 
| 998 |  |  |  |  |  |  | See example file F<bin/pronom2wxhexeditor.pl>. This colorizes the hex-blob to check PRONOM pattern matches for a given file. | 
| 999 |  |  |  |  |  |  |  | 
| 1000 |  |  |  |  |  |  | =head3 Identify file | 
| 1001 |  |  |  |  |  |  |  | 
| 1002 |  |  |  |  |  |  | There are better tools for the job, but as a proof of concept certainly not bad: Identifying the file type of a file. | 
| 1003 |  |  |  |  |  |  |  | 
| 1004 |  |  |  |  |  |  | my $pronom = File::FormatIdentification::Pronom->new( | 
| 1005 |  |  |  |  |  |  | "droid_signature_filename" => $pronomfile | 
| 1006 |  |  |  |  |  |  | ); | 
| 1007 |  |  |  |  |  |  | # .. $filestream is a scalar representing a file | 
| 1008 |  |  |  |  |  |  | foreach my $internalid ( $pronom->get_all_internal_ids() ) { | 
| 1009 |  |  |  |  |  |  | my $sig = $pronom->get_signature_id_by_internal_id($internalid); | 
| 1010 |  |  |  |  |  |  | next unless defined $sig; | 
| 1011 |  |  |  |  |  |  | my @regexes = $pronom->get_regular_expressions_by_internal_id($internalid); | 
| 1012 |  |  |  |  |  |  | if ( all {$filestream =~ m/$_/saa} @regexes ) { | 
| 1013 |  |  |  |  |  |  | my $puid = $pronom->get_puid_by_signature_id($sig); | 
| 1014 |  |  |  |  |  |  | my $name = $pronom->get_name_by_signature_id($sig); | 
| 1015 |  |  |  |  |  |  | my $quality = $pronom->get_qualities_by_internal_id($internalid); | 
| 1016 |  |  |  |  |  |  | say "$binaryfile identified as $name with PUID $puid (regex quality $quality)"; | 
| 1017 |  |  |  |  |  |  | } | 
| 1018 |  |  |  |  |  |  | } | 
| 1019 |  |  |  |  |  |  |  | 
| 1020 |  |  |  |  |  |  | See example file F<bin/pronomidentify.pl> for a  full working script. | 
| 1021 |  |  |  |  |  |  |  | 
| 1022 |  |  |  |  |  |  | =head3 Get PRONOM Statistics | 
| 1023 |  |  |  |  |  |  |  | 
| 1024 |  |  |  |  |  |  | To get a feeling for which signatures need to be revised in PRONOM, or why which file formats are difficult to recognize, | 
| 1025 |  |  |  |  |  |  | you can get detailed statistics for given signature files. | 
| 1026 |  |  |  |  |  |  |  | 
| 1027 |  |  |  |  |  |  | In the blog entry under L<https://kulturreste.blogspot.com/2018/10/heres-tool-make-it-work.html> the statistic report is presented in more detail. | 
| 1028 |  |  |  |  |  |  |  | 
| 1029 |  |  |  |  |  |  | =head2 EXPORT | 
| 1030 |  |  |  |  |  |  |  | 
| 1031 |  |  |  |  |  |  | None by default. | 
| 1032 |  |  |  |  |  |  |  | 
| 1033 |  |  |  |  |  |  | =head1 NAME | 
| 1034 |  |  |  |  |  |  |  | 
| 1035 |  |  |  |  |  |  | File::FormatIdentification::Pronom - Perl extension for parsing PRONOM-Signatures using DROID-Signature file | 
| 1036 |  |  |  |  |  |  |  | 
| 1037 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 1038 |  |  |  |  |  |  |  | 
| 1039 |  |  |  |  |  |  | L<File::FormatIdentification::Regex> | 
| 1040 |  |  |  |  |  |  |  | 
| 1041 |  |  |  |  |  |  | =head1 AUTHOR | 
| 1042 |  |  |  |  |  |  |  | 
| 1043 |  |  |  |  |  |  | Andreas Romeyke L<pause@andreas-romeyke.de> | 
| 1044 |  |  |  |  |  |  |  | 
| 1045 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 1046 |  |  |  |  |  |  |  | 
| 1047 |  |  |  |  |  |  | Copyright (C) 2018/19/20 by Andreas Romeyke | 
| 1048 |  |  |  |  |  |  |  | 
| 1049 |  |  |  |  |  |  | This library is free software; you can redistribute it and/or modify | 
| 1050 |  |  |  |  |  |  | it under the same terms as Perl itself, either Perl version 5.24.1 or, | 
| 1051 |  |  |  |  |  |  | at your option, any later version of Perl 5 you may have available. | 
| 1052 |  |  |  |  |  |  |  | 
| 1053 |  |  |  |  |  |  | The droid-signature file in t/ is from L<https://www.nationalarchives.gov.uk/PRONOM/Default.aspx> | 
| 1054 |  |  |  |  |  |  | and without guarantee, it does not look like it is legally protected. If there are any legal claims, | 
| 1055 |  |  |  |  |  |  | please let me know that I can remove them from the distribution. | 
| 1056 |  |  |  |  |  |  |  | 
| 1057 |  |  |  |  |  |  | =head1 BUGS | 
| 1058 |  |  |  |  |  |  |  | 
| 1059 |  |  |  |  |  |  | =over | 
| 1060 |  |  |  |  |  |  |  | 
| 1061 |  |  |  |  |  |  | =item Some droid recipes results in PCREs which are greedy and therefore the running | 
| 1062 |  |  |  |  |  |  | time could be exponential with size of binary file. | 
| 1063 |  |  |  |  |  |  |  | 
| 1064 |  |  |  |  |  |  | =back | 
| 1065 |  |  |  |  |  |  |  | 
| 1066 |  |  |  |  |  |  | =head1 CONTRIBUTING | 
| 1067 |  |  |  |  |  |  |  | 
| 1068 |  |  |  |  |  |  | Please feel free to send me comments and patches to my email address. You can clone the modules | 
| 1069 |  |  |  |  |  |  | from L<https://art1pirat.spdns.org/art1/File-FormatIdentification-Pronom> and send me merge requests. | 
| 1070 |  |  |  |  |  |  |  | 
| 1071 |  |  |  |  |  |  | =head1 AUTHOR | 
| 1072 |  |  |  |  |  |  |  | 
| 1073 |  |  |  |  |  |  | Andreas Romeyke <pause@andreas-romeyke.de> | 
| 1074 |  |  |  |  |  |  |  | 
| 1075 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 1076 |  |  |  |  |  |  |  | 
| 1077 |  |  |  |  |  |  | This software is copyright (c) 2018 by Andreas Romeyke. | 
| 1078 |  |  |  |  |  |  |  | 
| 1079 |  |  |  |  |  |  | This is free software; you can redistribute it and/or modify it under | 
| 1080 |  |  |  |  |  |  | the same terms as the Perl 5 programming language system itself. | 
| 1081 |  |  |  |  |  |  |  | 
| 1082 |  |  |  |  |  |  | =cut |