| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Text::DAWG; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 9713 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 38 |  | 
| 4 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | BEGIN { | 
| 7 | 1 |  |  | 1 |  | 26 | our $VERSION="0.001"; | 
| 8 |  |  |  |  |  |  | } | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | use Carp | 
| 11 | 1 |  |  | 1 |  | 14 | qw(croak); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 60 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | use constant { | 
| 14 | 1 |  |  |  |  | 7146 | BADNODE	=> 0, | 
| 15 |  |  |  |  |  |  | STARTNODE	=> 1, | 
| 16 | 1 |  |  | 1 |  | 5 | }; | 
|  | 1 |  |  |  |  | 2 |  | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | =head1 NAME | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | Text::DAWG - directed acyclic word graphs | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | use Text::DAWG; | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | my $dawg=Text::DAWG::->new([qw(one two three)]); | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | print "one\n"  if $dawg->match("one");   # prints something | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | print "four\n" if $dawg->match("four");  # prints nothing | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | Text::DAWG implements implements string set recognition by way of | 
| 35 |  |  |  |  |  |  | directed acyclic word graphs. | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =head1 CONSTRUCTORS | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | =over | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | =item my $dawg=Text::DAWG::->new(\@words); | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | Creates a new DAWG matching the strings in an array. | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | =item my $dawg=Text::DAWG::->load(\*FILEHANDLE); | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | Creates a new DAWG from a compact representation stored in a file, | 
| 48 |  |  |  |  |  |  | or dies if anything goes wrong.  The filehandle must be opened for reading | 
| 49 |  |  |  |  |  |  | and binmoded before the call. | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | =back | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | =head1 METHODS | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | =over | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | =item $dawg->match($string); | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | Returns a true value if the DAWG contains the string. | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | =item $dawg->store(\*FILEHANDLE); | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | Stores a compact representation of the DAWG in a file. | 
| 64 |  |  |  |  |  |  | The filehandle must be opened for writing and binmoded before the call. | 
| 65 |  |  |  |  |  |  |  | 
| 66 |  |  |  |  |  |  | =back | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | =head1 PEDAGOGIC METHODS | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | =over | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | =item $dawg->write_dot(\*FILEHANDLE); | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | =item $dawg->write_dot(\*FILEHANDLE,\%options); | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | Outputs a dot language representation of the DAWG | 
| 77 |  |  |  |  |  |  | (see L). | 
| 78 |  |  |  |  |  |  | The filehandle must be opened for writing before the call. | 
| 79 |  |  |  |  |  |  | If the DAWG contains any non-ASCII characters, you must set an appropriate | 
| 80 |  |  |  |  |  |  | encoding as well. | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | You can pass a reference to a hash of options for tweaking the output. | 
| 83 |  |  |  |  |  |  | The following keys are recognised: | 
| 84 |  |  |  |  |  |  |  | 
| 85 |  |  |  |  |  |  | =over | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | =item "" (the empty string) | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | The value must be a hash reference specifying global attributes | 
| 90 |  |  |  |  |  |  | for the generated digraph. | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | =item "graph" | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | The value must be a hash reference specifying default attributes | 
| 95 |  |  |  |  |  |  | for subgraphs. | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | =item "edge" | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | The value must be a hash reference specifying default attributes | 
| 100 |  |  |  |  |  |  | for edges. | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | =item "node" | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | The value must be a hash reference specifying default attributes | 
| 105 |  |  |  |  |  |  | for nodes.  Defaults to C<{ shape =E 'circle' }>. | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | =item "start" | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | The value must be a hash reference specifying attributes | 
| 110 |  |  |  |  |  |  | for the start node. | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | =item "match" | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | The value must be a hash reference specifying attributes | 
| 115 |  |  |  |  |  |  | for a matching node.  Defaults to C<{ shape =E 'doublecircle' }>. | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | =item "startmatch" | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | The value must be a hash reference specifying attributes | 
| 120 |  |  |  |  |  |  | for a matching start node.  Defaults to the combination of the | 
| 121 |  |  |  |  |  |  | C and C options, with C given priority. | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | =item "chars" | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | The value must be a hash reference with single characters for keys | 
| 126 |  |  |  |  |  |  | and hash references for values.  It specifies attributes for | 
| 127 |  |  |  |  |  |  | edges representing the given characters.  The default has an entry | 
| 128 |  |  |  |  |  |  | for the space character containng C<{ label =E 'SP' }>, | 
| 129 |  |  |  |  |  |  | since an edge label consisting of a single space is hard to notice. | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | =item "id" | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | An id for the digraph itself. | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | =item "readable" | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | If true, certain optimisations that reduce both the size | 
| 138 |  |  |  |  |  |  | and the readability of the output are not performed. | 
| 139 |  |  |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | =back | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | Node ids are positive integers, with the start node always 1. | 
| 143 |  |  |  |  |  |  |  | 
| 144 |  |  |  |  |  |  | Edges have a default label equal to the character it represents. | 
| 145 |  |  |  |  |  |  | You can override this with the C option. | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | =item my $dawg=Text::DAWG::->new(\@words,\*FILEHANDLE); | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | =item my $dawg=Text::DAWG::->new(\@words,\*FILEHANDLE,\%options); | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | You can pass extra arguments to the constructor to output a dot language | 
| 152 |  |  |  |  |  |  | representation of the trie that is the un-optimised version of the DAWG. | 
| 153 |  |  |  |  |  |  | Groups of trie nodes that correspond to the same DAWG node will be clustered. | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | =back | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | =head1 TIME AND SPACE | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | A Text::DAWG is always slower than a built-in Perl hash. | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | A Text::DAWG containing a set of strings with many common prefixes | 
| 162 |  |  |  |  |  |  | and suffixes (e.g. a dictionary of English words) may use less memory | 
| 163 |  |  |  |  |  |  | than a built-in Perl hash.  However, the unoptimised trie and | 
| 164 |  |  |  |  |  |  | the optimisation process itself uses many times as much memory | 
| 165 |  |  |  |  |  |  | as the final result.  Loading a stored DAWG from a file uses very | 
| 166 |  |  |  |  |  |  | little extra memory. | 
| 167 |  |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  | =head1 AUTHOR | 
| 169 |  |  |  |  |  |  |  | 
| 170 |  |  |  |  |  |  | Bo Lindbergh Eblgl@stacken.kth.seE | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | Copyright 2011, Bo Lindbergh | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | This library is free software; you can redistribute it and/or modify it | 
| 177 |  |  |  |  |  |  | under the same terms as Perl itself, either Perl version 5.8.9 or, at | 
| 178 |  |  |  |  |  |  | your option, any later version of Perl 5 you may have available. | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | =cut | 
| 181 |  |  |  |  |  |  |  | 
| 182 |  |  |  |  |  |  | sub new | 
| 183 |  |  |  |  |  |  | { | 
| 184 | 2 |  |  | 2 | 1 | 32 | my($class,$strings,$fh,$options)=@_; | 
| 185 | 2 |  |  |  |  | 5 | my($work,$self); | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 2 |  |  |  |  | 7 | $work={ | 
| 188 |  |  |  |  |  |  | strings	=> $strings, | 
| 189 |  |  |  |  |  |  | }; | 
| 190 | 2 |  |  |  |  | 7 | bless($work,$class); | 
| 191 | 2 |  |  |  |  | 10 | $work->_init_charmap(); | 
| 192 | 2 |  |  |  |  | 9 | $work->_build_trie(); | 
| 193 | 2 |  |  |  |  | 9 | $work->_init_groups(); | 
| 194 | 2 |  |  |  |  | 9 | $work->_split_groups(); | 
| 195 | 2 |  |  |  |  | 11 | $work->_sort_groups(); | 
| 196 | 2 | 50 |  |  |  | 8 | if ($fh) { | 
| 197 | 0 |  |  |  |  | 0 | $work->_sort_nodes(); | 
| 198 | 0 |  |  |  |  | 0 | $work->write_dot($fh,$options); | 
| 199 |  |  |  |  |  |  | } | 
| 200 | 2 |  |  |  |  | 6 | $self={}; | 
| 201 | 2 |  |  |  |  | 9 | bless($self,$class); | 
| 202 | 2 |  |  |  |  | 9 | $self->_final_charmap($work); | 
| 203 | 2 |  |  |  |  | 9 | $self->_build_dawg($work); | 
| 204 | 2 |  |  |  |  | 59 | $self; | 
| 205 |  |  |  |  |  |  | } | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | sub _init_charmap | 
| 208 |  |  |  |  |  |  | { | 
| 209 | 2 |  |  | 2 |  | 4 | my($self)=@_; | 
| 210 | 2 |  |  |  |  | 4 | my($strings,$charix,%histo,@chars); | 
| 211 |  |  |  |  |  |  |  | 
| 212 | 2 |  |  |  |  | 5 | foreach my $string (@{$self->{strings}}) { | 
|  | 2 |  |  |  |  | 14 |  | 
| 213 | 13 |  |  |  |  | 45 | foreach my $char (split(//,$string)) { | 
| 214 | 75 |  |  |  |  | 138 | $histo{$char}++; | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  | } | 
| 217 | 80 |  |  |  |  | 120 | @chars=sort { | 
| 218 | 2 |  |  |  |  | 17 | $histo{$b}<=>$histo{$a}; | 
| 219 |  |  |  |  |  |  | } keys(%histo); | 
| 220 | 2 |  |  |  |  | 8 | $charix=1; | 
| 221 | 2 |  |  |  |  | 7 | for my $charmap ($self->{charmap}) { | 
| 222 | 2 |  |  |  |  | 36 | $charmap=""; | 
| 223 | 2 |  |  |  |  | 5 | foreach my $char (@chars) { | 
| 224 | 30 |  |  |  |  | 66 | vec($charmap,ord($char),32)=$charix; | 
| 225 | 30 |  |  |  |  | 97 | $charix++; | 
| 226 |  |  |  |  |  |  | } | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  |  | 
| 230 |  |  |  |  |  |  | sub _build_trie | 
| 231 |  |  |  |  |  |  | { | 
| 232 | 2 |  |  | 2 |  | 5 | my($self)=@_; | 
| 233 |  |  |  |  |  |  |  | 
| 234 | 2 |  |  |  |  | 7 | for my $match ($self->{match}) { | 
| 235 | 2 |  |  |  |  | 4 | for my $charmap ($self->{charmap}) { | 
| 236 | 2 |  |  |  |  | 3 | my(@nodes,@depths); | 
| 237 |  |  |  |  |  |  |  | 
| 238 | 2 |  |  |  |  | 4 | $match=""; | 
| 239 | 2 |  |  |  |  | 5 | $nodes[BADNODE]=""; | 
| 240 | 2 |  |  |  |  | 3 | $nodes[STARTNODE]=""; | 
| 241 | 2 |  |  |  |  | 5 | $depths[BADNODE]=-1; | 
| 242 | 2 |  |  |  |  | 3 | $depths[STARTNODE]=0; | 
| 243 | 2 |  |  |  |  | 3 | foreach my $string (@{$self->{strings}}) { | 
|  | 2 |  |  |  |  | 7 |  | 
| 244 | 13 |  |  |  |  | 15 | my($nodeix,$depth); | 
| 245 |  |  |  |  |  |  |  | 
| 246 | 13 |  |  |  |  | 22 | $nodeix=STARTNODE; | 
| 247 | 13 |  |  |  |  | 18 | $depth=0; | 
| 248 | 13 |  |  |  |  | 85 | foreach my $charix (map(vec($charmap,ord($_),32)-1, | 
| 249 |  |  |  |  |  |  | split(//,$string))) { | 
| 250 | 75 |  |  |  |  | 78 | my($nextix); | 
| 251 |  |  |  |  |  |  |  | 
| 252 | 75 |  |  |  |  | 76 | $depth++; | 
| 253 | 75 |  |  |  |  | 104 | $nextix=vec($nodes[$nodeix],$charix,32); | 
| 254 | 75 | 100 |  |  |  | 138 | if (!$nextix) { | 
| 255 | 74 |  |  |  |  | 104 | $nextix=@nodes; | 
| 256 | 74 |  |  |  |  | 125 | $nodes[$nextix]=""; | 
| 257 | 74 |  |  |  |  | 107 | $depths[$nextix]=$depth; | 
| 258 | 74 |  |  |  |  | 193 | vec($nodes[$nodeix],$charix,32)=$nextix; | 
| 259 |  |  |  |  |  |  | } | 
| 260 | 75 |  |  |  |  | 145 | $nodeix=$nextix; | 
| 261 |  |  |  |  |  |  | } | 
| 262 | 13 |  |  |  |  | 79 | vec($match,$nodeix,1)=1; | 
| 263 |  |  |  |  |  |  | } | 
| 264 | 2 |  |  |  |  | 8 | $self->{nodes}=\@nodes; | 
| 265 | 2 |  |  |  |  | 11 | $self->{depths}=\@depths; | 
| 266 |  |  |  |  |  |  | } | 
| 267 |  |  |  |  |  |  | } | 
| 268 |  |  |  |  |  |  | } | 
| 269 |  |  |  |  |  |  |  | 
| 270 |  |  |  |  |  |  | sub _init_groups | 
| 271 |  |  |  |  |  |  | { | 
| 272 | 2 |  |  | 2 |  | 4 | my($self)=@_; | 
| 273 |  |  |  |  |  |  |  | 
| 274 | 2 |  |  |  |  | 6 | for my $match ($self->{match}) { | 
| 275 | 2 |  |  |  |  | 5 | my($nodes,@match,@nomatch,@groups); | 
| 276 |  |  |  |  |  |  |  | 
| 277 | 2 |  |  |  |  | 4 | $nodes=$self->{nodes}; | 
| 278 | 2 |  |  |  |  | 13 | @groups=( | 
| 279 |  |  |  |  |  |  | [BADNODE], | 
| 280 |  |  |  |  |  |  | ); | 
| 281 | 2 |  |  |  |  | 4 | foreach my $nodeix (STARTNODE .. $#{$nodes}) { | 
|  | 2 |  |  |  |  | 25 |  | 
| 282 | 76 | 100 |  |  |  | 118 | if (vec($match,$nodeix,1)) { | 
| 283 | 13 |  |  |  |  | 25 | push(@match,$nodeix); | 
| 284 |  |  |  |  |  |  | } else { | 
| 285 | 63 |  |  |  |  | 87 | push(@nomatch,$nodeix) | 
| 286 |  |  |  |  |  |  | } | 
| 287 |  |  |  |  |  |  | } | 
| 288 | 2 | 50 |  |  |  | 8 | if (@match) { | 
| 289 | 2 |  |  |  |  | 4 | push(@groups,\@match); | 
| 290 |  |  |  |  |  |  | } | 
| 291 | 2 | 50 |  |  |  | 7 | if (@nomatch) { | 
| 292 | 2 |  |  |  |  | 4 | push(@groups,\@nomatch); | 
| 293 |  |  |  |  |  |  | } | 
| 294 | 2 |  |  |  |  | 12 | $self->{groups}=\@groups; | 
| 295 |  |  |  |  |  |  | } | 
| 296 |  |  |  |  |  |  | } | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | sub _split_groups | 
| 299 |  |  |  |  |  |  | { | 
| 300 | 2 |  |  | 2 |  | 3 | my($self)=@_; | 
| 301 | 2 |  |  |  |  | 4 | my($nodes,$groups,@groupmap); | 
| 302 |  |  |  |  |  |  |  | 
| 303 | 2 |  |  |  |  | 5 | $nodes=$self->{nodes}; | 
| 304 | 2 |  |  |  |  | 3 | $groups=$self->{groups}; | 
| 305 | 2 |  |  |  |  | 4 | for (;;) { | 
| 306 | 7 |  |  |  |  | 8 | my(@groups); | 
| 307 |  |  |  |  |  |  |  | 
| 308 | 7 |  |  |  |  | 9 | foreach my $groupix (0 .. $#{$groups}) { | 
|  | 7 |  |  |  |  | 17 |  | 
| 309 | 130 |  |  |  |  | 164 | foreach my $nodeix (@{$groups->[$groupix]}) { | 
|  | 130 |  |  |  |  | 200 |  | 
| 310 | 273 |  |  |  |  | 433 | $groupmap[$nodeix]=$groupix; | 
| 311 |  |  |  |  |  |  | } | 
| 312 |  |  |  |  |  |  | } | 
| 313 | 7 |  |  |  |  | 12 | foreach my $group (@{$groups}) { | 
|  | 7 |  |  |  |  | 11 |  | 
| 314 | 130 | 100 |  |  |  | 122 | if (@{$group}>1) { | 
|  | 130 |  |  |  |  | 216 |  | 
| 315 | 40 |  |  |  |  | 43 | my(%newgroups); | 
| 316 |  |  |  |  |  |  |  | 
| 317 | 40 |  |  |  |  | 40 | foreach my $nodeix (@{$group}) { | 
|  | 40 |  |  |  |  | 62 |  | 
| 318 | 183 |  |  |  |  | 185 | my($key); | 
| 319 |  |  |  |  |  |  |  | 
| 320 | 183 |  |  |  |  | 449 | $key=pack("N*", | 
| 321 |  |  |  |  |  |  | @groupmap[unpack("N*",$nodes->[$nodeix])]); | 
| 322 | 183 |  |  |  |  | 205 | push(@{$newgroups{$key}},$nodeix); | 
|  | 183 |  |  |  |  | 609 |  | 
| 323 |  |  |  |  |  |  | } | 
| 324 | 40 |  |  |  |  | 149 | push(@groups,values(%newgroups)); | 
| 325 |  |  |  |  |  |  | } else { | 
| 326 | 90 |  |  |  |  | 133 | push(@groups,$group); | 
| 327 |  |  |  |  |  |  | } | 
| 328 |  |  |  |  |  |  | } | 
| 329 | 7 | 100 |  |  |  | 12 | last if @groups==@{$groups}; | 
|  | 7 |  |  |  |  | 23 |  | 
| 330 | 5 |  |  |  |  | 18 | $groups=\@groups; | 
| 331 |  |  |  |  |  |  | } | 
| 332 | 2 |  |  |  |  | 9 | $self->{groups}=$groups; | 
| 333 |  |  |  |  |  |  | } | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | sub _sort_groups | 
| 336 |  |  |  |  |  |  | { | 
| 337 | 2 |  |  | 2 |  | 4 | my($self)=@_; | 
| 338 | 2 |  |  |  |  | 4 | my($nodes,$depths,$groups); | 
| 339 |  |  |  |  |  |  |  | 
| 340 | 2 |  |  |  |  | 4 | $nodes=$self->{nodes}; | 
| 341 | 2 |  |  |  |  | 4 | $depths=$self->{depths}; | 
| 342 | 2 |  |  |  |  | 3 | $groups=$self->{groups}; | 
| 343 | 2 |  |  |  |  | 3 | foreach my $group (@{$groups}) { | 
|  | 2 |  |  |  |  | 6 |  | 
| 344 | 60 |  |  |  |  | 54 | my($maxdepth,$maxnode); | 
| 345 |  |  |  |  |  |  |  | 
| 346 | 60 |  |  |  |  | 52 | $maxdepth=-2; | 
| 347 | 60 |  |  |  |  | 54 | $maxnode=-1; | 
| 348 | 60 |  |  |  |  | 62 | foreach my $nodeix (@{$group}) { | 
|  | 60 |  |  |  |  | 81 |  | 
| 349 | 78 |  |  |  |  | 74 | my($depth); | 
| 350 |  |  |  |  |  |  |  | 
| 351 | 78 |  |  |  |  | 135 | $depth=$depths->[$nodeix]; | 
| 352 | 78 | 100 |  |  |  | 137 | if ($depth>$maxdepth) { | 
| 353 | 72 |  |  |  |  | 66 | $maxdepth=$depth; | 
| 354 | 72 |  |  |  |  | 122 | $maxnode=$nodeix; | 
| 355 |  |  |  |  |  |  | } | 
| 356 |  |  |  |  |  |  | } | 
| 357 | 60 |  |  |  |  | 126 | $group=[$maxdepth,$maxnode,$group]; | 
| 358 |  |  |  |  |  |  | } | 
| 359 | 2 | 50 |  |  |  | 18 | @{$groups}=sort { | 
|  | 221 |  |  |  |  | 672 |  | 
| 360 | 2 |  |  |  |  | 21 | $a->[0]<=>$b->[0] || $a->[1]<=>$b->[1]; | 
| 361 | 2 |  |  |  |  | 4 | } @{$groups}; | 
| 362 |  |  |  |  |  |  | } | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | sub _sort_nodes | 
| 365 |  |  |  |  |  |  | { | 
| 366 | 0 |  |  | 0 |  | 0 | my($self)=@_; | 
| 367 | 0 |  |  |  |  | 0 | my($groups,$nodes); | 
| 368 | 0 |  |  |  |  | 0 | my(@nodemap,@nodes,$newix,$oldmatch); | 
| 369 |  |  |  |  |  |  |  | 
| 370 | 0 |  |  |  |  | 0 | $groups=$self->{groups}; | 
| 371 | 0 |  |  |  |  | 0 | $nodes=$self->{nodes}; | 
| 372 | 0 |  |  |  |  | 0 | $newix=0; | 
| 373 | 0 |  |  |  |  | 0 | foreach my $group (@{$self->{groups}}) { | 
|  | 0 |  |  |  |  | 0 |  | 
| 374 | 0 |  |  |  |  | 0 | foreach my $nodeix (@{$group->[2]}) { | 
|  | 0 |  |  |  |  | 0 |  | 
| 375 | 0 |  |  |  |  | 0 | $nodemap[$nodeix]=$newix++; | 
| 376 |  |  |  |  |  |  | } | 
| 377 |  |  |  |  |  |  | } | 
| 378 | 0 |  |  |  |  | 0 | for my $oldmatch (delete $self->{match}) { | 
| 379 | 0 |  |  |  |  | 0 | for my $match ($self->{match}) { | 
| 380 | 0 |  |  |  |  | 0 | $match="\x00"; | 
| 381 | 0 |  |  |  |  | 0 | foreach my $nodeix (0..$#nodemap) { | 
| 382 | 0 |  |  |  |  | 0 | $nodes[$nodemap[$nodeix]]= | 
| 383 |  |  |  |  |  |  | pack("N*",@nodemap[unpack("N*",$nodes->[$nodeix])]); | 
| 384 | 0 | 0 |  |  |  | 0 | if (vec($oldmatch,$nodeix,1)) { | 
| 385 | 0 |  |  |  |  | 0 | vec($match,$nodemap[$nodeix],1)=1; | 
| 386 |  |  |  |  |  |  | } | 
| 387 |  |  |  |  |  |  | } | 
| 388 |  |  |  |  |  |  | } | 
| 389 |  |  |  |  |  |  | } | 
| 390 | 0 |  |  |  |  | 0 | $self->{nodes}=\@nodes; | 
| 391 | 0 |  |  |  |  | 0 | foreach my $group (@{$groups}) { | 
|  | 0 |  |  |  |  | 0 |  | 
| 392 | 0 |  |  |  |  | 0 | $group->[1]=$nodemap[$group->[1]]; | 
| 393 | 0 |  |  |  |  | 0 | @{$group->[2]}=@nodemap[@{$group->[2]}]; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 394 |  |  |  |  |  |  | } | 
| 395 |  |  |  |  |  |  | } | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | sub _final_charmap | 
| 398 |  |  |  |  |  |  | { | 
| 399 | 2 |  |  | 2 |  | 4 | my($self,$work)=@_; | 
| 400 | 2 |  |  |  |  | 4 | my($nodes,$groups,@histo,@reorder,@remap); | 
| 401 |  |  |  |  |  |  |  | 
| 402 | 2 |  |  |  |  | 4 | $nodes=$work->{nodes}; | 
| 403 | 2 |  |  |  |  | 14 | $groups=$work->{groups}; | 
| 404 | 2 |  |  |  |  | 4 | foreach my $group (@{$groups}) { | 
|  | 2 |  |  |  |  | 5 |  | 
| 405 | 60 |  |  |  |  | 85 | for my $node ($nodes->[$group->[1]]) { | 
| 406 | 60 |  |  |  |  | 121 | foreach my $charix (0 .. length($node)/4-1) { | 
| 407 | 314 | 100 |  |  |  | 609 | if (vec($node,$charix,32)) { | 
| 408 | 67 |  |  |  |  | 223 | $histo[$charix]++; | 
| 409 |  |  |  |  |  |  | } | 
| 410 |  |  |  |  |  |  | } | 
| 411 |  |  |  |  |  |  | } | 
| 412 |  |  |  |  |  |  | } | 
| 413 | 64 |  |  |  |  | 76 | @reorder=sort { | 
| 414 | 2 |  |  |  |  | 13 | $histo[$b]<=>$histo[$a]; | 
| 415 |  |  |  |  |  |  | } (0 .. $#histo); | 
| 416 | 2 |  |  |  |  | 7 | foreach my $charix (0 .. $#reorder) { | 
| 417 | 30 |  |  |  |  | 46 | $remap[$reorder[$charix]]=$charix; | 
| 418 |  |  |  |  |  |  | } | 
| 419 | 2 |  |  |  |  | 9 | for my $triemap ($work->{charmap}) { | 
| 420 | 2 |  |  |  |  | 8 | for my $dawgmap ($self->{charmap}) { | 
| 421 | 2 |  |  |  |  | 5 | $dawgmap=""; | 
| 422 | 2 |  |  |  |  | 7 | foreach my $ord (0 .. length($triemap)/4-1) { | 
| 423 | 244 |  |  |  |  | 200 | my($charix); | 
| 424 |  |  |  |  |  |  |  | 
| 425 | 244 |  |  |  |  | 233 | $charix=vec($triemap,$ord,32); | 
| 426 | 244 | 100 |  |  |  | 407 | next unless $charix; | 
| 427 | 30 |  |  |  |  | 109 | vec($dawgmap,$ord,32)=$remap[$charix-1]+1; | 
| 428 |  |  |  |  |  |  | } | 
| 429 |  |  |  |  |  |  | } | 
| 430 |  |  |  |  |  |  | } | 
| 431 | 2 |  |  |  |  | 10 | $work->{remap}=\@remap; | 
| 432 |  |  |  |  |  |  | } | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | sub _build_dawg | 
| 435 |  |  |  |  |  |  | { | 
| 436 | 2 |  |  | 2 |  | 4 | my($self,$work)=@_; | 
| 437 |  |  |  |  |  |  |  | 
| 438 | 2 |  |  |  |  | 7 | for my $triematch ($work->{match}) { | 
| 439 | 2 |  |  |  |  | 6 | for my $dawgmatch ($self->{match}) { | 
| 440 | 2 |  |  |  |  | 3 | my($trienodes,$groups,$remap,@groupmap,@dawgnodes); | 
| 441 |  |  |  |  |  |  |  | 
| 442 | 2 |  |  |  |  | 5 | $dawgmatch="\x00"; | 
| 443 | 2 |  |  |  |  | 4 | $trienodes=$work->{nodes}; | 
| 444 | 2 |  |  |  |  | 4 | $groups=$work->{groups}; | 
| 445 | 2 |  |  |  |  | 3 | $remap=$work->{remap}; | 
| 446 | 2 |  |  |  |  | 3 | foreach my $dawgix (0 .. $#{$groups}) { | 
|  | 2 |  |  |  |  | 6 |  | 
| 447 | 60 |  |  |  |  | 53 | foreach my $trieix (@{$groups->[$dawgix]->[2]}) { | 
|  | 60 |  |  |  |  | 95 |  | 
| 448 | 78 |  |  |  |  | 159 | $groupmap[$trieix]=$dawgix; | 
| 449 |  |  |  |  |  |  | } | 
| 450 |  |  |  |  |  |  | } | 
| 451 | 2 |  |  |  |  | 5 | foreach my $dawgix (0 .. $#{$groups}) { | 
|  | 2 |  |  |  |  | 6 |  | 
| 452 | 60 |  |  |  |  | 59 | my($trieix); | 
| 453 |  |  |  |  |  |  |  | 
| 454 | 60 |  |  |  |  | 78 | $trieix=$groups->[$dawgix]->[1]; | 
| 455 | 60 |  |  |  |  | 87 | for my $trienode ($trienodes->[$trieix]) { | 
| 456 | 60 |  |  |  |  | 104 | for my $dawgnode ($dawgnodes[$dawgix]) { | 
| 457 | 60 |  |  |  |  | 77 | $dawgnode=""; | 
| 458 | 60 |  |  |  |  | 123 | foreach my $charix (0 .. length($trienode)/4-1) { | 
| 459 | 314 |  |  |  |  | 591 | my($nextix); | 
| 460 |  |  |  |  |  |  |  | 
| 461 | 314 |  |  |  |  | 502 | $nextix=vec($trienode,$charix,32); | 
| 462 | 314 | 100 |  |  |  | 818 | if ($nextix) { | 
| 463 | 67 |  |  |  |  | 384 | vec($dawgnode,$remap->[$charix],32)= | 
| 464 |  |  |  |  |  |  | $groupmap[$nextix]; | 
| 465 |  |  |  |  |  |  | } | 
| 466 |  |  |  |  |  |  | } | 
| 467 |  |  |  |  |  |  | } | 
| 468 |  |  |  |  |  |  | } | 
| 469 | 60 | 100 |  |  |  | 136 | if (vec($triematch,$trieix,1)) { | 
| 470 | 2 |  |  |  |  | 9 | vec($dawgmatch,$dawgix,1)=1; | 
| 471 |  |  |  |  |  |  | } | 
| 472 |  |  |  |  |  |  | } | 
| 473 | 2 |  |  |  |  | 15 | $self->{nodes}=\@dawgnodes; | 
| 474 |  |  |  |  |  |  | } | 
| 475 |  |  |  |  |  |  | } | 
| 476 |  |  |  |  |  |  | } | 
| 477 |  |  |  |  |  |  |  | 
| 478 |  |  |  |  |  |  | sub match | 
| 479 |  |  |  |  |  |  | { | 
| 480 | 48 |  |  | 48 | 1 | 4322 | my $self=shift(@_); | 
| 481 | 48 |  |  |  |  | 54 | my($nodes,$nodeix); | 
| 482 |  |  |  |  |  |  |  | 
| 483 | 48 |  |  |  |  | 78 | $nodes=$self->{nodes}; | 
| 484 | 48 |  |  |  |  | 85 | for my $charmap ($self->{charmap}) { | 
| 485 | 48 |  |  |  |  | 57 | $nodeix=STARTNODE; | 
| 486 | 48 |  |  |  |  | 198 | foreach my $char (split(//,$_[0])) { | 
| 487 | 276 |  |  |  |  | 837 | $nodeix=vec($nodes->[$nodeix],vec($charmap,ord($char),32)-1,32); | 
| 488 |  |  |  |  |  |  | } | 
| 489 |  |  |  |  |  |  | } | 
| 490 | 48 |  |  |  |  | 191 | return vec($self->{match},$nodeix,1); | 
| 491 |  |  |  |  |  |  | } | 
| 492 |  |  |  |  |  |  |  | 
| 493 |  |  |  |  |  |  | sub store | 
| 494 |  |  |  |  |  |  | { | 
| 495 | 2 |  |  | 2 | 1 | 3173 | my($self,$fh)=@_; | 
| 496 | 2 |  |  |  |  | 4 | my($nodes); | 
| 497 | 2 |  |  |  |  | 4 | my(@unmap,@nodes,$sizes,$map); | 
| 498 |  |  |  |  |  |  |  | 
| 499 | 2 |  |  |  |  | 4 | $nodes=$self->{nodes}; | 
| 500 | 2 |  |  |  |  | 6 | for my $charmap ($self->{charmap}) { | 
| 501 | 2 |  |  |  |  | 626 | foreach my $ord (0 .. length($charmap)/4-1) { | 
| 502 | 244 |  |  |  |  | 194 | my($charix); | 
| 503 |  |  |  |  |  |  |  | 
| 504 | 244 |  |  |  |  | 232 | $charix=vec($charmap,$ord,32); | 
| 505 | 244 | 100 |  |  |  | 399 | if ($charix) { | 
| 506 | 30 |  |  |  |  | 58 | $unmap[$charix-1]=$ord; | 
| 507 |  |  |  |  |  |  | } | 
| 508 |  |  |  |  |  |  | } | 
| 509 |  |  |  |  |  |  | } | 
| 510 | 2 |  |  |  |  | 15 | $map=pack("w*",@unmap); | 
| 511 | 2 |  |  |  |  | 127 | @nodes=map(pack("w*",unpack("N*",$_)), | 
| 512 | 2 |  |  |  |  | 6 | @{$nodes}[STARTNODE .. $#{$nodes}]); | 
|  | 2 |  |  |  |  | 8 |  | 
| 513 | 2 |  |  |  |  | 26 | $sizes=pack("w*",map(length($_),@nodes)); | 
| 514 | 2 |  |  |  |  | 11 | print $fh "dAWg",pack("N",1); | 
| 515 | 2 |  |  |  |  | 6 | print $fh pack("N",length($map)); | 
| 516 | 2 |  |  |  |  | 3 | print $fh pack("N",length($sizes)); | 
| 517 | 2 | 50 |  |  |  | 8 | print $fh $map if length($map); | 
| 518 | 2 |  |  |  |  | 3 | print $fh $sizes; | 
| 519 | 2 |  |  |  |  | 5 | print $fh $self->{match}; | 
| 520 | 2 |  |  |  |  | 5 | foreach my $node (@nodes) { | 
| 521 | 58 | 100 |  |  |  | 122 | print $fh $node if length($node); | 
| 522 |  |  |  |  |  |  | } | 
| 523 |  |  |  |  |  |  | } | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | sub load | 
| 526 |  |  |  |  |  |  | { | 
| 527 | 2 |  |  | 2 | 1 | 115 | my($class,$fh)=@_; | 
| 528 | 2 |  |  |  |  | 4 | my($self,@nodes,$nodecnt,$width); | 
| 529 | 0 |  |  |  |  | 0 | my($got,$mapsize,$sizessize,@sizes,$matchsize); | 
| 530 | 0 |  |  |  |  | 0 | my($used,$reachable); | 
| 531 |  |  |  |  |  |  |  | 
| 532 | 2 |  |  |  |  | 10 | $self={ | 
| 533 |  |  |  |  |  |  | charmap	=> "", | 
| 534 |  |  |  |  |  |  | nodes	=> \@nodes, | 
| 535 |  |  |  |  |  |  | match	=> "", | 
| 536 |  |  |  |  |  |  | }; | 
| 537 | 2 |  |  |  |  | 6 | bless($self,$class); | 
| 538 |  |  |  |  |  |  | { | 
| 539 | 2 |  |  |  |  | 3 | my($head,$magic,$version); | 
|  | 2 |  |  |  |  | 2 |  | 
| 540 |  |  |  |  |  |  |  | 
| 541 | 2 |  |  |  |  | 16 | $got=read($fh,$head,16); | 
| 542 | 2 | 50 |  |  |  | 8 | defined($got) | 
| 543 |  |  |  |  |  |  | or croak $!; | 
| 544 | 2 | 50 |  |  |  | 6 | $got==16 | 
| 545 |  |  |  |  |  |  | or croak "Unexpected EOF"; | 
| 546 | 2 |  |  |  |  | 10 | ($magic,$version,$mapsize,$sizessize)=unpack("A4NNN",$head); | 
| 547 | 2 | 50 |  |  |  | 7 | $magic eq "dAWg" | 
| 548 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 549 | 2 | 50 |  |  |  | 5 | $version==1 | 
| 550 |  |  |  |  |  |  | or croak "Unknown stored data version"; | 
| 551 | 2 | 50 |  |  |  | 7 | $sizessize>=1 | 
| 552 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 553 |  |  |  |  |  |  | } | 
| 554 | 2 | 50 |  |  |  | 6 | if ($mapsize>0) { | 
| 555 | 2 |  |  |  |  | 4 | my($packed,@unmap); | 
| 556 |  |  |  |  |  |  |  | 
| 557 | 2 |  |  |  |  | 5 | $got=read($fh,$packed,$mapsize); | 
| 558 | 2 | 50 |  |  |  | 6 | defined($got) | 
| 559 |  |  |  |  |  |  | or croak $!; | 
| 560 | 2 | 50 |  |  |  | 5 | $got==$mapsize | 
| 561 |  |  |  |  |  |  | or croak "Unexpected EOF"; | 
| 562 | 2 |  |  |  |  | 9 | @unmap=unpack("w*",$packed); | 
| 563 | 2 |  |  |  |  | 4 | $width=@unmap; | 
| 564 | 2 |  |  |  |  | 6 | for my $charmap ($self->{charmap}) { | 
| 565 | 2 |  |  |  |  | 5 | foreach my $charix (0 .. $#unmap) { | 
| 566 | 30 |  |  |  |  | 28 | my($ord); | 
| 567 |  |  |  |  |  |  |  | 
| 568 | 30 |  |  |  |  | 35 | $ord=$unmap[$charix]; | 
| 569 | 30 | 50 |  |  |  | 56 | if (vec($charmap,$ord,32)) { | 
| 570 | 0 |  |  |  |  | 0 | croak "Bad stored data"; | 
| 571 |  |  |  |  |  |  | } | 
| 572 | 30 |  |  |  |  | 80 | vec($charmap,$ord,32)=$charix+1; | 
| 573 |  |  |  |  |  |  | } | 
| 574 |  |  |  |  |  |  | } | 
| 575 |  |  |  |  |  |  | } else { | 
| 576 | 0 |  |  |  |  | 0 | $width=0; | 
| 577 |  |  |  |  |  |  | } | 
| 578 |  |  |  |  |  |  | { | 
| 579 | 2 |  |  |  |  | 3 | my($packed); | 
|  | 2 |  |  |  |  | 3 |  | 
| 580 |  |  |  |  |  |  |  | 
| 581 | 2 |  |  |  |  | 5 | $got=read($fh,$packed,$sizessize); | 
| 582 | 2 | 50 |  |  |  | 6 | defined($got) | 
| 583 |  |  |  |  |  |  | or croak $!; | 
| 584 | 2 | 50 |  |  |  | 8 | $got==$sizessize | 
| 585 |  |  |  |  |  |  | or croak "Unexpected EOF"; | 
| 586 | 2 |  |  |  |  | 13 | @sizes=unpack("w*",$packed); | 
| 587 | 2 |  |  |  |  | 4 | $nodecnt=@sizes; | 
| 588 | 2 |  |  |  |  | 7 | unshift(@sizes,0); | 
| 589 |  |  |  |  |  |  | } | 
| 590 |  |  |  |  |  |  |  | 
| 591 | 2 |  |  |  |  | 6 | $matchsize=int(($nodecnt+8)/8); | 
| 592 | 2 |  |  |  |  | 6 | $got=read($fh,$self->{match},$matchsize); | 
| 593 | 2 | 50 |  |  |  | 5 | defined($got) | 
| 594 |  |  |  |  |  |  | or croak $!; | 
| 595 | 2 | 50 |  |  |  | 6 | $got==$matchsize | 
| 596 |  |  |  |  |  |  | or croak "Unexpected EOF"; | 
| 597 |  |  |  |  |  |  |  | 
| 598 | 2 |  |  |  |  | 4 | $used=""; | 
| 599 | 2 |  |  |  |  | 2 | $reachable=""; | 
| 600 | 2 |  |  |  |  | 7 | vec($reachable,BADNODE,1)=1; | 
| 601 | 2 |  |  |  |  | 5 | $nodes[BADNODE]=""; | 
| 602 | 2 |  |  |  |  | 4 | vec($reachable,STARTNODE,1)=1; | 
| 603 | 2 |  |  |  |  | 5 | foreach my $nodeix (STARTNODE .. $nodecnt) { | 
| 604 | 58 |  |  |  |  | 97 | for my $node ($nodes[$nodeix]) { | 
| 605 | 58 |  |  |  |  | 54 | my($nodesize); | 
| 606 |  |  |  |  |  |  |  | 
| 607 | 58 |  |  |  |  | 66 | $nodesize=$sizes[$nodeix]; | 
| 608 | 58 | 100 |  |  |  | 92 | if ($nodesize>0) { | 
| 609 | 56 |  |  |  |  | 53 | my($packed,$nodewidth); | 
| 610 |  |  |  |  |  |  |  | 
| 611 | 56 |  |  |  |  | 95 | $got=read($fh,$packed,$nodesize); | 
| 612 | 56 | 50 |  |  |  | 100 | defined($got) | 
| 613 |  |  |  |  |  |  | or croak $!; | 
| 614 | 56 | 50 |  |  |  | 104 | $got==$nodesize | 
| 615 |  |  |  |  |  |  | or croak "Unexpected EOF"; | 
| 616 | 56 |  |  |  |  | 164 | $node=pack("N*",unpack("w*",$packed)); | 
| 617 | 56 |  |  |  |  | 75 | $nodewidth=length($node)/4; | 
| 618 | 56 | 50 |  |  |  | 112 | $nodewidth<=$width | 
| 619 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 620 | 56 | 50 |  |  |  | 106 | vec($node,$nodewidth-1,32) | 
| 621 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 622 | 56 |  |  |  |  | 90 | foreach my $charix (0 .. $nodewidth-1) { | 
| 623 | 304 |  |  |  |  | 249 | my($nextix); | 
| 624 |  |  |  |  |  |  |  | 
| 625 | 304 |  |  |  |  | 294 | $nextix=vec($node,$charix,32); | 
| 626 | 304 | 100 |  |  |  | 523 | next unless $nextix; | 
| 627 | 67 | 50 | 33 |  |  | 248 | if ($nextix>$nodecnt || $nextix<=$nodeix) { | 
| 628 | 0 |  |  |  |  | 0 | croak "Bad stored data"; | 
| 629 |  |  |  |  |  |  | } | 
| 630 | 67 |  |  |  |  | 168 | vec($used,$charix,1)=1; | 
| 631 | 67 |  |  |  |  | 260 | vec($reachable,$nextix,1)=1; | 
| 632 |  |  |  |  |  |  | } | 
| 633 |  |  |  |  |  |  | } else { | 
| 634 | 2 | 50 | 33 |  |  | 18 | $nodecnt==1 || vec($self->{match},$nodeix,1) | 
| 635 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 636 | 2 |  |  |  |  | 7 | $node=""; | 
| 637 |  |  |  |  |  |  | } | 
| 638 |  |  |  |  |  |  | } | 
| 639 |  |  |  |  |  |  | } | 
| 640 | 2 |  |  |  |  | 16 | $used =~ /\A\xFF*/; | 
| 641 | 2 |  |  |  |  | 10 | foreach my $charix ($+[0]*8 .. $width-1) { | 
| 642 | 6 | 50 |  |  |  | 14 | vec($used,$charix,1) | 
| 643 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 644 |  |  |  |  |  |  | } | 
| 645 | 2 |  |  |  |  | 6 | $reachable =~ /\A\xFF*/; | 
| 646 | 2 |  |  |  |  | 6 | foreach my $nodeix ($+[0]*8 .. $nodecnt) { | 
| 647 | 4 | 50 |  |  |  | 17 | vec($reachable,$nodeix,1) | 
| 648 |  |  |  |  |  |  | or croak "Bad stored data"; | 
| 649 |  |  |  |  |  |  | } | 
| 650 |  |  |  |  |  |  |  | 
| 651 | 2 |  |  |  |  | 11 | $self; | 
| 652 |  |  |  |  |  |  | } | 
| 653 |  |  |  |  |  |  |  | 
| 654 |  |  |  |  |  |  | my $dot_id_re= | 
| 655 |  |  |  |  |  |  | qr/^(?:[A-Za-z_][0-9A-Za-z_]*|-?(?:[0-9]+(?:\.[0-9]*)|\.[0-9]+))$/; | 
| 656 |  |  |  |  |  |  |  | 
| 657 |  |  |  |  |  |  | sub _dot_id | 
| 658 |  |  |  |  |  |  | { | 
| 659 | 0 |  |  | 0 |  |  | my($id)=@_; | 
| 660 |  |  |  |  |  |  |  | 
| 661 | 0 | 0 |  |  |  |  | if ($id =~ $dot_id_re) { | 
| 662 | 0 |  |  |  |  |  | $id; | 
| 663 |  |  |  |  |  |  | } else { | 
| 664 | 0 |  |  |  |  |  | $id =~ s/\"/\\\"/g; | 
| 665 | 0 |  |  |  |  |  | qq{"$id"}; | 
| 666 |  |  |  |  |  |  | } | 
| 667 |  |  |  |  |  |  | } | 
| 668 |  |  |  |  |  |  |  | 
| 669 |  |  |  |  |  |  | sub _dot_attrs | 
| 670 |  |  |  |  |  |  | { | 
| 671 | 0 |  |  | 0 |  |  | my($attrs)=@_; | 
| 672 | 0 |  |  |  |  |  | my(@attrs); | 
| 673 |  |  |  |  |  |  |  | 
| 674 | 0 |  |  |  |  |  | foreach my $name (sort(keys(%{$attrs}))) { | 
|  | 0 |  |  |  |  |  |  | 
| 675 | 0 |  |  |  |  |  | push(@attrs,_dot_id($name)."="._dot_id($attrs->{$name})); | 
| 676 |  |  |  |  |  |  | } | 
| 677 | 0 |  |  |  |  |  | join(", ",@attrs); | 
| 678 |  |  |  |  |  |  | } | 
| 679 |  |  |  |  |  |  |  | 
| 680 |  |  |  |  |  |  | my %default_options=( | 
| 681 |  |  |  |  |  |  | "" => { | 
| 682 |  |  |  |  |  |  | }, | 
| 683 |  |  |  |  |  |  | graph => { | 
| 684 |  |  |  |  |  |  | }, | 
| 685 |  |  |  |  |  |  | edge => { | 
| 686 |  |  |  |  |  |  | }, | 
| 687 |  |  |  |  |  |  | node => { | 
| 688 |  |  |  |  |  |  | shape	=> "circle", | 
| 689 |  |  |  |  |  |  | }, | 
| 690 |  |  |  |  |  |  | match => { | 
| 691 |  |  |  |  |  |  | shape	=> "doublecircle", | 
| 692 |  |  |  |  |  |  | }, | 
| 693 |  |  |  |  |  |  | start => { | 
| 694 |  |  |  |  |  |  | }, | 
| 695 |  |  |  |  |  |  | chars => { | 
| 696 |  |  |  |  |  |  | " " => { | 
| 697 |  |  |  |  |  |  | label => "SP", | 
| 698 |  |  |  |  |  |  | }, | 
| 699 |  |  |  |  |  |  | }, | 
| 700 |  |  |  |  |  |  | ); | 
| 701 |  |  |  |  |  |  |  | 
| 702 |  |  |  |  |  |  | sub _dot_break | 
| 703 |  |  |  |  |  |  | { | 
| 704 | 0 |  |  | 0 |  |  | my($fh)=@_; | 
| 705 |  |  |  |  |  |  |  | 
| 706 | 0 |  |  |  |  |  | for my $text ($_[1]) { | 
| 707 | 0 |  |  |  |  |  | my($pos,$len); | 
| 708 |  |  |  |  |  |  |  | 
| 709 | 0 |  |  |  |  |  | $pos=0; | 
| 710 | 0 |  |  |  |  |  | $len=length($text); | 
| 711 | 0 |  |  |  |  |  | while ($len>64) { | 
| 712 | 0 |  |  |  |  |  | my($break); | 
| 713 |  |  |  |  |  |  |  | 
| 714 | 0 |  |  |  |  |  | $break=rindex($text," ",$pos+64); | 
| 715 | 0 |  |  |  |  |  | print $fh "\t",substr($text,$pos,$break-$pos),"\n"; | 
| 716 | 0 |  |  |  |  |  | $pos=$break+1; | 
| 717 | 0 |  |  |  |  |  | $len=length($text)-$pos; | 
| 718 |  |  |  |  |  |  | } | 
| 719 | 0 |  |  |  |  |  | print $fh "\t",substr($text,$pos),";\n"; | 
| 720 |  |  |  |  |  |  | } | 
| 721 |  |  |  |  |  |  | } | 
| 722 |  |  |  |  |  |  |  | 
| 723 |  |  |  |  |  |  | sub write_dot | 
| 724 |  |  |  |  |  |  | { | 
| 725 | 0 |  |  | 0 | 1 |  | my($self,$fh,$options)=@_; | 
| 726 | 0 |  |  |  |  |  | my($nodes); | 
| 727 | 0 |  |  |  |  |  | my(@charunmap,@order); | 
| 728 |  |  |  |  |  |  |  | 
| 729 | 0 |  |  |  |  |  | $nodes=$self->{nodes}; | 
| 730 | 0 |  |  |  |  |  | for my $charmap ($self->{charmap}) { | 
| 731 | 0 |  |  |  |  |  | foreach my $ord (0 .. length($charmap)/4-1) { | 
| 732 | 0 |  |  |  |  |  | my($charix); | 
| 733 |  |  |  |  |  |  |  | 
| 734 | 0 |  |  |  |  |  | $charix=vec($charmap,$ord,32); | 
| 735 | 0 | 0 |  |  |  |  | next unless $charix; | 
| 736 | 0 |  |  |  |  |  | $charunmap[$charix-1]=chr($ord); | 
| 737 |  |  |  |  |  |  | } | 
| 738 |  |  |  |  |  |  | } | 
| 739 | 0 |  |  |  |  |  | @order=sort { | 
| 740 | 0 |  |  |  |  |  | $charunmap[$a] cmp $charunmap[$b]; | 
| 741 |  |  |  |  |  |  | } (0 .. $#charunmap); | 
| 742 |  |  |  |  |  |  |  | 
| 743 | 0 |  | 0 |  |  |  | $options||={}; | 
| 744 | 0 |  |  |  |  |  | while (my($key,$value)=each(%default_options)) { | 
| 745 | 0 |  | 0 |  |  |  | $options->{$key}||=$value; | 
| 746 |  |  |  |  |  |  | } | 
| 747 | 0 |  |  |  |  |  | $options->{startmatch}||={ | 
| 748 | 0 |  |  |  |  |  | %{$options->{start}}, | 
| 749 | 0 |  | 0 |  |  |  | %{$options->{match}}, | 
| 750 |  |  |  |  |  |  | }; | 
| 751 |  |  |  |  |  |  |  | 
| 752 |  |  |  |  |  |  | { | 
| 753 | 0 |  |  |  |  |  | my($id,$nullattrs); | 
|  | 0 |  |  |  |  |  |  | 
| 754 |  |  |  |  |  |  |  | 
| 755 | 0 |  |  |  |  |  | $id=$options->{id}; | 
| 756 | 0 |  |  |  |  |  | print $fh "digraph"; | 
| 757 | 0 | 0 |  |  |  |  | if (defined($id)) { | 
| 758 | 0 |  |  |  |  |  | print $fh " ",_dot_id($id); | 
| 759 |  |  |  |  |  |  | } | 
| 760 | 0 |  |  |  |  |  | print $fh " {\n"; | 
| 761 | 0 |  |  |  |  |  | $nullattrs=_dot_attrs($options->{""}); | 
| 762 | 0 | 0 |  |  |  |  | if ($nullattrs ne "") { | 
| 763 | 0 |  |  |  |  |  | print $fh "    $nullattrs;\n"; | 
| 764 |  |  |  |  |  |  | } | 
| 765 |  |  |  |  |  |  | } | 
| 766 |  |  |  |  |  |  |  | 
| 767 | 0 |  |  |  |  |  | foreach my $class (qw(graph node edge)) { | 
| 768 | 0 |  |  |  |  |  | my($classattrs); | 
| 769 |  |  |  |  |  |  |  | 
| 770 | 0 |  |  |  |  |  | $classattrs=_dot_attrs($options->{$class}); | 
| 771 | 0 | 0 |  |  |  |  | if ($classattrs ne "") { | 
| 772 | 0 |  |  |  |  |  | print $fh "    $class [$classattrs];\n"; | 
| 773 |  |  |  |  |  |  | } | 
| 774 |  |  |  |  |  |  | } | 
| 775 |  |  |  |  |  |  |  | 
| 776 | 0 | 0 |  |  |  |  | if ($options->{readable}) { | 
| 777 | 0 |  |  |  |  |  | for my $match ($self->{match}) { | 
| 778 | 0 |  |  |  |  |  | foreach my $nodeix (STARTNODE .. $#{$nodes}) { | 
|  | 0 |  |  |  |  |  |  | 
| 779 | 0 |  |  |  |  |  | my($attrs); | 
| 780 |  |  |  |  |  |  |  | 
| 781 | 0 | 0 |  |  |  |  | if (vec($match,$nodeix,1)) { | 
| 782 | 0 | 0 |  |  |  |  | if ($nodeix==STARTNODE) { | 
| 783 | 0 |  |  |  |  |  | $attrs=_dot_attrs($options->{startmatch}); | 
| 784 |  |  |  |  |  |  | } else { | 
| 785 | 0 |  |  |  |  |  | $attrs=_dot_attrs($options->{match}); | 
| 786 |  |  |  |  |  |  | } | 
| 787 |  |  |  |  |  |  | } else { | 
| 788 | 0 | 0 |  |  |  |  | if ($nodeix==STARTNODE) { | 
| 789 | 0 |  |  |  |  |  | $attrs=_dot_attrs($options->{start}); | 
| 790 |  |  |  |  |  |  | } else { | 
| 791 | 0 |  |  |  |  |  | $attrs=""; | 
| 792 |  |  |  |  |  |  | } | 
| 793 |  |  |  |  |  |  | } | 
| 794 | 0 | 0 |  |  |  |  | if ($attrs ne "") { | 
| 795 | 0 |  |  |  |  |  | print $fh "    $nodeix [$attrs];\n"; | 
| 796 |  |  |  |  |  |  | } | 
| 797 | 0 |  |  |  |  |  | for my $node ($nodes->[$nodeix]) { | 
| 798 | 0 |  |  |  |  |  | foreach my $charix (@order) { | 
| 799 | 0 |  |  |  |  |  | my($nextix,%attrs,$char); | 
| 800 |  |  |  |  |  |  |  | 
| 801 | 0 |  |  |  |  |  | $nextix=vec($node,$charix,32); | 
| 802 | 0 | 0 |  |  |  |  | next unless $nextix; | 
| 803 | 0 |  |  |  |  |  | $char=$charunmap[$charix]; | 
| 804 | 0 | 0 |  |  |  |  | $attrs=_dot_attrs({ | 
| 805 |  |  |  |  |  |  | label => $char, | 
| 806 | 0 |  |  |  |  |  | %{$options->{chars}->{$char} || {}}, | 
| 807 |  |  |  |  |  |  | }); | 
| 808 | 0 |  |  |  |  |  | print $fh "    $nodeix\->$nextix [$attrs];\n"; | 
| 809 |  |  |  |  |  |  | } | 
| 810 |  |  |  |  |  |  | } | 
| 811 |  |  |  |  |  |  | } | 
| 812 |  |  |  |  |  |  | } | 
| 813 |  |  |  |  |  |  | } else { | 
| 814 | 0 |  |  |  |  |  | for my $match ($self->{match}) { | 
| 815 | 0 |  |  |  |  |  | my($startattrs,$matchattrs,@matchids); | 
| 816 |  |  |  |  |  |  |  | 
| 817 | 0 | 0 |  |  |  |  | if (vec($match,STARTNODE,1)) { | 
| 818 | 0 |  |  |  |  |  | $startattrs=_dot_attrs($options->{startmatch}); | 
| 819 |  |  |  |  |  |  | } else { | 
| 820 | 0 |  |  |  |  |  | $startattrs=_dot_attrs($options->{start}); | 
| 821 |  |  |  |  |  |  | } | 
| 822 | 0 |  |  |  |  |  | $matchattrs=_dot_attrs($options->{match}); | 
| 823 | 0 | 0 |  |  |  |  | if ($startattrs ne "") { | 
| 824 | 0 | 0 |  |  |  |  | if ($startattrs eq $matchattrs) { | 
| 825 | 0 |  |  |  |  |  | push(@matchids,STARTNODE); | 
| 826 |  |  |  |  |  |  | } else  { | 
| 827 | 0 |  |  |  |  |  | print $fh "    ".STARTNODE." [$startattrs];\n"; | 
| 828 |  |  |  |  |  |  | } | 
| 829 |  |  |  |  |  |  | } | 
| 830 | 0 | 0 |  |  |  |  | if ($matchattrs ne "") { | 
| 831 | 0 |  |  |  |  |  | foreach my $nodeix (STARTNODE+1 .. $#{$nodes}) { | 
|  | 0 |  |  |  |  |  |  | 
| 832 | 0 | 0 |  |  |  |  | if (vec($match,$nodeix,1)) { | 
| 833 | 0 |  |  |  |  |  | push(@matchids,$nodeix); | 
| 834 |  |  |  |  |  |  | } | 
| 835 |  |  |  |  |  |  | } | 
| 836 | 0 | 0 |  |  |  |  | if (@matchids>=26/(length($matchattrs)+8)+1) { | 
| 837 | 0 |  |  |  |  |  | print $fh "    subgraph {\n"; | 
| 838 | 0 |  |  |  |  |  | print $fh "\tnode [$matchattrs];\n"; | 
| 839 | 0 |  |  |  |  |  | _dot_break($fh,join(" ",@matchids)); | 
| 840 | 0 |  |  |  |  |  | print $fh "    }\n"; | 
| 841 |  |  |  |  |  |  | } else { | 
| 842 | 0 |  |  |  |  |  | foreach my $nodeix (@matchids) { | 
| 843 | 0 |  |  |  |  |  | print $fh "    $nodeix [$matchattrs];\n"; | 
| 844 |  |  |  |  |  |  | } | 
| 845 |  |  |  |  |  |  | } | 
| 846 |  |  |  |  |  |  | } | 
| 847 |  |  |  |  |  |  | } | 
| 848 |  |  |  |  |  |  | { | 
| 849 | 0 |  |  |  |  |  | my(@charedges); | 
|  | 0 |  |  |  |  |  |  | 
| 850 |  |  |  |  |  |  |  | 
| 851 | 0 |  |  |  |  |  | foreach my $nodeix (STARTNODE .. $#{$nodes}) { | 
|  | 0 |  |  |  |  |  |  | 
| 852 | 0 |  |  |  |  |  | for my $node ($nodes->[$nodeix]) { | 
| 853 | 0 |  |  |  |  |  | foreach my $charix (@order) { | 
| 854 | 0 |  |  |  |  |  | my($nextix,%attrs,$char); | 
| 855 |  |  |  |  |  |  |  | 
| 856 | 0 |  |  |  |  |  | $nextix=vec($node,$charix,32); | 
| 857 | 0 | 0 |  |  |  |  | next unless $nextix; | 
| 858 | 0 |  |  |  |  |  | push(@{$charedges[$charix]},[$nodeix,$nextix]); | 
|  | 0 |  |  |  |  |  |  | 
| 859 |  |  |  |  |  |  | } | 
| 860 |  |  |  |  |  |  | } | 
| 861 |  |  |  |  |  |  | } | 
| 862 | 0 |  |  |  |  |  | foreach my $charix (@order) { | 
| 863 | 0 |  |  |  |  |  | my($char,$attrs,$edges); | 
| 864 |  |  |  |  |  |  |  | 
| 865 | 0 |  |  |  |  |  | $char=$charunmap[$charix]; | 
| 866 | 0 | 0 |  |  |  |  | $attrs=_dot_attrs({ | 
| 867 |  |  |  |  |  |  | label => $char, | 
| 868 | 0 |  |  |  |  |  | %{$options->{chars}->{$char} || {}}, | 
| 869 |  |  |  |  |  |  | }); | 
| 870 | 0 |  |  |  |  |  | $edges=$charedges[$charix]; | 
| 871 | 0 | 0 |  |  |  |  | if (@{$edges}>=26/(length($attrs)+8)+1) { | 
|  | 0 |  |  |  |  |  |  | 
| 872 | 0 |  |  |  |  |  | print $fh "    subgraph {\n"; | 
| 873 | 0 |  |  |  |  |  | print $fh "\tedge [$attrs];\n"; | 
| 874 | 0 |  |  |  |  |  | _dot_break( | 
| 875 | 0 |  |  |  |  |  | $fh,join(" ",map("$_->[0]\->$_->[1]",@{$edges}))); | 
| 876 | 0 |  |  |  |  |  | print $fh "    }\n"; | 
| 877 |  |  |  |  |  |  | } else { | 
| 878 | 0 |  |  |  |  |  | foreach my $edge (@{$edges}) { | 
|  | 0 |  |  |  |  |  |  | 
| 879 | 0 |  |  |  |  |  | print $fh "    $edge->[0]\->$edge->[1] [$attrs];\n"; | 
| 880 |  |  |  |  |  |  | } | 
| 881 |  |  |  |  |  |  | } | 
| 882 |  |  |  |  |  |  | } | 
| 883 |  |  |  |  |  |  | } | 
| 884 |  |  |  |  |  |  | } | 
| 885 | 0 | 0 |  |  |  |  | if (defined(my $groups=$self->{groups})) { | 
| 886 | 0 |  |  |  |  |  | foreach my $groupix (0 .. $#{$groups}) { | 
|  | 0 |  |  |  |  |  |  | 
| 887 | 0 |  |  |  |  |  | my($nodes); | 
| 888 |  |  |  |  |  |  |  | 
| 889 | 0 |  |  |  |  |  | $nodes=$groups->[$groupix]->[2]; | 
| 890 | 0 | 0 |  |  |  |  | if (@{$nodes}>=2) { | 
|  | 0 |  |  |  |  |  |  | 
| 891 | 0 |  |  |  |  |  | print $fh "    subgraph cluster_$groupix {\n"; | 
| 892 | 0 |  |  |  |  |  | _dot_break($fh,join(" ",@{$nodes})); | 
|  | 0 |  |  |  |  |  |  | 
| 893 | 0 |  |  |  |  |  | print $fh "    }\n"; | 
| 894 |  |  |  |  |  |  | } | 
| 895 |  |  |  |  |  |  | } | 
| 896 |  |  |  |  |  |  | } | 
| 897 |  |  |  |  |  |  |  | 
| 898 | 0 |  |  |  |  |  | print $fh "}\n"; | 
| 899 |  |  |  |  |  |  | } | 
| 900 |  |  |  |  |  |  |  | 
| 901 |  |  |  |  |  |  | 1; | 
| 902 |  |  |  |  |  |  |  |