| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package String::Simrank; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | # Note to developers: | 
| 4 |  |  |  |  |  |  | # enable the VERSIONs (here and in the "use Inline C ..." | 
| 5 |  |  |  |  |  |  | # statement) to match each other when ready to install in sitelib or | 
| 6 |  |  |  |  |  |  | # when ready to distribute.  Otherwise, disable them | 
| 7 |  |  |  |  |  |  | # with commenting symbol for testing. | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | # use base 'Exporter'; | 
| 10 |  |  |  |  |  |  | # @EXPORT_OK = qw(add subtract); | 
| 11 | 1 |  |  | 1 |  | 31740 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 41 |  | 
| 12 | 1 |  |  | 1 |  | 5 | use File::Basename; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 128 |  | 
| 13 | 1 |  |  | 1 |  | 1163 | use IO::File; | 
|  | 1 |  |  |  |  | 12744 |  | 
|  | 1 |  |  |  |  | 166 |  | 
| 14 | 1 |  |  | 1 |  | 10 | use Fcntl; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 499 |  | 
| 15 | 1 |  |  | 1 |  | 11025 | use Data::Dumper; | 
|  | 1 |  |  |  |  | 35031 |  | 
|  | 1 |  |  |  |  | 90 |  | 
| 16 | 1 |  |  | 1 |  | 1139 | use Storable; | 
|  | 1 |  |  |  |  | 4409 |  | 
|  | 1 |  |  |  |  | 90 |  | 
| 17 | 1 |  |  | 1 |  | 14 | use vars qw($VERSION $NAME); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 82 |  | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | $VERSION = '0.079'; | 
| 21 |  |  |  |  |  |  | $NAME = 'String::Simrank'; | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | # Must specify a NAME and VERSION parameter. The NAME must match your module's package | 
| 24 |  |  |  |  |  |  | # name. The VERSION parameter must match the module's $VERSION | 
| 25 |  |  |  |  |  |  | # variable and they must both be of the form /^\d\.\d\d$/. | 
| 26 |  |  |  |  |  |  | # DATA refers to the __DATA__ section near bottom of this file. | 
| 27 | 1 |  |  | 1 |  | 1223 | use Inline C => 'DATA'; | 
|  | 1 |  |  |  |  | 26131 |  | 
|  | 1 |  |  |  |  | 9 |  | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | =head1 NAME | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | String::Simrank - Rapid comparisons of many strings for k-mer similarity. | 
| 32 |  |  |  |  |  |  | =cut | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | use String::Simrank; | 
| 37 |  |  |  |  |  |  | my $sr = new String::Simrank( | 
| 38 |  |  |  |  |  |  | { data => 'db.fasta'}); | 
| 39 |  |  |  |  |  |  | $sr->formatdb(); | 
| 40 |  |  |  |  |  |  | $sr->match_oligos({ query => 'query.fasta' }); | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | You may utilize  k-mers of various sizes, e.g.: | 
| 43 |  |  |  |  |  |  | $sr->formatdb( { wordlen => 6 } ); | 
| 44 |  |  |  |  |  |  | =cut | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | The String::Simrank module allows rapid searches for similarity between query | 
| 49 |  |  |  |  |  |  | strings and a database of strings.  This module is maintained by molecular | 
| 50 |  |  |  |  |  |  | biologists who use it for searching for similarities among strings representing | 
| 51 |  |  |  |  |  |  | contiguous DNA or RNA sequences.  This program does not | 
| 52 |  |  |  |  |  |  | construct an alignment but rather finds the ratio | 
| 53 |  |  |  |  |  |  | of nmers (A.K.A. ngrams) that are shared between a query and database | 
| 54 |  |  |  |  |  |  | records. | 
| 55 |  |  |  |  |  |  | The input file should be fasta formatted (either aligned or unaligned) multiple | 
| 56 |  |  |  |  |  |  | sequence file. | 
| 57 |  |  |  |  |  |  | The memory consumption is moderate and grows linearly with the number of sequences | 
| 58 |  |  |  |  |  |  | and depends on the nmer size defined by the user.  Using 7-mers, ~20,000 strings | 
| 59 |  |  |  |  |  |  | each ~1,500 characters in length requires ~50 Mb. | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | The module can be used from the command line through the script | 
| 62 |  |  |  |  |  |  | C provided in the examples folder. | 
| 63 |  |  |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | By default the output is written to STDOUT and | 
| 65 |  |  |  |  |  |  | represents the similarity of each query string to the top hits in the | 
| 66 |  |  |  |  |  |  | the database.  The format is query_id, tab, best match database_id, colon, percent similarity, space | 
| 67 |  |  |  |  |  |  | second best match database_id, colon, percent similarity. | 
| 68 |  |  |  |  |  |  | query1  EscCol36:100.00 EscCol36:100.00 EscCol43:99.59  EscCol29:99.24  EscCol33:99.17  EscCol10:99.02 | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | =cut | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | =head1 METHODS | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | =cut | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | =head2 new( { data_path => $path }) | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | my $sr = new String::Simrank( { data => 'many_seqs.fasta' }); | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | Instantiates a new String::Simrank object. | 
| 81 |  |  |  |  |  |  | Each simrank database should live within its own String::Simrank object. | 
| 82 |  |  |  |  |  |  | In other words, each unique combination of a fasta database and n-mer | 
| 83 |  |  |  |  |  |  | length will have its own instance. | 
| 84 |  |  |  |  |  |  | If the database has already been formated then the wordlength will be | 
| 85 |  |  |  |  |  |  | read from the formatted database's .bin file. | 
| 86 |  |  |  |  |  |  | Parameters: | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | =over 4 | 
| 89 |  |  |  |  |  |  |  | 
| 90 |  |  |  |  |  |  | =item data | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | Sets the path of where the data fasta file is (Database sequences, fasta format). | 
| 93 |  |  |  |  |  |  | The same terminal directory is where the formated database will reside with the same | 
| 94 |  |  |  |  |  |  | base name but the .fasta will be substituted with .bin. | 
| 95 |  |  |  |  |  |  | Sequence names(identifiers) will be first 10 characters matched between the '>' and the first space character. | 
| 96 |  |  |  |  |  |  | Be sure the sequences names are unique within this string. | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | =back | 
| 99 |  |  |  |  |  |  |  | 
| 100 |  |  |  |  |  |  | =cut | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | sub new { | 
| 103 |  |  |  |  |  |  | # Todd Z. DeSantis, March 2008 | 
| 104 |  |  |  |  |  |  |  | 
| 105 | 0 |  |  | 0 | 1 |  | my ($class, $cl_args) = @_; | 
| 106 |  |  |  |  |  |  | # Parameter hr is called $cl_args since in | 
| 107 |  |  |  |  |  |  | # many instances these will come directly from the command line. | 
| 108 | 0 |  |  |  |  |  | my $self = bless {}, $class; | 
| 109 | 0 |  |  |  |  |  | $self->_check_arguments_for_new($cl_args); | 
| 110 |  |  |  |  |  |  | # print STDERR "you are using String::Simrank version $VERSION \n"; | 
| 111 | 0 |  |  |  |  |  | return $self; | 
| 112 |  |  |  |  |  |  | } | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | =head2 _check_arguments_for_new | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | Internally used function. | 
| 117 |  |  |  |  |  |  | Validates the 'data' argument needed so this instance knows what data | 
| 118 |  |  |  |  |  |  | file will be linked. Method checks that fasta file exists and is readable | 
| 119 |  |  |  |  |  |  | and creates the name which corresponds to the .bin file. | 
| 120 |  |  |  |  |  |  | If errors are found, program exits. | 
| 121 |  |  |  |  |  |  | Returns a hash or hash ref | 
| 122 |  |  |  |  |  |  | depending on the context of the method call. | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | =cut | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | sub _check_arguments_for_new { | 
| 128 |  |  |  |  |  |  | # Niels Larsen, May 2004. | 
| 129 |  |  |  |  |  |  | # Todd Z. DeSantis, March 2008. | 
| 130 |  |  |  |  |  |  |  | 
| 131 | 0 |  |  | 0 |  |  | my ($self, | 
| 132 |  |  |  |  |  |  | $cl_args,    # Command Line ARGument hash | 
| 133 |  |  |  |  |  |  | # not really required to come from the command line | 
| 134 |  |  |  |  |  |  | ) = @_; | 
| 135 |  |  |  |  |  |  |  | 
| 136 | 0 |  |  |  |  |  | my ( @errors, $error); | 
| 137 |  |  |  |  |  |  |  | 
| 138 | 0 | 0 |  |  |  |  | if ( $cl_args->{"data"} ) { | 
| 139 |  |  |  |  |  |  | # $cl_args->{"data"} = &Common::File::full_file_path( $cl_args->{"data"} ); | 
| 140 |  |  |  |  |  |  |  | 
| 141 | 0 | 0 |  |  |  |  | if ( -r $cl_args->{"data"}) { | 
| 142 | 0 |  |  |  |  |  | $cl_args->{"binary"} = $cl_args->{"data"}; | 
| 143 | 0 |  |  |  |  |  | $cl_args->{"binary"} =~ s/\.[^\.]+$//; | 
| 144 | 0 |  |  |  |  |  | $cl_args->{"binary"} .= ".bin"; | 
| 145 |  |  |  |  |  |  |  | 
| 146 | 0 |  |  |  |  |  | $self->{param}{data} = $cl_args->{"data"}; | 
| 147 | 0 |  |  |  |  |  | $self->{param}{binary} = $cl_args->{"binary"}; | 
| 148 | 0 | 0 | 0 |  |  |  | if (-e $cl_args->{"binary"} && -r $cl_args->{"binary"}) { | 
| 149 | 0 |  |  |  |  |  | print $cl_args->{"binary"} . " has been formatted\n"; | 
| 150 | 0 |  |  |  |  |  | $self->{binary_ready} = 1; | 
| 151 |  |  |  |  |  |  | } else { | 
| 152 | 0 |  |  |  |  |  | $self->{binary_ready} = 0; | 
| 153 |  |  |  |  |  |  | } | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | } else { | 
| 156 | 0 |  |  |  |  |  | push @errors, qq (Simrank.pm: Input fasta data file not found or not readable -> "$cl_args->{'data'}"); | 
| 157 |  |  |  |  |  |  | } | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | } else { | 
| 160 | 0 |  |  |  |  |  | push @errors, qq (Fasta sequence file must be specified with *data* argument); | 
| 161 |  |  |  |  |  |  | } | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | # Print errors if any and exit, | 
| 164 |  |  |  |  |  |  |  | 
| 165 | 0 | 0 |  |  |  |  | if ( @errors ) | 
| 166 |  |  |  |  |  |  | { | 
| 167 | 0 |  |  |  |  |  | foreach $error ( @errors ) | 
| 168 |  |  |  |  |  |  | { | 
| 169 | 0 |  |  |  |  |  | print STDERR "$error\n"; | 
| 170 |  |  |  |  |  |  | # &Common::Messages::echo_red( "ERROR: "); | 
| 171 |  |  |  |  |  |  | # &Common::Messages::echo( "$error\n" ); | 
| 172 |  |  |  |  |  |  | } | 
| 173 |  |  |  |  |  |  |  | 
| 174 | 0 |  |  |  |  |  | exit; | 
| 175 |  |  |  |  |  |  | } | 
| 176 |  |  |  |  |  |  | else { | 
| 177 |  |  |  |  |  |  |  | 
| 178 | 0 | 0 |  |  |  |  | wantarray ? return %{ $cl_args } : return $cl_args; | 
|  | 0 |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | } | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | =head2 formatdb({ wordlen => $int, minlen => $int, silent = $bool }) | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | $sr->formatdb({wordlen => 8, minlen => 500, silent => 0, valid_chars => 'ACGT'}); | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | From an input collection of strings (data), generates a binary file | 
| 188 |  |  |  |  |  |  | optimized for retrieval of k-mer similarities. The file will contain | 
| 189 |  |  |  |  |  |  | a pre-computed map of which sequences - given by their index | 
| 190 |  |  |  |  |  |  | number in the input data file - contain a given k-mer. All the | 
| 191 |  |  |  |  |  |  | valid k-mers of the given length are mapped. This means each | 
| 192 |  |  |  |  |  |  | k-mer from a query string can be used to look up the database strings | 
| 193 |  |  |  |  |  |  | it occurs in, very quickly. The memory consumption is moderate | 
| 194 |  |  |  |  |  |  | and grows linearly with the number of sequences; 20,000 takes | 
| 195 |  |  |  |  |  |  | 50-55 mb. | 
| 196 |  |  |  |  |  |  | The first characters between the '>' character and the first space | 
| 197 |  |  |  |  |  |  | is recognized as the string (sequence) identifier and should be unique for each record. | 
| 198 |  |  |  |  |  |  | The number of sequences found in the input collection | 
| 199 |  |  |  |  |  |  | is returned and the file xyz.bin is written to disk. | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | Parameters: | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | =over 4 | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | =item pre_subst | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | An integer that determines how to interpret some special characters in the strings. | 
| 208 |  |  |  |  |  |  | The substitution is done before k-mers are extracted. | 
| 209 |  |  |  |  |  |  | Use 1 to eliminate all periods(.), hypens(-), and space characters(\s, which includes | 
| 210 |  |  |  |  |  |  | \s,\t,\n). This is the default behavior. | 
| 211 |  |  |  |  |  |  | Use 2 to convert same characters as above into underscores(_). | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | =item wordlen | 
| 214 |  |  |  |  |  |  |  | 
| 215 |  |  |  |  |  |  | An integer that specifies the k-mer length to index for each record in the input data file. | 
| 216 |  |  |  |  |  |  | Default is 7. | 
| 217 |  |  |  |  |  |  |  | 
| 218 |  |  |  |  |  |  | =item minlen | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | Specifies the minimum string length for a database sequence to be worthy of indexing. | 
| 221 |  |  |  |  |  |  | Default is 50. | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | =item valid_chars | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | Specifies the characters to allow for formatting.  Entire k-mer must be composed | 
| 226 |  |  |  |  |  |  | of these characters otherwise it will be ignored. When not defined, all | 
| 227 |  |  |  |  |  |  | characters are valid. | 
| 228 |  |  |  |  |  |  | Default is undef (indicating all characters are valid) | 
| 229 |  |  |  |  |  |  |  | 
| 230 |  |  |  |  |  |  | =item silent | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | Specifies if caller wants progress messages printed to STDERR while formatting. | 
| 233 |  |  |  |  |  |  | Default is false, meaning not silent. | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | =back | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | =cut | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | sub formatdb { | 
| 241 |  |  |  |  |  |  | # Niels Larsen, November 2005. | 
| 242 |  |  |  |  |  |  | # Todd Z. DeSantis, July 2010 | 
| 243 |  |  |  |  |  |  |  | 
| 244 | 0 |  |  | 0 | 1 |  | my ( $self, $cl_args) = @_; | 
| 245 | 0 |  |  |  |  |  | $self->_check_arguments_for_formatdb($cl_args); | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | # Returns an integer. | 
| 248 |  |  |  |  |  |  |  | 
| 249 | 0 |  |  |  |  |  | my ( $data_fh, $bin_fh, $entry, @seq_oligos, $oligos, $oli2seqoffs, | 
| 250 |  |  |  |  |  |  | $seqnum, $oligo, @sids, $sids, $bytstr, $begpos, | 
| 251 |  |  |  |  |  |  | $seqtot, $bytlen, $olibeg, | 
| 252 |  |  |  |  |  |  | $oli_counts, $id, $seq, $count, $all_begpos, $all_bytlen, | 
| 253 |  |  |  |  |  |  | $valid_chars, $lastnums, $lastnum, @seqoffs, $off, @seqnums, | 
| 254 |  |  |  |  |  |  | $id_len); | 
| 255 |  |  |  |  |  |  | ## I think I can exlude $lastnums, never written to disk. | 
| 256 |  |  |  |  |  |  |  | 
| 257 | 0 |  |  |  |  |  | my $cl_wordlen = $self->{param}{wordlen}; | 
| 258 | 0 |  |  |  |  |  | my $cl_silent = $self->{param}{silent}; | 
| 259 | 0 |  |  |  |  |  | my $cl_binary = $self->{param}{binary}; | 
| 260 | 0 |  |  |  |  |  | my $cl_data = $self->{param}{data}; | 
| 261 | 0 | 0 |  |  |  |  | print "cl_data: $cl_data\n"  unless ($cl_silent); | 
| 262 | 0 | 0 |  |  |  |  | print "cl_binary: $cl_binary\n" unless ($cl_silent); | 
| 263 |  |  |  |  |  |  |  | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | # Find the number of sequences and the longest length identifier, | 
| 266 | 0 | 0 |  |  |  |  | print "Counting new fasta records and measuring size of identifiers... " if not $cl_silent; | 
| 267 | 0 |  |  |  |  |  | $seqtot = _count_records($cl_data); | 
| 268 | 0 | 0 |  |  |  |  | print "found $seqtot\n" if not $cl_silent; | 
| 269 |  |  |  |  |  |  |  | 
| 270 | 0 |  |  |  |  |  | $seqnum = 0; | 
| 271 | 0 |  |  |  |  |  | $oli_counts = ""; | 
| 272 | 0 |  |  |  |  |  | $valid_chars = $self->{param}{valid_chars}; | 
| 273 | 0 |  |  |  |  |  | $id_len = 0;  # init | 
| 274 |  |  |  |  |  |  |  | 
| 275 | 0 |  |  |  |  |  | $data_fh = new IO::File $self->{param}{data}, "r"; | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | { | 
| 278 | 0 |  |  |  |  |  | local $/ = '>'; | 
|  | 0 |  |  |  |  |  |  | 
| 279 | 0 | 0 |  |  |  |  | print "Mapping sub-sequences ... \n" unless ($cl_silent); | 
| 280 | 0 |  |  |  |  |  | while ( <$data_fh> ) { | 
| 281 | 0 | 0 |  |  |  |  | next if ($_ eq '>'); # toss the first empty record | 
| 282 | 0 |  |  |  |  |  | chomp $_; | 
| 283 |  |  |  |  |  |  |  | 
| 284 | 0 |  |  |  |  |  | my ($header, @seq_lines) = split /\n/, $_; | 
| 285 | 0 |  |  |  |  |  | my ($id) = split /\s/, $header; | 
| 286 | 0 |  |  |  |  |  | my $sequence = ''; | 
| 287 | 0 | 0 |  |  |  |  | if ($self->{param}{pre_subst} == 1) { | 
|  |  | 0 |  |  |  |  |  | 
| 288 | 0 |  |  |  |  |  | $sequence = join '', @seq_lines; | 
| 289 | 0 |  |  |  |  |  | $sequence =~ s/[\.\-\s]//g; | 
| 290 |  |  |  |  |  |  | } elsif ($self->{param}{pre_subst} == 2) { | 
| 291 | 0 |  |  |  |  |  | $sequence = join '_', @seq_lines; | 
| 292 | 0 |  |  |  |  |  | $sequence =~ s/[\.\-\s]/_/g; | 
| 293 |  |  |  |  |  |  | } | 
| 294 | 0 | 0 | 0 |  |  |  | next unless ($id && $sequence); | 
| 295 |  |  |  |  |  |  |  | 
| 296 | 0 |  |  |  |  |  | push @sids, $id; | 
| 297 |  |  |  |  |  |  | # format to a uniform character length AFTER longest is found | 
| 298 | 0 | 0 |  |  |  |  | $id_len = length($id) if (length($id) > $id_len); | 
| 299 |  |  |  |  |  |  |  | 
| 300 | 0 |  |  |  |  |  | @seq_oligos = _create_oligos( $sequence, $cl_wordlen, $valid_chars); | 
| 301 |  |  |  |  |  |  |  | 
| 302 | 0 |  |  |  |  |  | foreach $oligo ( @seq_oligos ) { | 
| 303 | 0 | 0 |  |  |  |  | if ( exists $oli2seqoffs->{ $oligo } ) { | 
| 304 | 0 |  |  |  |  |  | $lastnum = $lastnums->{ $oligo };   ## I think I can exlude $lastnums, never written to disk. | 
| 305 | 0 |  |  |  |  |  | $oli2seqoffs->{ $oligo } .= ",". ($seqnum - $lastnum); | 
| 306 |  |  |  |  |  |  | # Keep track of the hops needed to find this oligo in the seqnums | 
| 307 |  |  |  |  |  |  | } else { | 
| 308 | 0 |  |  |  |  |  | $oli2seqoffs->{ $oligo } = $seqnum; | 
| 309 |  |  |  |  |  |  | } | 
| 310 |  |  |  |  |  |  |  | 
| 311 | 0 |  |  |  |  |  | $lastnums->{ $oligo } = $seqnum; | 
| 312 |  |  |  |  |  |  | # keep track of the last $seqnum in which this $oligo was found. | 
| 313 |  |  |  |  |  |  | ## I think I can exlude $lastnums, never written to disk. | 
| 314 |  |  |  |  |  |  |  | 
| 315 |  |  |  |  |  |  | } | 
| 316 |  |  |  |  |  |  |  | 
| 317 | 0 |  |  |  |  |  | $oli_counts .= ",". ( scalar @seq_oligos ); | 
| 318 |  |  |  |  |  |  | # string-encoded ordered list of count of unique oligos in each seqnum | 
| 319 |  |  |  |  |  |  |  | 
| 320 | 0 |  |  |  |  |  | $seqnum++; | 
| 321 | 0 | 0 | 0 |  |  |  | if ( !$cl_silent && $seqnum % 5000 == 0 ) { | 
| 322 | 0 |  |  |  |  |  | print "$seqnum done of $seqtot ," . | 
| 323 | 0 |  |  |  |  |  | scalar(keys %{ $oli2seqoffs }) . " unique kmers found ...\n"; | 
| 324 |  |  |  |  |  |  | # this message is accurate since we just incremented | 
| 325 |  |  |  |  |  |  | # $seqnum.  So if we just finshed index 4 (which is the | 
| 326 |  |  |  |  |  |  | # fifth sequence, then $seqnum would have a value of 5" | 
| 327 |  |  |  |  |  |  | } | 
| 328 |  |  |  |  |  |  | } | 
| 329 |  |  |  |  |  |  | } | 
| 330 |  |  |  |  |  |  |  | 
| 331 | 0 |  |  |  |  |  | $seqtot = $seqnum; | 
| 332 |  |  |  |  |  |  | # this is true since $seqnum was incremented after last sequence | 
| 333 |  |  |  |  |  |  |  | 
| 334 | 0 | 0 |  |  |  |  | if ( not $cl_silent ) | 
| 335 |  |  |  |  |  |  | { | 
| 336 | 0 |  |  |  |  |  | print "Total sequence count: $seqnum of $seqtot ," . | 
| 337 | 0 |  |  |  |  |  | scalar(keys %{ $oli2seqoffs }) . " unique kmers found.\n"; | 
| 338 |  |  |  |  |  |  | } | 
| 339 | 0 |  |  |  |  |  | $data_fh->close; | 
| 340 |  |  |  |  |  |  |  | 
| 341 |  |  |  |  |  |  |  | 
| 342 | 0 |  |  |  |  |  | $self->{db_string_count} = $seqnum; | 
| 343 | 0 |  |  |  |  |  | $self->{unique_kmer_count} = scalar(keys %{ $oli2seqoffs }); | 
|  | 0 |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | # Write binary file. We write binary representations using syswrite | 
| 346 |  |  |  |  |  |  | # and save the word length and number of sequences (the rest can be | 
| 347 |  |  |  |  |  |  | # calculated) | 
| 348 |  |  |  |  |  |  |  | 
| 349 | 0 |  |  |  |  |  | eval { $bin_fh = IO::File->new( $cl_binary, 'w' ) }; | 
|  | 0 |  |  |  |  |  |  | 
| 350 | 0 | 0 |  |  |  |  | if ($@) { | 
| 351 | 0 |  |  |  |  |  | print STDERR $@; | 
| 352 |  |  |  |  |  |  | # such as: your vendor has not defined Fcntl macro O_LARGEFILE | 
| 353 | 0 | 0 |  |  |  |  | if ( not $bin_fh = new IO::File $cl_binary, "w") { | 
| 354 |  |  |  |  |  |  |  | 
| 355 | 0 |  |  |  |  |  | print STDERR "Could not syswrite-open file $cl_binary\n"; | 
| 356 | 0 |  |  |  |  |  | exit; | 
| 357 |  |  |  |  |  |  | } | 
| 358 |  |  |  |  |  |  | } | 
| 359 |  |  |  |  |  |  | # print STDERR Dumper($bin_fh); | 
| 360 | 0 | 0 |  |  |  |  | unless (defined $bin_fh) { | 
| 361 | 0 |  |  |  |  |  | print STDERR "A filehandle to $cl_binary has not been defined\n"; | 
| 362 | 0 |  |  |  |  |  | exit; | 
| 363 |  |  |  |  |  |  | } | 
| 364 |  |  |  |  |  |  |  | 
| 365 | 0 | 0 |  |  |  |  | if ( not $cl_silent ) { | 
| 366 | 0 |  |  |  |  |  | print "Total number of unique oligos: " . scalar(keys %{ $oli2seqoffs } ) . "\n"; | 
|  | 0 |  |  |  |  |  |  | 
| 367 | 0 |  |  |  |  |  | print "Writing oligo map to file ...\n"; | 
| 368 |  |  |  |  |  |  | } | 
| 369 |  |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | # Id length, word_length, and total number of sequences, | 
| 371 |  |  |  |  |  |  |  | 
| 372 | 0 |  |  |  |  |  | $bytstr = pack "A10", $id_len;  # the max_length is encoded as text | 
| 373 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 374 |  |  |  |  |  |  |  | 
| 375 | 0 |  |  |  |  |  | $bytstr = pack "A10", $cl_wordlen; | 
| 376 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 377 |  |  |  |  |  |  |  | 
| 378 | 0 |  |  |  |  |  | $bytstr = pack "A10", $seqtot; | 
| 379 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 380 |  |  |  |  |  |  |  | 
| 381 |  |  |  |  |  |  | # Short ids as a string of fixed-length character words, | 
| 382 | 0 |  |  |  |  |  | $bytstr = pack "A$id_len" x $seqtot, @sids; | 
| 383 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  | # Write byte strings of matching sequence numbers, while keeping | 
| 386 |  |  |  |  |  |  | # track of their byte position offsets, | 
| 387 | 0 |  |  |  |  |  | $begpos = sysseek $bin_fh, 0, 1;   # recommended "systell", perl book says | 
| 388 | 0 | 0 |  |  |  |  | $begpos = 0 if ($begpos =~ /^0/);  # sometimes $begpos gets set from sysseek as "0 but true" | 
| 389 |  |  |  |  |  |  | # I think this has something to do with not using the Fcntl | 
| 390 |  |  |  |  |  |  | # flags: O_WRONLY|O_CREAT|O_EXCL|O_LARGEFILE | 
| 391 |  |  |  |  |  |  | # since this behavoir emerged when I stopped using them. | 
| 392 |  |  |  |  |  |  | # BUT, I expect sysseek to return a positive int | 
| 393 |  |  |  |  |  |  | # since many bytes have been already written to this filehandle. | 
| 394 |  |  |  |  |  |  |  | 
| 395 | 0 |  |  |  |  |  | @seq_oligos = (); | 
| 396 |  |  |  |  |  |  |  | 
| 397 | 0 |  |  |  |  |  | $all_begpos = ""; | 
| 398 | 0 |  |  |  |  |  | $all_bytlen = ""; | 
| 399 |  |  |  |  |  |  |  | 
| 400 | 0 |  |  |  |  |  | $count = 0; | 
| 401 |  |  |  |  |  |  |  | 
| 402 | 0 |  |  |  |  |  | foreach $oligo ( sort(keys %{ $oli2seqoffs }) ) # "sort" added by tzd  Apr-11-2008 | 
|  | 0 |  |  |  |  |  |  | 
| 403 |  |  |  |  |  |  | { | 
| 404 |  |  |  |  |  |  |  | 
| 405 | 0 |  |  |  |  |  | @seqoffs = eval $oli2seqoffs->{ $oligo }; | 
| 406 |  |  |  |  |  |  | # print STDERR '@seqoffs:' . join(',', @seqoffs) . "\n"; | 
| 407 |  |  |  |  |  |  |  | 
| 408 | 0 |  |  |  |  |  | @seqnums = shift @seqoffs; | 
| 409 |  |  |  |  |  |  |  | 
| 410 |  |  |  |  |  |  |  | 
| 411 | 0 |  |  |  |  |  | foreach $off ( @seqoffs ) | 
| 412 |  |  |  |  |  |  | { | 
| 413 | 0 |  |  |  |  |  | push @seqnums, $seqnums[-1] + $off; | 
| 414 |  |  |  |  |  |  | # -1 looks-up the value of the last element in the @seqnums | 
| 415 |  |  |  |  |  |  |  | 
| 416 |  |  |  |  |  |  | } | 
| 417 |  |  |  |  |  |  | # print STDERR '@seqnums:' . join(',', @seqnums) . "\n"; | 
| 418 |  |  |  |  |  |  |  | 
| 419 | 0 |  |  |  |  |  | $bytstr = pack "I*", @seqnums;  # unsigned integers | 
| 420 |  |  |  |  |  |  | # the rest of this module | 
| 421 |  |  |  |  |  |  | # assumes that integers will | 
| 422 |  |  |  |  |  |  | # be 4-byte which is a fairly | 
| 423 |  |  |  |  |  |  | # safe assumption | 
| 424 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 425 |  |  |  |  |  |  |  | 
| 426 | 0 |  |  |  |  |  | $bytlen = length $bytstr; | 
| 427 |  |  |  |  |  |  |  | 
| 428 | 0 |  |  |  |  |  | $all_begpos .= ",$begpos"; | 
| 429 | 0 |  |  |  |  |  | $all_bytlen .= ",$bytlen"; | 
| 430 |  |  |  |  |  |  |  | 
| 431 | 0 |  |  |  |  |  | $begpos += $bytlen; | 
| 432 |  |  |  |  |  |  |  | 
| 433 | 0 |  |  |  |  |  | delete $oli2seqoffs->{ $oligo }; | 
| 434 |  |  |  |  |  |  |  | 
| 435 | 0 |  |  |  |  |  | push @seq_oligos, $oligo; | 
| 436 |  |  |  |  |  |  |  | 
| 437 | 0 |  |  |  |  |  | $count += 1; | 
| 438 |  |  |  |  |  |  | } | 
| 439 |  |  |  |  |  |  |  | 
| 440 | 0 |  |  |  |  |  | $oli2seqoffs = {}; | 
| 441 |  |  |  |  |  |  |  | 
| 442 | 0 |  |  |  |  |  | $olibeg = $begpos; | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | # Write the oligos found, | 
| 445 |  |  |  |  |  |  |  | 
| 446 | 0 |  |  |  |  |  | $bytstr = pack "A$cl_wordlen" x ( scalar @seq_oligos ), @seq_oligos; | 
| 447 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 448 |  |  |  |  |  |  |  | 
| 449 |  |  |  |  |  |  | # The begin positions of the corresponding run of sequence indices, | 
| 450 |  |  |  |  |  |  | # and their lengths, | 
| 451 |  |  |  |  |  |  |  | 
| 452 | 0 |  |  |  |  |  | $all_begpos =~ s/^,//;  # get rid of initial comma | 
| 453 | 0 |  |  |  |  |  | $bytstr = pack "I*", eval $all_begpos; | 
| 454 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 455 |  |  |  |  |  |  |  | 
| 456 | 0 |  |  |  |  |  | $all_bytlen =~ s/^,//; | 
| 457 | 0 |  |  |  |  |  | $bytstr = pack "I*", eval $all_bytlen; | 
| 458 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 459 |  |  |  |  |  |  |  | 
| 460 |  |  |  |  |  |  | # Store the number of unique oligos for every sequence, | 
| 461 |  |  |  |  |  |  |  | 
| 462 | 0 |  |  |  |  |  | $oli_counts =~ s/^,//; | 
| 463 | 0 |  |  |  |  |  | $bytstr = pack "I*", eval $oli_counts; | 
| 464 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 465 |  |  |  |  |  |  |  | 
| 466 |  |  |  |  |  |  | # Finally, we need to store the number of oligos encountered and the | 
| 467 |  |  |  |  |  |  | # byte position of where they were stored, | 
| 468 |  |  |  |  |  |  |  | 
| 469 | 0 |  |  |  |  |  | $bytstr = pack "A10", scalar @seq_oligos; | 
| 470 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 471 |  |  |  |  |  |  |  | 
| 472 | 0 |  |  |  |  |  | $bytstr = pack "A10", $olibeg; | 
| 473 | 0 |  |  |  |  |  | syswrite $bin_fh, $bytstr; | 
| 474 |  |  |  |  |  |  |  | 
| 475 | 0 |  |  |  |  |  | $bin_fh->close; | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  |  | 
| 478 | 0 | 0 |  |  |  |  | print "format binary file complete\n" unless ($cl_silent); | 
| 479 | 0 |  |  |  |  |  | return $seqnum; | 
| 480 |  |  |  |  |  |  | } | 
| 481 |  |  |  |  |  |  |  | 
| 482 |  |  |  |  |  |  | =head2 _check_arguments_for_formatdb | 
| 483 |  |  |  |  |  |  |  | 
| 484 |  |  |  |  |  |  | It checks that word length and minimum sequence length is reasonable. | 
| 485 |  |  |  |  |  |  | Sets silent to true by default. | 
| 486 |  |  |  |  |  |  | Sets pre_subst to 1 by default. | 
| 487 |  |  |  |  |  |  | Defines binary file name. | 
| 488 |  |  |  |  |  |  | If errors are found, program exits. | 
| 489 |  |  |  |  |  |  | Returns a hash or hash ref | 
| 490 |  |  |  |  |  |  | depending on the context of the method call. | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | =cut | 
| 493 |  |  |  |  |  |  |  | 
| 494 |  |  |  |  |  |  |  | 
| 495 |  |  |  |  |  |  | sub _check_arguments_for_formatdb { | 
| 496 |  |  |  |  |  |  | # Niels Larsen, May 2004. | 
| 497 |  |  |  |  |  |  | # Todd Z. DeSantis, March 2008. | 
| 498 | 0 |  |  | 0 |  |  | my ( $self, | 
| 499 |  |  |  |  |  |  | $cl_args,    # Command Line ARGument hash | 
| 500 |  |  |  |  |  |  | # not really reqired to come from the command line | 
| 501 |  |  |  |  |  |  | ) = @_; | 
| 502 |  |  |  |  |  |  |  | 
| 503 | 0 |  |  |  |  |  | my ( @errors, $error); | 
| 504 |  |  |  |  |  |  |  | 
| 505 |  |  |  |  |  |  | ## see if caller wants to constrain the alphabet | 
| 506 | 0 | 0 |  |  |  |  | if ( exists $cl_args->{valid_chars} ) { | 
| 507 | 0 |  |  |  |  |  | $self->{param}{valid_chars} = $cl_args->{valid_chars}; | 
| 508 |  |  |  |  |  |  | } else { | 
| 509 | 0 |  |  |  |  |  | $self->{param}{valid_chars} = undef; | 
| 510 |  |  |  |  |  |  | } | 
| 511 |  |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  | ## see if caller wants to make some pre-substitutions | 
| 513 | 0 | 0 |  |  |  |  | if ( exists $cl_args->{pre_subst} ) { | 
| 514 | 0 |  |  |  |  |  | $self->{param}{pre_subst} = $cl_args->{pre_subst}; | 
| 515 |  |  |  |  |  |  | } else { | 
| 516 | 0 |  |  |  |  |  | $self->{param}{pre_subst} = 1; | 
| 517 |  |  |  |  |  |  | } | 
| 518 |  |  |  |  |  |  |  | 
| 519 |  |  |  |  |  |  | # Ensure mininum sequence length is 1 or more and fill in 50 | 
| 520 |  |  |  |  |  |  | # if user didnt specify it | 
| 521 | 0 | 0 |  |  |  |  | if ( $cl_args->{"minlen"} ) | 
| 522 |  |  |  |  |  |  | { | 
| 523 | 0 | 0 |  |  |  |  | if ( $cl_args->{"minlen"} < 1 ) { | 
| 524 | 0 |  |  |  |  |  | push @errors, qq (Minimum sequence length should be 1 or more); | 
| 525 |  |  |  |  |  |  | } | 
| 526 |  |  |  |  |  |  | } | 
| 527 |  |  |  |  |  |  | else { | 
| 528 | 0 |  |  |  |  |  | $cl_args->{"minlen"} = 50; | 
| 529 |  |  |  |  |  |  | } | 
| 530 | 0 |  |  |  |  |  | $self->{param}{minlen} = $cl_args->{"minlen"}; | 
| 531 |  |  |  |  |  |  |  | 
| 532 |  |  |  |  |  |  | # Ensure word length is over 0 and less than | 
| 533 |  |  |  |  |  |  | # or equal to minlen and fill in 7 if the | 
| 534 |  |  |  |  |  |  | # user didnt specify it, | 
| 535 | 0 | 0 |  |  |  |  | if ( $cl_args->{"wordlen"} ) { | 
| 536 | 0 | 0 |  |  |  |  | if ( $cl_args->{"wordlen"} < 1 ) { | 
| 537 | 0 |  |  |  |  |  | push @errors, qq (Word length should be at least 1); | 
| 538 |  |  |  |  |  |  | } | 
| 539 | 0 | 0 |  |  |  |  | if ( $cl_args->{"wordlen"} > $cl_args->{"minlen"} ) { | 
| 540 | 0 |  |  |  |  |  | push @errors, qq (Word length no greater than the minlen); | 
| 541 |  |  |  |  |  |  | } | 
| 542 |  |  |  |  |  |  | } else { | 
| 543 | 0 |  |  |  |  |  | $cl_args->{"wordlen"} = 7; | 
| 544 |  |  |  |  |  |  | } | 
| 545 | 0 |  |  |  |  |  | $self->{param}{wordlen} = $cl_args->{"wordlen"}; | 
| 546 |  |  |  |  |  |  |  | 
| 547 | 0 | 0 |  |  |  |  | if ( not defined $cl_args->{"silent"} ) { | 
| 548 | 0 |  |  |  |  |  | $cl_args->{"silent"} = 0 | 
| 549 |  |  |  |  |  |  | } | 
| 550 | 0 |  |  |  |  |  | $self->{param}{silent} = $cl_args->{"silent"}; | 
| 551 |  |  |  |  |  |  |  | 
| 552 |  |  |  |  |  |  | # Print errors if any and exit, | 
| 553 | 0 | 0 |  |  |  |  | if ( @errors ) | 
| 554 |  |  |  |  |  |  | { | 
| 555 | 0 |  |  |  |  |  | foreach $error ( @errors ) | 
| 556 |  |  |  |  |  |  | { | 
| 557 | 0 |  |  |  |  |  | print STDERR "$error\n"; | 
| 558 |  |  |  |  |  |  | } | 
| 559 |  |  |  |  |  |  |  | 
| 560 | 0 |  |  |  |  |  | exit; | 
| 561 |  |  |  |  |  |  | } | 
| 562 |  |  |  |  |  |  | else { | 
| 563 | 0 | 0 |  |  |  |  | wantarray ? return %{ $cl_args } : return $cl_args; | 
|  | 0 |  |  |  |  |  |  | 
| 564 |  |  |  |  |  |  | } | 
| 565 |  |  |  |  |  |  |  | 
| 566 |  |  |  |  |  |  | } | 
| 567 |  |  |  |  |  |  |  | 
| 568 |  |  |  |  |  |  | =head2 match_oligos({query => $path}) | 
| 569 |  |  |  |  |  |  |  | 
| 570 |  |  |  |  |  |  | $sr->match_oligos({query => 'query.fasta'}); | 
| 571 |  |  |  |  |  |  | $sr->match_oligos({ query => 'query.fasta', | 
| 572 |  |  |  |  |  |  | outlen => 10, | 
| 573 |  |  |  |  |  |  | minpct => 95, | 
| 574 |  |  |  |  |  |  | reverse => 1, | 
| 575 |  |  |  |  |  |  | outfile => '/home/donny/sr_results.txt', | 
| 576 |  |  |  |  |  |  | }); | 
| 577 |  |  |  |  |  |  |  | 
| 578 |  |  |  |  |  |  | my $matches = $sr->match_oligos({query => 'query.fasta'}); | 
| 579 |  |  |  |  |  |  | print Dumper($matches); | 
| 580 |  |  |  |  |  |  | foreach my $k (keys %{$matches} ) { | 
| 581 |  |  |  |  |  |  | print "matches for $k :\n"; | 
| 582 |  |  |  |  |  |  | foreach my $hit ( @{ $matches->{$k} } ) { | 
| 583 |  |  |  |  |  |  | print "hit id:" . $hit->[0] . " perc:" . $hit->[1] . "\n"; | 
| 584 |  |  |  |  |  |  | } | 
| 585 |  |  |  |  |  |  | } | 
| 586 |  |  |  |  |  |  |  | 
| 587 |  |  |  |  |  |  | This routine quickly estimates the overall similarity between | 
| 588 |  |  |  |  |  |  | a given set of DNA or RNA sequence(s) and a background set | 
| 589 |  |  |  |  |  |  | of database sequences (usually homologues). | 
| 590 |  |  |  |  |  |  | It returns a sorted list of similarities as a | 
| 591 |  |  |  |  |  |  | table. The similarity between sequences A and B are the number | 
| 592 |  |  |  |  |  |  | of unique k-words (short subsequence) that they share, divided | 
| 593 |  |  |  |  |  |  | by the smallest total unique k-word count in either A or B. The result | 
| 594 |  |  |  |  |  |  | are scores that do not depend on sequence lengths. When | 
| 595 |  |  |  |  |  |  | called in void context, the routine prints to the given output | 
| 596 |  |  |  |  |  |  | file or STDOUT; otherwise a hash_ref is returned. | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | Parameters: | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | =over 4 | 
| 601 |  |  |  |  |  |  |  | 
| 602 |  |  |  |  |  |  | =item query | 
| 603 |  |  |  |  |  |  |  | 
| 604 |  |  |  |  |  |  | Sets the path where the query fasta file will be found. | 
| 605 |  |  |  |  |  |  | Required.  In future versions, query could be a data structure instead. | 
| 606 |  |  |  |  |  |  | Need to build an abstract iterator for this feature to be enabled. | 
| 607 |  |  |  |  |  |  |  | 
| 608 |  |  |  |  |  |  | =item minpct | 
| 609 |  |  |  |  |  |  |  | 
| 610 |  |  |  |  |  |  | A real number indicating the  the minimum percent match that should | 
| 611 |  |  |  |  |  |  | be output/returned. | 
| 612 |  |  |  |  |  |  | Default = 50. | 
| 613 |  |  |  |  |  |  |  | 
| 614 |  |  |  |  |  |  | =item outlen | 
| 615 |  |  |  |  |  |  |  | 
| 616 |  |  |  |  |  |  | An integer indicating  the maximum number of ranked db matches that | 
| 617 |  |  |  |  |  |  | should be output/returned. | 
| 618 |  |  |  |  |  |  | Default = 100. | 
| 619 |  |  |  |  |  |  |  | 
| 620 |  |  |  |  |  |  | =item valid_chars | 
| 621 |  |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | Specifies the characters to allow for kmer-searching.  Entire n-mer must be composed | 
| 623 |  |  |  |  |  |  | of these characters otherwise it will be ignored. When not defined, all | 
| 624 |  |  |  |  |  |  | characters are valid. | 
| 625 |  |  |  |  |  |  | Default is undef (indicating all characters are valid) | 
| 626 |  |  |  |  |  |  |  | 
| 627 |  |  |  |  |  |  | =item reverse | 
| 628 |  |  |  |  |  |  |  | 
| 629 |  |  |  |  |  |  | A boolean value. If true, reverses input sequence. | 
| 630 |  |  |  |  |  |  | Default = false. | 
| 631 |  |  |  |  |  |  |  | 
| 632 |  |  |  |  |  |  | =item noids | 
| 633 |  |  |  |  |  |  |  | 
| 634 |  |  |  |  |  |  | A boolean value. If true, prints database index numbers instead of sequence ids. | 
| 635 |  |  |  |  |  |  | Default = false. | 
| 636 |  |  |  |  |  |  |  | 
| 637 |  |  |  |  |  |  | =item outfile | 
| 638 |  |  |  |  |  |  |  | 
| 639 |  |  |  |  |  |  | Sets the path of the output file (instead of sending to STDOUT). | 
| 640 |  |  |  |  |  |  | Default = false.  Meaning output is sent to STDOUT. | 
| 641 |  |  |  |  |  |  |  | 
| 642 |  |  |  |  |  |  | =item silent | 
| 643 |  |  |  |  |  |  |  | 
| 644 |  |  |  |  |  |  | Specifies if caller wants progress messages printed to STDERR while matching. | 
| 645 |  |  |  |  |  |  | Default is false, meaning not silent. | 
| 646 |  |  |  |  |  |  |  | 
| 647 |  |  |  |  |  |  | =back | 
| 648 |  |  |  |  |  |  |  | 
| 649 |  |  |  |  |  |  | =cut | 
| 650 |  |  |  |  |  |  |  | 
| 651 |  |  |  |  |  |  |  | 
| 652 |  |  |  |  |  |  | sub match_oligos { | 
| 653 |  |  |  |  |  |  | # Niels Larsen, May 2004. | 
| 654 |  |  |  |  |  |  | # Todd Z. DeSantis, March 2008. | 
| 655 |  |  |  |  |  |  |  | 
| 656 | 0 |  |  | 0 | 1 |  | my ( $self, | 
| 657 |  |  |  |  |  |  | $cl_args,    # Arguments hash | 
| 658 |  |  |  |  |  |  | ) = @_; | 
| 659 | 0 |  |  |  |  |  | $self->_check_arguments_for_match_oligos($cl_args); | 
| 660 |  |  |  |  |  |  | # Returns a list of matches if not called in void context. | 
| 661 |  |  |  |  |  |  |  | 
| 662 | 0 |  |  |  |  |  | my ( $q_seqs, $count, $silent, @sids, $wordlen, $seqnum, $bin_fh, | 
| 663 |  |  |  |  |  |  | $query_fh, $bytstr, $olinum, $length, @oligos, $oligo, @scores, | 
| 664 |  |  |  |  |  |  | $oli2seqnums, $vector, @begpos, @endpos, $oli2pos, $i, @seqnums, | 
| 665 |  |  |  |  |  |  | $j, $seqtot, $score, @lengths, @temp, $valid_chars, $minscore, | 
| 666 |  |  |  |  |  |  | $olibeg, $olitot, $begpos, $endpos, $pack_bits, @scovec, $scovec, | 
| 667 |  |  |  |  |  |  | $query_sid, $out_fh, $outdir, $outline, $matches, @oli_counts, | 
| 668 |  |  |  |  |  |  | $oli_count, $scovec_null, $index, $outlen, $pos, $id, $seq, | 
| 669 |  |  |  |  |  |  | $id_len); | 
| 670 |  |  |  |  |  |  |  | 
| 671 | 0 |  |  |  |  |  | my $void_context = 1; | 
| 672 |  |  |  |  |  |  | # Recall 'wantarray' returns | 
| 673 |  |  |  |  |  |  | # True if the context is looking for a list value, | 
| 674 |  |  |  |  |  |  | # False if the context is looking for a scalar, | 
| 675 |  |  |  |  |  |  | # Undef if the context is looking for no value (void context). | 
| 676 | 0 | 0 |  |  |  |  | $void_context = 0 if (defined wantarray);  # caller wants data back | 
| 677 |  |  |  |  |  |  |  | 
| 678 | 0 |  |  |  |  |  | $silent = $self->{param}{silent}; | 
| 679 |  |  |  |  |  |  |  | 
| 680 |  |  |  |  |  |  | # >>>>>>>>>>>>>>>>>>>>>> FILE MAP SECTION <<<<<<<<<<<<<<<<<<<<<<<< | 
| 681 |  |  |  |  |  |  |  | 
| 682 |  |  |  |  |  |  | # This section creates $oli2pos, where each key is a subsequence | 
| 683 |  |  |  |  |  |  | # and values are [ file byte position, byte length to read ]. This | 
| 684 |  |  |  |  |  |  | # map is used below to sum up similarities. | 
| 685 |  |  |  |  |  |  |  | 
| 686 | 0 |  |  |  |  |  | eval { $bin_fh = IO::File->new( $self->{param}{binary}, O_RDONLY|O_LARGEFILE ) }; | 
|  | 0 |  |  |  |  |  |  | 
| 687 | 0 | 0 |  |  |  |  | if ($@) { | 
| 688 |  |  |  |  |  |  | # such as: your vendor has not defined Fcntl macro O_LARGEFILE | 
| 689 | 0 | 0 |  |  |  |  | if ( not $bin_fh = new IO::File $self->{param}{binary}, "r") { | 
| 690 |  |  |  |  |  |  |  | 
| 691 | 0 |  |  |  |  |  | print STDERR "Could not open file $self->{param}{binary} for reading\n"; | 
| 692 | 0 |  |  |  |  |  | exit; | 
| 693 |  |  |  |  |  |  | } | 
| 694 |  |  |  |  |  |  | } | 
| 695 |  |  |  |  |  |  |  | 
| 696 |  |  |  |  |  |  | # Word length and total number of sequences, | 
| 697 |  |  |  |  |  |  | # reminder: sysread FILEHANDLE,SCALAR,LENGTH | 
| 698 |  |  |  |  |  |  | # Attempts to read LENGTH bytes of data into variable SCALAR | 
| 699 |  |  |  |  |  |  | # from the specified FILEHANDLE, | 
| 700 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 10; | 
| 701 | 0 |  |  |  |  |  | $id_len = $bytstr * 1; | 
| 702 |  |  |  |  |  |  |  | 
| 703 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 10; | 
| 704 | 0 |  |  |  |  |  | $wordlen = $bytstr * 1; | 
| 705 |  |  |  |  |  |  |  | 
| 706 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 10; | 
| 707 | 0 |  |  |  |  |  | $seqtot = $bytstr * 1; | 
| 708 |  |  |  |  |  |  |  | 
| 709 |  |  |  |  |  |  | # String ids as a string of fixed-length character words, | 
| 710 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, $seqtot * $id_len; | 
| 711 | 0 |  |  |  |  |  | @sids = unpack "A$id_len" x $seqtot, $bytstr; | 
| 712 |  |  |  |  |  |  |  | 
| 713 |  |  |  |  |  |  | # Get the oligos and the begin and end positions of where the sequence | 
| 714 |  |  |  |  |  |  | # indices start and their lengths, | 
| 715 |  |  |  |  |  |  |  | 
| 716 | 0 |  |  |  |  |  | sysseek $bin_fh, -10, 2;   # goto 10 bytes before EOF | 
| 717 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 10; | 
| 718 | 0 |  |  |  |  |  | $olibeg = $bytstr * 1; | 
| 719 |  |  |  |  |  |  |  | 
| 720 | 0 |  |  |  |  |  | sysseek $bin_fh, -20, 2;   # goto 20 bytes before EOF | 
| 721 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 10; | 
| 722 | 0 |  |  |  |  |  | $olitot = $bytstr * 1; | 
| 723 |  |  |  |  |  |  |  | 
| 724 | 0 |  |  |  |  |  | sysseek $bin_fh, $olibeg, 0; | 
| 725 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, $olitot * $wordlen; | 
| 726 | 0 |  |  |  |  |  | @oligos = unpack "A$wordlen" x $olitot, $bytstr; | 
| 727 |  |  |  |  |  |  |  | 
| 728 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 4 * $olitot; | 
| 729 | 0 |  |  |  |  |  | @begpos = unpack "I*", $bytstr; | 
| 730 |  |  |  |  |  |  |  | 
| 731 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 4 * $olitot; | 
| 732 | 0 |  |  |  |  |  | @lengths = unpack "I*", $bytstr; | 
| 733 |  |  |  |  |  |  |  | 
| 734 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, 4 * $seqtot; | 
| 735 | 0 |  |  |  |  |  | @oli_counts = unpack "I*", $bytstr; | 
| 736 |  |  |  |  |  |  |  | 
| 737 |  |  |  |  |  |  | # Create a hash that returns the file positions of where to get the | 
| 738 |  |  |  |  |  |  | # matching sequence indices, | 
| 739 |  |  |  |  |  |  |  | 
| 740 | 0 |  |  |  |  |  | for ( $i = 0; $i < scalar @oligos; $i++ ) { | 
| 741 | 0 |  |  |  |  |  | $oli2pos->{ $oligos[$i] } = [ $begpos[$i], $lengths[$i] ]; | 
| 742 |  |  |  |  |  |  | } | 
| 743 |  |  |  |  |  |  |  | 
| 744 |  |  |  |  |  |  | # >>>>>>>>>>>>>>>>>>>> PROCESS QUERY ENTRIES <<<<<<<<<<<<<<<<<<<<<<< | 
| 745 |  |  |  |  |  |  |  | 
| 746 | 0 | 0 | 0 |  |  |  | if ( $void_context == 1 && $self->{param}{outfile} ) { | 
| 747 |  |  |  |  |  |  | # call was made in void context | 
| 748 | 0 |  |  |  |  |  | $out_fh = new IO::File $self->{param}{outfile}, "w"; | 
| 749 | 0 | 0 |  |  |  |  | print "Outfile opened.\n" unless ($self->{param}{silent}); | 
| 750 |  |  |  |  |  |  | } | 
| 751 |  |  |  |  |  |  |  | 
| 752 | 0 |  |  |  |  |  | $query_fh = new IO::File $self->{param}{"query"}, "r"; | 
| 753 | 0 |  |  |  |  |  | $scovec_null = pack "I*", (0) x $seqtot; | 
| 754 |  |  |  |  |  |  |  | 
| 755 | 0 |  | 0 |  |  |  | $minscore = ( $self->{param}{"minpct"} || 0 ) / 100; | 
| 756 |  |  |  |  |  |  |  | 
| 757 |  |  |  |  |  |  | { | 
| 758 | 0 |  |  |  |  |  | local $/ = '>'; | 
|  | 0 |  |  |  |  |  |  | 
| 759 | 0 |  |  |  |  |  | while ( <$query_fh>) { | 
| 760 | 0 | 0 |  |  |  |  | next if ($_ eq '>'); # toss the first empty record | 
| 761 | 0 |  |  |  |  |  | chomp $_; | 
| 762 | 0 |  |  |  |  |  | my ($header, @seq_lines) = split /\n/, $_; | 
| 763 | 0 |  |  |  |  |  | my ($query_sid) = split /\s/, $header; | 
| 764 | 0 |  |  |  |  |  | my $sequence = ''; | 
| 765 | 0 | 0 |  |  |  |  | if ($self->{param}{pre_subst} == 1) { | 
|  |  | 0 |  |  |  |  |  | 
| 766 | 0 |  |  |  |  |  | $sequence = join '', @seq_lines; | 
| 767 | 0 |  |  |  |  |  | $sequence =~ s/[\.\-\s]//g; | 
| 768 |  |  |  |  |  |  | } elsif ($self->{param}{pre_subst} == 2) { | 
| 769 | 0 |  |  |  |  |  | $sequence = join '_', @seq_lines; | 
| 770 | 0 |  |  |  |  |  | $sequence =~ s/[\.\-\s]/_/g; | 
| 771 |  |  |  |  |  |  | } | 
| 772 |  |  |  |  |  |  |  | 
| 773 | 0 | 0 | 0 |  |  |  | next unless ($query_sid && $sequence); | 
| 774 |  |  |  |  |  |  |  | 
| 775 | 0 | 0 |  |  |  |  | if ( $self->{param}{"reverse"} ) { | 
| 776 | 0 |  |  |  |  |  | $sequence = reverse $sequence; | 
| 777 |  |  |  |  |  |  | } | 
| 778 |  |  |  |  |  |  |  | 
| 779 | 0 | 0 |  |  |  |  | print "Processing $query_sid ... \n" unless  ( $silent ); | 
| 780 |  |  |  |  |  |  |  | 
| 781 | 0 |  |  |  |  |  | $scovec = $scovec_null; | 
| 782 |  |  |  |  |  |  |  | 
| 783 |  |  |  |  |  |  | # >>>>>>>>>>>>>>>>>>>>>> COUNTING SECTION <<<<<<<<<<<<<<<<<<<<<<<< | 
| 784 |  |  |  |  |  |  |  | 
| 785 |  |  |  |  |  |  | # Look up the sequences that contain each of the oligos found | 
| 786 |  |  |  |  |  |  | # in the sequence. The sort makes the disk jump around less, | 
| 787 |  |  |  |  |  |  | # since the oligos were written in sorted order to the bin file | 
| 788 |  |  |  |  |  |  |  | 
| 789 | 0 |  |  |  |  |  | @oligos = _create_oligos( $sequence, $wordlen, $self->{param}{valid_chars} ); | 
| 790 |  |  |  |  |  |  |  | 
| 791 | 0 |  |  |  |  |  | foreach $oligo ( sort @oligos ) {  # "sort" added by tzd  Apr-11-2008 | 
| 792 |  |  |  |  |  |  |  | 
| 793 | 0 | 0 |  |  |  |  | if ( defined ( $pos = $oli2pos->{ $oligo }->[0] ) ) { | 
| 794 | 0 |  |  |  |  |  | sysseek $bin_fh, $pos, 0; | 
| 795 |  |  |  |  |  |  |  | 
| 796 | 0 |  |  |  |  |  | $length = $oli2pos->{ $oligo }->[1]; | 
| 797 | 0 |  |  |  |  |  | sysread $bin_fh, $bytstr, $length; | 
| 798 |  |  |  |  |  |  |  | 
| 799 |  |  |  |  |  |  | # $bytstr is a perl string. Every 4-byte block | 
| 800 |  |  |  |  |  |  | # in this string encodes a 4 byte integer | 
| 801 |  |  |  |  |  |  | # so >4 billion integers are available. | 
| 802 |  |  |  |  |  |  | # but when passed to C | 
| 803 |  |  |  |  |  |  | # it will be an integer array. | 
| 804 |  |  |  |  |  |  |  | 
| 805 |  |  |  |  |  |  | # This is a C function for speed, the code is at the end of this | 
| 806 |  |  |  |  |  |  | # file. The third argument is the maximum index of $bytstr array, | 
| 807 |  |  |  |  |  |  |  | 
| 808 |  |  |  |  |  |  | ####### testing only | 
| 809 |  |  |  |  |  |  | # print STDERR join (',', (unpack "I*", $scovec)) . "\n"; | 
| 810 |  |  |  |  |  |  | # print STDERR join (',', (unpack "I*", $bytstr)) . "\n"; | 
| 811 |  |  |  |  |  |  | ####### | 
| 812 |  |  |  |  |  |  |  | 
| 813 | 0 |  |  |  |  |  | &update_scores_C( \$bytstr, \$scovec, ( $length - 1 ) / 4 ); | 
| 814 |  |  |  |  |  |  |  | 
| 815 |  |  |  |  |  |  | ####### testing only | 
| 816 |  |  |  |  |  |  | # print STDERR '$query_sid:' . $query_sid . | 
| 817 |  |  |  |  |  |  | #	' $oligo:' . $oligo . "\n" . | 
| 818 |  |  |  |  |  |  | #	Dumper(unpack "I*", $scovec) . "\n"; | 
| 819 |  |  |  |  |  |  | ####### | 
| 820 |  |  |  |  |  |  | } | 
| 821 |  |  |  |  |  |  | } | 
| 822 |  |  |  |  |  |  |  | 
| 823 | 0 |  |  |  |  |  | @scovec = unpack "I*", $scovec; | 
| 824 |  |  |  |  |  |  |  | 
| 825 |  |  |  |  |  |  | # >>>>>>>>>>>>>>>>>>>>>> EXTRACT OUTPUT DATA <<<<<<<<<<<<<<<<<<<<<< | 
| 826 |  |  |  |  |  |  |  | 
| 827 |  |  |  |  |  |  | # The score vector, @scovec, now contains integer scores in some | 
| 828 |  |  |  |  |  |  | # slots, zeros in others. To get an ordered list of just the positive | 
| 829 |  |  |  |  |  |  | # scores we could grep and sort, but that would be slow because the | 
| 830 |  |  |  |  |  |  | # list is long. Instead, keep a list of scores that is only as long | 
| 831 |  |  |  |  |  |  | # as the output length; insert values that are higher than the last | 
| 832 |  |  |  |  |  |  | # element and ignore the rest, | 
| 833 |  |  |  |  |  |  |  | 
| 834 | 0 |  |  |  |  |  | $oli_count = scalar @oligos; | 
| 835 | 0 |  |  |  |  |  | $outlen = $self->{param}{"outlen"}; | 
| 836 |  |  |  |  |  |  |  | 
| 837 | 0 |  |  |  |  |  | @scores = (); | 
| 838 |  |  |  |  |  |  |  | 
| 839 | 0 |  |  |  |  |  | for ( $i = 0; $i < scalar @scovec; $i++ ) | 
| 840 |  |  |  |  |  |  | { | 
| 841 | 0 | 0 |  |  |  |  | if ( $scovec[$i] ) { | 
| 842 |  |  |  |  |  |  |  | 
| 843 |  |  |  |  |  |  | #### use for testing | 
| 844 |  |  |  |  |  |  | # print STDERR '$oli_count:' . $oli_count . '  $oli_counts[$i]:' . $oli_counts[$i] . '  $scovec[$i]:' . $scovec[$i] . "\n"; | 
| 845 |  |  |  |  |  |  | ##### | 
| 846 |  |  |  |  |  |  |  | 
| 847 | 0 | 0 |  |  |  |  | if ( $oli_count < $oli_counts[$i] ) { | 
| 848 | 0 |  |  |  |  |  | $score = $scovec[$i] / $oli_count; | 
| 849 |  |  |  |  |  |  | } else { | 
| 850 | 0 |  |  |  |  |  | $score = $scovec[$i] / $oli_counts[$i]; | 
| 851 |  |  |  |  |  |  | } | 
| 852 |  |  |  |  |  |  |  | 
| 853 | 0 | 0 |  |  |  |  | if ( $score >= $minscore ) | 
| 854 |  |  |  |  |  |  | { | 
| 855 | 0 | 0 |  |  |  |  | if ( scalar @scores < $outlen ) | 
|  |  | 0 |  |  |  |  |  | 
| 856 |  |  |  |  |  |  | { | 
| 857 | 0 |  |  |  |  |  | $index = _nearest_index( \@scores, $score ); | 
| 858 | 0 |  |  |  |  |  | splice @scores, $index, 0, [ $i, $score ]; | 
| 859 |  |  |  |  |  |  | } | 
| 860 |  |  |  |  |  |  | elsif ( $score > $scores[0]->[1] ) | 
| 861 |  |  |  |  |  |  | { | 
| 862 | 0 |  |  |  |  |  | $index = _nearest_index( \@scores, $score ); | 
| 863 | 0 |  |  |  |  |  | splice @scores, $index, 0, [ $i, $score ]; | 
| 864 | 0 |  |  |  |  |  | shift @scores; | 
| 865 |  |  |  |  |  |  | } | 
| 866 |  |  |  |  |  |  | } | 
| 867 |  |  |  |  |  |  | } | 
| 868 |  |  |  |  |  |  | } | 
| 869 |  |  |  |  |  |  |  | 
| 870 | 0 | 0 |  |  |  |  | if ( $cl_args->{"noids"} ) { | 
| 871 | 0 |  |  |  |  |  | foreach $score ( @scores ) { | 
| 872 | 0 |  |  |  |  |  | $score->[1] = sprintf "%.2f", 100 * $score->[1]; | 
| 873 |  |  |  |  |  |  | } | 
| 874 |  |  |  |  |  |  | } else { | 
| 875 | 0 |  |  |  |  |  | foreach $score ( @scores ) { | 
| 876 | 0 |  |  |  |  |  | $score->[0] = $sids[ $score->[0] ]; | 
| 877 | 0 |  |  |  |  |  | $score->[1] = sprintf "%.2f", 100 * $score->[1]; | 
| 878 |  |  |  |  |  |  | # Can we add a percision parameter here ? | 
| 879 |  |  |  |  |  |  | # may be able to save disk space in the output file by getting | 
| 880 |  |  |  |  |  |  | # just the 1/10 place for instance | 
| 881 |  |  |  |  |  |  | } | 
| 882 |  |  |  |  |  |  | } | 
| 883 |  |  |  |  |  |  |  | 
| 884 | 0 |  |  |  |  |  | @scores = reverse @scores; | 
| 885 |  |  |  |  |  |  |  | 
| 886 |  |  |  |  |  |  | # >>>>>>>>>>>>>>>>>>>>>> OUTPUT SECTION <<<<<<<<<<<<<<<<<<<<<<<<< | 
| 887 |  |  |  |  |  |  |  | 
| 888 |  |  |  |  |  |  | # If called in non-void context, generate a data structure. If | 
| 889 |  |  |  |  |  |  | # in void context, print tabular data to stdout or an output file | 
| 890 |  |  |  |  |  |  | # if specified, | 
| 891 |  |  |  |  |  |  |  | 
| 892 | 0 | 0 |  |  |  |  | if ( $void_context == 0 ) { | 
| 893 |  |  |  |  |  |  | # means caller wants a scalar back | 
| 894 |  |  |  |  |  |  | # which will be a hash ref | 
| 895 | 0 |  |  |  |  |  | $matches->{ $query_sid } = &Storable::dclone( \@scores ); | 
| 896 |  |  |  |  |  |  | } else { | 
| 897 | 0 |  |  |  |  |  | $outline = "$query_sid\t" . join "\t", map { $_->[0] .":". $_->[1] } @scores; | 
|  | 0 |  |  |  |  |  |  | 
| 898 |  |  |  |  |  |  |  | 
| 899 | 0 | 0 |  |  |  |  | if ( defined $out_fh ) { | 
| 900 | 0 |  |  |  |  |  | print $out_fh "$outline\n"; | 
| 901 |  |  |  |  |  |  | } else { | 
| 902 | 0 |  |  |  |  |  | print "$outline\n"; | 
| 903 |  |  |  |  |  |  | } | 
| 904 |  |  |  |  |  |  | } | 
| 905 |  |  |  |  |  |  |  | 
| 906 | 0 | 0 |  |  |  |  | unless ( $silent ) { | 
| 907 | 0 |  |  |  |  |  | print "$query_sid done\n"; | 
| 908 |  |  |  |  |  |  | # &Common::Messages::echo_green( qq (done\n) ); | 
| 909 |  |  |  |  |  |  | } | 
| 910 |  |  |  |  |  |  | } # end while query_fh | 
| 911 |  |  |  |  |  |  | } # end local block | 
| 912 | 0 |  |  |  |  |  | $query_fh->close; | 
| 913 | 0 |  |  |  |  |  | $bin_fh->close; | 
| 914 |  |  |  |  |  |  |  | 
| 915 |  |  |  |  |  |  | # Return data structure if non-void context, otherwise nothing, | 
| 916 | 0 | 0 |  |  |  |  | if ( $void_context == 0 ) { | 
| 917 |  |  |  |  |  |  | # means caller wants a scalar back | 
| 918 | 0 |  |  |  |  |  | return $matches; | 
| 919 |  |  |  |  |  |  | } else { | 
| 920 | 0 |  |  |  |  |  | return; | 
| 921 |  |  |  |  |  |  | } | 
| 922 |  |  |  |  |  |  | } | 
| 923 |  |  |  |  |  |  |  | 
| 924 |  |  |  |  |  |  | =head2 _check_arguments_for_match_oligos | 
| 925 |  |  |  |  |  |  |  | 
| 926 |  |  |  |  |  |  | Internally used function. | 
| 927 |  |  |  |  |  |  | Validates the arguments for outlen, minpct, reverse, | 
| 928 |  |  |  |  |  |  | query, noids, silent, pre_subst and output. | 
| 929 |  |  |  |  |  |  | If errors are found, program exits. | 
| 930 |  |  |  |  |  |  | Returns a hash or hash ref | 
| 931 |  |  |  |  |  |  | depending on the context of the method call. | 
| 932 |  |  |  |  |  |  |  | 
| 933 |  |  |  |  |  |  | =cut | 
| 934 |  |  |  |  |  |  |  | 
| 935 |  |  |  |  |  |  | sub _check_arguments_for_match_oligos { | 
| 936 |  |  |  |  |  |  | # Niels Larsen, May 2004. | 
| 937 |  |  |  |  |  |  | # Todd Z. DeSantis, March 2008. | 
| 938 | 0 |  |  | 0 |  |  | my ( $self, | 
| 939 |  |  |  |  |  |  | $cl_args,    # Command Line ARGument hash | 
| 940 |  |  |  |  |  |  | # not really reqired to come from the command line | 
| 941 |  |  |  |  |  |  | ) = @_; | 
| 942 |  |  |  |  |  |  |  | 
| 943 | 0 |  |  |  |  |  | my ( @errors, $error, $outdir ); | 
| 944 |  |  |  |  |  |  |  | 
| 945 |  |  |  |  |  |  | ## see if caller wants to constrain the alphabet | 
| 946 | 0 | 0 |  |  |  |  | if ( exists $cl_args->{valid_chars} ) { | 
| 947 | 0 |  |  |  |  |  | $self->{param}{valid_chars} = $cl_args->{valid_chars}; | 
| 948 |  |  |  |  |  |  | } else { | 
| 949 | 0 |  |  |  |  |  | $self->{param}{valid_chars} = undef; | 
| 950 |  |  |  |  |  |  | } | 
| 951 |  |  |  |  |  |  |  | 
| 952 |  |  |  |  |  |  | ## see if caller wants to make some pre-substitutions | 
| 953 | 0 | 0 |  |  |  |  | if ( exists $cl_args->{pre_subst} ) { | 
| 954 | 0 |  |  |  |  |  | $self->{param}{pre_subst} = $cl_args->{pre_subst}; | 
| 955 |  |  |  |  |  |  | } else { | 
| 956 | 0 |  |  |  |  |  | $self->{param}{pre_subst} = 1; | 
| 957 |  |  |  |  |  |  | } | 
| 958 |  |  |  |  |  |  |  | 
| 959 |  |  |  |  |  |  | # Ensure output list has at least one similarity in it and set | 
| 960 |  |  |  |  |  |  | # it to 100 if the user didnt specify it, | 
| 961 |  |  |  |  |  |  |  | 
| 962 | 0 | 0 |  |  |  |  | if ( $cl_args->{"outlen"} ) | 
| 963 |  |  |  |  |  |  | { | 
| 964 | 0 | 0 |  |  |  |  | if ( $cl_args->{"outlen"} < 1 ) { | 
| 965 | 0 |  |  |  |  |  | push @errors, qq (Output list length should be 1 or more); | 
| 966 |  |  |  |  |  |  | } | 
| 967 |  |  |  |  |  |  | } | 
| 968 |  |  |  |  |  |  | else { | 
| 969 | 0 |  |  |  |  |  | $cl_args->{"outlen"} = 100; | 
| 970 |  |  |  |  |  |  | } | 
| 971 | 0 |  |  |  |  |  | $self->{param}{outlen} = $cl_args->{"outlen"}; | 
| 972 |  |  |  |  |  |  |  | 
| 973 | 0 | 0 |  |  |  |  | if ($cl_args->{"minpct"} ) { | 
| 974 | 0 |  |  |  |  |  | $self->{param}{minpct} = $cl_args->{"minpct"}; | 
| 975 |  |  |  |  |  |  | }  else { | 
| 976 | 0 |  |  |  |  |  | $self->{param}{minpct} = 0; | 
| 977 |  |  |  |  |  |  | } | 
| 978 |  |  |  |  |  |  |  | 
| 979 | 0 | 0 |  |  |  |  | if ($cl_args->{reverse}) { | 
| 980 | 0 |  |  |  |  |  | $self->{param}{"reverse"} = $cl_args->{reverse}; | 
| 981 |  |  |  |  |  |  | } else { | 
| 982 | 0 |  |  |  |  |  | $self->{param}{"reverse"} = 0; | 
| 983 |  |  |  |  |  |  | } | 
| 984 |  |  |  |  |  |  | # The mandatory query: | 
| 985 |  |  |  |  |  |  | # must be a file path | 
| 986 |  |  |  |  |  |  | # in the future, allow a hash ref of the structure | 
| 987 |  |  |  |  |  |  | # $hr->{$id} = $sequence_string | 
| 988 |  |  |  |  |  |  |  | 
| 989 |  |  |  |  |  |  | # if file given, | 
| 990 |  |  |  |  |  |  | # check that its readable. If not given, error, | 
| 991 | 0 | 0 |  |  |  |  | if ( $cl_args->{"query"} ) { | 
| 992 |  |  |  |  |  |  | # $cl_args->{"query"} = &Common::File::full_file_path( $cl_args->{"query"} ); | 
| 993 |  |  |  |  |  |  |  | 
| 994 | 0 | 0 |  |  |  |  | if ( not -r $cl_args->{"query"} ) { | 
| 995 | 0 |  |  |  |  |  | push @errors, qq (Query file not found or not readable -> "$cl_args->{'query'}"); | 
| 996 |  |  |  |  |  |  | } | 
| 997 |  |  |  |  |  |  | } | 
| 998 |  |  |  |  |  |  | else { | 
| 999 | 0 |  |  |  |  |  | push @errors, qq (Query sequence file must be specified); | 
| 1000 |  |  |  |  |  |  | } | 
| 1001 | 0 |  |  |  |  |  | $self->{param}{query} = $cl_args->{"query"}; | 
| 1002 |  |  |  |  |  |  |  | 
| 1003 |  |  |  |  |  |  | # If output file given, check if it exists and if its directory | 
| 1004 |  |  |  |  |  |  | # does not exist | 
| 1005 | 0 | 0 |  |  |  |  | if ( $cl_args->{"outfile"} ) { | 
| 1006 |  |  |  |  |  |  |  | 
| 1007 | 0 | 0 |  |  |  |  | if ( -e $cl_args->{"outfile"} )        { | 
| 1008 | 0 |  |  |  |  |  | push @errors, qq (Output file exists -> "$cl_args->{'outfile'}"); | 
| 1009 |  |  |  |  |  |  | } | 
| 1010 |  |  |  |  |  |  |  | 
| 1011 | 0 |  |  |  |  |  | $outdir = File::Basename::dirname( $cl_args->{"outfile"} ); | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 | 0 | 0 |  |  |  |  | if ( not -d $outdir ) { | 
|  |  | 0 |  |  |  |  |  | 
| 1014 | 0 |  |  |  |  |  | push @errors, qq (Output directory does not exist -> "$outdir"); | 
| 1015 |  |  |  |  |  |  | } elsif ( not -w $outdir ) { | 
| 1016 | 0 |  |  |  |  |  | push @errors, qq (Output directory is not writable -> "$outdir"); | 
| 1017 |  |  |  |  |  |  | } | 
| 1018 |  |  |  |  |  |  | } else { | 
| 1019 | 0 |  |  |  |  |  | $cl_args->{"outfile"} = ""; | 
| 1020 |  |  |  |  |  |  | } | 
| 1021 | 0 |  |  |  |  |  | $self->{param}{outfile} = $cl_args->{"outfile"}; | 
| 1022 |  |  |  |  |  |  | # print STDERR 'outfile:' . $self->{param}{outfile} . "\n"; | 
| 1023 |  |  |  |  |  |  |  | 
| 1024 | 0 | 0 |  |  |  |  | if ( defined $cl_args->{"silent"} ) { | 
| 1025 | 0 |  |  |  |  |  | $self->{param}{silent} = $cl_args->{"silent"}; | 
| 1026 |  |  |  |  |  |  | } | 
| 1027 | 0 | 0 |  |  |  |  | if ( not defined $self->{param}{silent}) { | 
| 1028 | 0 |  |  |  |  |  | $self->{param}{silent}  = 1; | 
| 1029 |  |  |  |  |  |  | } | 
| 1030 |  |  |  |  |  |  |  | 
| 1031 |  |  |  |  |  |  | # Print errors if any and exit, | 
| 1032 |  |  |  |  |  |  |  | 
| 1033 | 0 | 0 |  |  |  |  | if ( @errors ) | 
| 1034 |  |  |  |  |  |  | { | 
| 1035 | 0 |  |  |  |  |  | foreach $error ( @errors ) | 
| 1036 |  |  |  |  |  |  | { | 
| 1037 | 0 |  |  |  |  |  | print STDERR "$error\n"; | 
| 1038 |  |  |  |  |  |  | # &Common::Messages::echo_red( "ERROR: "); | 
| 1039 |  |  |  |  |  |  | # &Common::Messages::echo( "$error\n" ); | 
| 1040 |  |  |  |  |  |  | } | 
| 1041 |  |  |  |  |  |  |  | 
| 1042 | 0 |  |  |  |  |  | exit; | 
| 1043 |  |  |  |  |  |  | } | 
| 1044 |  |  |  |  |  |  | else { | 
| 1045 | 0 | 0 |  |  |  |  | wantarray ? return %{ $cl_args } : return $cl_args; | 
|  | 0 |  |  |  |  |  |  | 
| 1046 |  |  |  |  |  |  | } | 
| 1047 |  |  |  |  |  |  |  | 
| 1048 |  |  |  |  |  |  | } | 
| 1049 |  |  |  |  |  |  |  | 
| 1050 |  |  |  |  |  |  | =head2 _create_oligos | 
| 1051 |  |  |  |  |  |  |  | 
| 1052 |  |  |  |  |  |  | Internal method. | 
| 1053 |  |  |  |  |  |  | Creates a list of unique words of a given length from a given sequence. | 
| 1054 |  |  |  |  |  |  | Enforces k-mers composed of purely valid_char if requested. | 
| 1055 |  |  |  |  |  |  | Converts characters to upper case where possible.  This may become an option | 
| 1056 |  |  |  |  |  |  | in the future. | 
| 1057 |  |  |  |  |  |  |  | 
| 1058 |  |  |  |  |  |  | =cut | 
| 1059 |  |  |  |  |  |  |  | 
| 1060 |  |  |  |  |  |  |  | 
| 1061 |  |  |  |  |  |  | sub _create_oligos { | 
| 1062 |  |  |  |  |  |  | # Niels Larsen, November 2005. | 
| 1063 |  |  |  |  |  |  | # Todd Z. DeSantis, March 2008 | 
| 1064 |  |  |  |  |  |  |  | 
| 1065 |  |  |  |  |  |  | my ( | 
| 1066 | 0 |  |  | 0 |  |  | $str,        # Sequence string | 
| 1067 |  |  |  |  |  |  | $word_len,    # Word length | 
| 1068 |  |  |  |  |  |  | $valid_chars | 
| 1069 |  |  |  |  |  |  | ) = @_; | 
| 1070 |  |  |  |  |  |  |  | 
| 1071 |  |  |  |  |  |  |  | 
| 1072 | 0 |  |  |  |  |  | my (@oligos, %oligos, $i, $len); | 
| 1073 | 0 |  |  |  |  |  | $str = uc $str; | 
| 1074 |  |  |  |  |  |  |  | 
| 1075 | 0 |  |  |  |  |  | my @good_spans = (); | 
| 1076 | 0 | 0 |  |  |  |  | if ($valid_chars) { | 
| 1077 | 0 |  |  |  |  |  | @good_spans = split (/[^$valid_chars]/, $str);  # pattern match done once | 
| 1078 |  |  |  |  |  |  | # instead of for every k-mer | 
| 1079 |  |  |  |  |  |  | } else { | 
| 1080 |  |  |  |  |  |  | # put whole string as first element | 
| 1081 | 0 |  |  |  |  |  | $good_spans[0] = $str; | 
| 1082 |  |  |  |  |  |  | } | 
| 1083 | 0 |  |  |  |  |  | foreach my $good_span (@good_spans) { | 
| 1084 |  |  |  |  |  |  | # print STDERR "good_span: $good_span\n"; | 
| 1085 | 0 |  |  |  |  |  | my $span_len = length($good_span); | 
| 1086 | 0 | 0 |  |  |  |  | next if ($span_len < $word_len); | 
| 1087 | 0 |  |  |  |  |  | my $i; | 
| 1088 | 0 |  |  |  |  |  | for ( $i = 0; $i <= $span_len - $word_len; $i++ ) { | 
| 1089 | 0 |  |  |  |  |  | $oligos{ substr $good_span, $i, $word_len } = undef; | 
| 1090 |  |  |  |  |  |  | } | 
| 1091 |  |  |  |  |  |  | } | 
| 1092 |  |  |  |  |  |  |  | 
| 1093 | 0 |  |  |  |  |  | @oligos = keys %oligos; | 
| 1094 |  |  |  |  |  |  |  | 
| 1095 |  |  |  |  |  |  | ## use for testing: | 
| 1096 |  |  |  |  |  |  |  | 
| 1097 |  |  |  |  |  |  | #	foreach my $o (@oligos) { | 
| 1098 |  |  |  |  |  |  | #	    if ($o =~ /[^ACGT]/) { | 
| 1099 |  |  |  |  |  |  | # 		print STDERR $o . "\n"; | 
| 1100 |  |  |  |  |  |  | #	    } | 
| 1101 |  |  |  |  |  |  | #	} | 
| 1102 |  |  |  |  |  |  |  | 
| 1103 |  |  |  |  |  |  |  | 
| 1104 |  |  |  |  |  |  |  | 
| 1105 | 0 | 0 |  |  |  |  | wantarray ? return @oligos : \@oligos; | 
| 1106 |  |  |  |  |  |  | } | 
| 1107 |  |  |  |  |  |  |  | 
| 1108 |  |  |  |  |  |  | =head2 _nearest_index | 
| 1109 |  |  |  |  |  |  |  | 
| 1110 |  |  |  |  |  |  | Finds the index of a given sorted array where a given number | 
| 1111 |  |  |  |  |  |  | would fit in the sorted order. The returned index can then | 
| 1112 |  |  |  |  |  |  | be used to insert the number, for example. The array may | 
| 1113 |  |  |  |  |  |  | contain integers and/or floats, same for the number. | 
| 1114 |  |  |  |  |  |  |  | 
| 1115 |  |  |  |  |  |  | =cut | 
| 1116 |  |  |  |  |  |  |  | 
| 1117 |  |  |  |  |  |  | sub _nearest_index | 
| 1118 |  |  |  |  |  |  | { | 
| 1119 |  |  |  |  |  |  | # Niels Larsen, May 2004. | 
| 1120 | 0 |  |  | 0 |  |  | my ( $array,       # Array of numbers | 
| 1121 |  |  |  |  |  |  | $value,       # Lookup value | 
| 1122 |  |  |  |  |  |  | ) = @_; | 
| 1123 |  |  |  |  |  |  |  | 
| 1124 |  |  |  |  |  |  | # Returns an integer. | 
| 1125 |  |  |  |  |  |  |  | 
| 1126 | 0 |  |  |  |  |  | my ( $low, $high ) = ( 0, scalar @{ $array } ); | 
|  | 0 |  |  |  |  |  |  | 
| 1127 | 0 |  |  |  |  |  | my ( $cur ); | 
| 1128 |  |  |  |  |  |  |  | 
| 1129 | 0 |  |  |  |  |  | while ( $low < $high ) | 
| 1130 |  |  |  |  |  |  | { | 
| 1131 | 0 |  |  |  |  |  | $cur = int ( ( $low + $high ) / 2 ); | 
| 1132 |  |  |  |  |  |  |  | 
| 1133 | 0 | 0 |  |  |  |  | if ( $array->[$cur]->[1] < $value ) { | 
| 1134 | 0 |  |  |  |  |  | $low = $cur + 1; | 
| 1135 |  |  |  |  |  |  | } else { | 
| 1136 | 0 |  |  |  |  |  | $high = $cur; | 
| 1137 |  |  |  |  |  |  | } | 
| 1138 |  |  |  |  |  |  | } | 
| 1139 |  |  |  |  |  |  |  | 
| 1140 | 0 |  |  |  |  |  | return $low; | 
| 1141 |  |  |  |  |  |  | } | 
| 1142 |  |  |  |  |  |  |  | 
| 1143 |  |  |  |  |  |  | =head2 _count_records | 
| 1144 |  |  |  |  |  |  |  | 
| 1145 |  |  |  |  |  |  | Counts the number of records in a fasta formated file. | 
| 1146 |  |  |  |  |  |  |  | 
| 1147 |  |  |  |  |  |  | =cut | 
| 1148 |  |  |  |  |  |  |  | 
| 1149 |  |  |  |  |  |  | sub _count_records | 
| 1150 |  |  |  |  |  |  | { | 
| 1151 |  |  |  |  |  |  | # Todd Z. DeSantis, July 2010. | 
| 1152 | 0 |  |  | 0 |  |  | my ( $fasta_file ) = @_; | 
| 1153 |  |  |  |  |  |  |  | 
| 1154 |  |  |  |  |  |  | # Returns an integer. | 
| 1155 | 0 |  |  |  |  |  | my $c = 0; | 
| 1156 | 0 |  |  |  |  |  | my $ffh = new IO::File $fasta_file, "r"; | 
| 1157 | 0 |  |  |  |  |  | while (<$ffh>) { | 
| 1158 | 0 | 0 |  |  |  |  | $c++ if ($_ =~ /^\>/); | 
| 1159 |  |  |  |  |  |  | } | 
| 1160 | 0 |  |  |  |  |  | $ffh->close; | 
| 1161 | 0 |  |  |  |  |  | return $c; | 
| 1162 |  |  |  |  |  |  | } | 
| 1163 |  |  |  |  |  |  |  | 
| 1164 |  |  |  |  |  |  |  | 
| 1165 |  |  |  |  |  |  | 1; | 
| 1166 |  |  |  |  |  |  |  | 
| 1167 |  |  |  |  |  |  | =head2 _update_scores_C | 
| 1168 |  |  |  |  |  |  |  | 
| 1169 |  |  |  |  |  |  | Receives a list of indicies, score vector, and the final index | 
| 1170 |  |  |  |  |  |  | Updates the score vector by incrementing the oligo | 
| 1171 |  |  |  |  |  |  | match count for the correct sequences. | 
| 1172 |  |  |  |  |  |  |  | 
| 1173 |  |  |  |  |  |  | =cut | 
| 1174 |  |  |  |  |  |  |  | 
| 1175 |  |  |  |  |  |  | __DATA__ |