File Coverage

lib/Bio/Minimizer.pm
Criterion Covered Total %
statement 87 89 97.7
branch n/a
condition 6 6 100.0
subroutine 9 10 90.0
pod 1 3 33.3
total 103 108 95.3


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             # Minimizer.pm: a minimizer package for kmers
4             # Author: Lee Katz
5              
6             package Bio::Minimizer;
7             require 5.12.0;
8             our $VERSION=0.8;
9              
10 7     7   103087 use strict;
  7         18  
  7         231  
11 7     7   35 use warnings;
  7         7  
  7         233  
12 7     7   34 use File::Basename qw/basename/;
  7         9  
  7         457  
13 7     7   37 use Data::Dumper;
  7         11  
  7         619  
14 7     7   36 use Carp qw/carp croak/;
  7         12  
  7         349  
15              
16 7     7   1961 use List::MoreUtils qw/uniq/;
  7         45776  
  7         58  
17              
18             sub logmsg{
19 0     0 0 0 local $0 = basename $0;
20 0         0 print STDERR "$0: @_\n";
21             }
22              
23             =pod
24              
25             =head1 NAME
26              
27             Bio::Minimizer - minimizer package
28              
29             Based on the ideas put forth by Roberts et al 2004:
30             https://academic.oup.com/bioinformatics/article/20/18/3363/202143
31              
32             =head1 SYNOPSIS
33              
34             my $minimizer = Bio::Minimizer->new($sequenceString);
35             my $kmers = $minimizer->{kmers}; # hash of minimizer => kmer
36             my $minimizers= $minimizer->{minimizers};# hash of minimizer => [kmer1,kmer2,...]
37              
38             # hash of minimizer => [start1,start2,...]
39             # Start coordinates are on the fwd strand even when
40             # matched against the rev strand.
41             my $starts = $minimizer->{starts};
42              
43             # With more options
44             my $minimizer2= Bio::Minimizer->new($sequenceString,{k=>31,l=>21});
45              
46             =head1 DESCRIPTION
47              
48             Creates a set of minimizers from sequence
49              
50             =head1 EXAMPLES
51              
52             example: Sort a fastq file by minimizer, potentially
53             shrinking gzip size.
54              
55             This is implemented in this package's scripts/sort*.pl scripts.
56              
57             use Bio::Minimizer
58              
59             # Read fastq file via stdin, in this example
60             while(my $id = <>){
61             # Grab an entry
62             ($seq,$plus,$qual) = (scalar(<>), scalar(<>), scalar(<>));
63             chomp($id,$seq,$plus,$qual);
64              
65             # minimizer object
66             $MINIMIZER = Bio::Minimizer->new($seq,{k=>length($seq)});
67             # The only minimizer in this entry because k==length(seq)
68             $minMinimizer = (values(%{$$MINIMIZER{minimizers}}))[0];
69              
70             # combine the minimum minimizer with the entry, for
71             # sorting later.
72             # Save the entry as a string so that we don't have to
73             # parse it later.
74             my $entry = [$minMinimizer, "$id\n$seq\n$plus\n$qual\n"];
75             push(@entry,$entry);
76             }
77            
78             for my $e(sort {$$a[0] cmp $$b[0]} @entry){
79             print $$e[1];
80             }
81              
82             =head1 METHODS
83              
84             =over
85              
86             =item Bio::Minimizer->new()
87              
88             Arguments:
89             sequence A string of ACGT
90             settings A hash
91             k Kmer length
92             l Minimizer length (some might call it lmer)
93             numcpus Number of threads to use. (not used)
94              
95             =back
96              
97             =cut
98              
99             sub new{
100 4010     4010 1 679437 my($class,$sequence,$settings) = @_;
101              
102             # Extract from $settings or set defaults
103 4010   100     26145 my $k = $$settings{k} || 31;
104 4010   100     13896 my $l = $$settings{l} || 21;
105 4010   100     9480 my $numcpus = $$settings{numcpus} || 1;
106              
107             # Alter the sequence a bit
108 4010         11404 $sequence = uc($sequence); # work in uppercase only
109 4010         12552 $sequence =~ s/\s+//g; # Remove all whitespace
110              
111 4010         24869 my $self={
112             sequence => $sequence,
113             revcom => "", # revcom of sequence filled in by _minimizers()
114             k => $k, # kmer length
115             l => $l, # minimizer length
116             numcpus => $numcpus,
117            
118             # Filled in by _minimizers()
119             minimizers => {}, # kmer => minimizer
120             kmers => {}, # minimizer => [kmer1,kmer2,...]
121             starts => {}, # minimizer => [start1,start2,...]
122             };
123              
124 4010         9855 bless($self,$class);
125              
126             # Set $$self{minimizers} right away
127 4010         13199 $self->_minimizers($sequence);
128              
129 4010         19818 return $self;
130             }
131              
132             # Argument: string of nucleotides
133             sub _minimizers{
134 4010     4010   10162 my($self,$seq) = @_;
135              
136 4010         7138 my $seqLength = length($seq);
137              
138             # Also reverse-complement the sequence
139 4010         8996 my $revcom = reverse($seq);
140 4010         13552 $revcom =~ tr/ATCGatcg/TAGCtagc/;
141 4010         7435 $$self{revcom} = $revcom;
142              
143             # Length of kmers
144 4010         6742 my $k = $$self{k};
145              
146             # All sequence segments. Probably only seq and revcom.
147 4010         11679 my $fwdMinimizers = $self->minimizerWorker([$seq]);
148 4010         13389 my $revMinimizers = $self->minimizerWorker([$revcom]);
149              
150             # Merge minimizer hashes
151 4010         7930 my %MINIMIZER = (%{$$fwdMinimizers{minimizers}}, %{$$revMinimizers{minimizers}});
  4010         141401  
  4010         479667  
152 4010         57708 $$self{minimizers} = \%MINIMIZER;
153              
154             # Merge start site hashes
155 4010         7151 my %START;
156 4010         288469 for my $m(uniq(values(%MINIMIZER))){
157             # Add all start sites for fwd site minimizers
158 312753         328169 for my $pos(@{ $$fwdMinimizers{starts}{$m} }){
  312753         463154  
159 148361         143295 push(@{ $START{$m} }, $pos);
  148361         241801  
160             }
161             # recalculate rev minimizer positions
162 312753         315071 for my $pos(@{ $$revMinimizers{starts}{$m} }){
  312753         443735  
163 164402         168517 my $revPos = $pos;
164             #my $revPos = $seqLength - $pos - $k + 1;
165 164402         155564 push(@{ $START{$m} }, $revPos);
  164402         274350  
166             }
167              
168 312753         319722 $START{$m} = [sort {$a <=> $b} @{$START{$m}}];
  11         26  
  312753         552483  
169             }
170 4010         20689 $$self{starts} = \%START;
171              
172             # Get a hash %KMER of minimizer=>[kmer1,kmer2,...]
173 4010         5350 my %KMER;
174 4010         13097 while(my($kmer,$minimizer) = each(%MINIMIZER)){
175 1684224         1807230 push(@{ $KMER{$minimizer} }, $kmer);
  1684224         3409040  
176             }
177              
178             # Deduplicate %KMER
179 4010         20183 while(my($m, $kmers) = each(%KMER)){
180 312753         1101624 $kmers = [sort {$a cmp $b} uniq(@$kmers)];
  3228134         3771833  
181             }
182             #die Dumper $$self{kmers}{ACGTA};
183 4010         183371 $$self{kmers} = \%KMER;
184             }
185              
186             sub minimizerWorker{
187 8020     8020 0 16231 my($self, $seqArr) = @_;
188              
189 8020         12293 my %MINIMIZER; # minimizers that this thread finds
190             my %START; # minimizer => [start1,start2,...]
191              
192             # Lengths of kmers and lmers
193 8020         15689 my ($k,$l)=($$self{k}, $$self{l});
194              
195             # How many minimizers we'll get per kmer: the difference in lengths, plus 1
196 8020         14107 my $minimizersPerKmer = $k-$l+1;
197              
198 8020         14139 for my $sequence(@$seqArr){
199             # Number of kmers in the seq is the length of the seq, minus $k, plus 1
200 8020         12116 my $numKmers = length($sequence) - $k + 1;
201              
202             # Create a small array of lmers along the way
203             # so that they don't have to be recalculated
204             # all the time between kmers.
205 8020         8967 my @lmer;
206              
207 8020         14300 for(my $kmerPos=0; $kmerPos<$numKmers; $kmerPos++){
208              
209             # The kmer is the subsequence starting at $kmerPos, length $k
210 1684224         2139229 my $kmer=substr($sequence,$kmerPos,$k);
211            
212             # Get lmers along the length of the sequence into the @lmer buffer.
213             # The start counter $lmerPos how many lmers are already in the buffer.
214 1684224         2546425 for(my $lmerPos=scalar(@lmer); $lmerPos < $minimizersPerKmer; $lmerPos++){
215             # The lmer will start at $kmerPos plus how many lmers are already
216             # in the buffer @lmer, for length $l.
217 1764432         2234117 my $lmer = substr($sequence, $kmerPos+$lmerPos, $l);
218 1764432         3419386 push(@lmer, [$lmerPos, $lmer]);
219             }
220              
221             # The minimizer is the lowest lmer lexicographically sorted.
222 1684224         2572596 my $minimizerStruct = (sort {$$a[1] cmp $$b[1]} @lmer)[0];
  44093106         46434458  
223 1684224         2876221 $MINIMIZER{$kmer} = $$minimizerStruct[1];
224              
225             # Record the start position
226 1684224         1805330 my $minimizerStart = $$minimizerStruct[0] + $kmerPos;
227             #push(@{ $START{$$minimizerStruct[1]} }, $minimizerStart);
228 1684224         2494513 $START{$$minimizerStruct[1]}{$minimizerStart}=1;
229              
230             #logmsg join("\t", $minimizerStart,$$minimizerStruct[1], $kmer);
231              
232             # Remove one lmer to reflect the step size of one
233             # for the next iteration of the loop.
234 1684224         1766950 my $removedLmer = shift(@lmer);
235 1684224         2102709 for(@lmer){
236 16844016         18851000 $$_[0]--; # lmer position decrement
237             }
238             }
239             }
240              
241             # Change index to array of unique sites
242 8020         35041 while(my($m, $starts) = each(%START)){
243             #$START{$m} = [sort {$a <=> $b} uniq(@$starts)];
244 312757         897605 $START{$m} = [sort {$a<=>$b} keys(%$starts)];
  6         25  
245             }
246              
247             # Return kmer=>minimizer, minimizer=>[start1,start2,...]
248 8020         33132 return {minimizers=>\%MINIMIZER, starts=>\%START};
249             }
250              
251             1;
252