File Coverage

lib/Mashtree.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             package Mashtree;
3 5     5   288302 use strict;
  5         37  
  5         120  
4 5     5   19 use warnings;
  5         6  
  5         150  
5 5     5   18 use Exporter qw(import);
  5         8  
  5         115  
6 5     5   20 use File::Basename qw/fileparse basename dirname/;
  5         7  
  5         223  
7 5     5   1445 use Data::Dumper;
  5         19816  
  5         254  
8 5     5   31 use List::Util qw/shuffle/;
  5         7  
  5         237  
9              
10 5     5   1602 use threads;
  0            
  0            
11             use threads::shared;
12              
13             use lib dirname($INC{"Mashtree.pm"});
14             use Bio::Matrix::IO;
15             use Bio::TreeIO;
16              
17             our @EXPORT_OK = qw(
18             logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist
19             @fastqExt @fastaExt @bamExt @vcfExt @richseqExt @mshExt
20             $MASHTREE_VERSION
21             );
22              
23             local $0=basename $0;
24              
25             ######
26             # CONSTANTS
27              
28             our $VERSION = "0.29";
29             our $MASHTREE_VERSION=$VERSION;
30             our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
31             our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
32             our @bamExt=qw(.sorted.bam .bam);
33             our @vcfExt=qw(.vcf.gz .vcf);
34             our @mshExt=qw(.msh);
35             # Richseq extensions were obtained mostly from bioperl under
36             # the genbank, embl, and swissprot entries, under
37             # the source for Bio::SeqIO
38             our @richseqExt=qw(.gb .gbank .genbank .gbk .gbs .gbf .embl .ebl .emb .dat .swiss .sp);
39              
40             # Helpful things
41             my $fhStick :shared; # A thread can only open a fastq file if it has the talking stick.
42              
43             #################################################
44             ### COMMON SUBS/TOOLS (not object subroutines) ##
45             #################################################
46             # Redefine how scripts die
47             $SIG{'__DIE__'} = sub {
48             local $0=basename($0);
49             my $e = $_[0] || "";
50             my $callerSub=(caller(1))[3] || (caller(0))[3] || "UnknownSub";
51              
52             $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/;
53             die("$0: $callerSub: $e");
54             };
55             # Centralized logmsg
56             #sub logmsg {print STDERR "$0: ".(caller(1))[3].": @_\n";}
57             sub logmsg {
58             local $0=basename $0;
59             my $parentSub=(caller(1))[3] || (caller(0))[3];
60             $parentSub=~s/^main:://;
61              
62             # Find the thread ID and stringify it
63             my $tid=threads->tid;
64             $tid=($tid) ? "(TID$tid)" : "";
65              
66             my $msg="$0: $parentSub$tid: @_\n";
67              
68             print STDERR $msg;
69             }
70              
71             # Opens a fastq file in a thread-safe way.
72             sub openFastq{
73             my($fastq,$settings)=@_;
74              
75             my $fh;
76              
77             lock($fhStick);
78              
79             my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq);
80             my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
81             if($ext =~/\.gz$/){
82             open($fh,"zcat $fastq | ") or die "ERROR: could not open $fastq for reading!: $!";
83             } else {
84             open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
85             }
86             return $fh;
87             }
88              
89             # Removes fastq extension, removes directory name,
90             # truncates to a length, and adds right-padding.
91             sub _truncateFilename{
92             my($file,$settings)=@_;
93             $$settings{truncLength}||=255;
94             # strip off msh and get the basename
95             my $name=basename($file,'.msh');
96             # One more extension
97             $name=basename($name,@fastqExt,@richseqExt,@fastaExt);
98             # Truncate
99             $name=substr($name,0,$$settings{truncLength});
100             # Add in padding
101             $name.=" " x ($$settings{truncLength}-length($name));
102             return $name;
103             }
104              
105              
106             # 1. Read the mash distances
107             # 2. Create a phylip file
108             sub distancesToPhylip{
109             my($distances,$outdir,$settings)=@_;
110              
111             my $phylip = "$outdir/distances.phylip";
112             # NOTE: need to regenerate the combined distances each time
113             # because I need to allow variation in the input samples.
114             #return $phylip if(-e $phylip);
115              
116             # The way phylip is, I need to know the genome names
117             # a priori
118             my %name;
119             open(MASHDIST,"<",$distances) or die "ERROR: could not open $distances for reading: $!";
120             while(<MASHDIST>){
121             next if(/^#/);
122             my($name)=split(/\t/,$_);
123             $name=~s/^\s+|\s+$//g; # whitespace trim before right-padding is added
124             $name{_truncateFilename($name,$settings)}=1;
125             }
126             close MASHDIST;
127             my @name=sortNames([keys(%name)],$settings);
128             # Index the array
129             my $columnIndex=0;
130             for(@name){
131             $name{$_}=$columnIndex++;
132             }
133              
134             # Load up the matrix object
135             logmsg "Reading the distances file at $distances";
136             open(MASHDIST,"<",$distances) or die "ERROR: could not open $distances for reading: $!";
137             my $query="UNKNOWN"; # Default ID in case anything goes wrong
138             my @m;
139             while(<MASHDIST>){
140             chomp;
141             if(/^#query\s+(.+)/){
142             $query=_truncateFilename($1,$settings);
143             } else {
144             my ($reference,$distance)=split(/\t/,$_);
145             $reference=_truncateFilename($reference,$settings);
146             $distance=sprintf("%0.8f",$distance);
147             $m[$name{$query}][$name{$reference}]=$distance;
148             $m[$name{$reference}][$name{$query}]=$distance;
149             }
150             }
151             close MASHDIST;
152             #my $matrixObj=Bio::Matrix::Generic->new(-rownames=>\@name,-colnames=>\@name,-values=>\@m);
153              
154             # taking this method from write_matrix in http://cpansearch.perl.org/src/CJFIELDS/BioPerl-1.6.924/Bio/Matrix/IO/phylip.pm
155             my $str;
156             $str.=(" " x 4) . scalar(@name)."\n";
157             for(my $i=0;$i<@name;$i++){
158             $str.=$name[$i];
159             my $count=0;
160             for(my $j=0;$j<@name;$j++){
161             if($count < $#name){
162             $str.=$m[$i][$j]. " ";
163             } else {
164             $str.=$m[$i][$j];
165             }
166             $count++;
167             }
168             $str.="\n";
169             }
170             open(PHYLIP,">",$phylip) or die "ERROR: could not write to $phylip: $!";
171             print PHYLIP $str;
172             close PHYLIP;
173             return $phylip;
174             }
175              
176             sub sortNames{
177             my($name,$settings)=@_;
178             my @sorted;
179             if($$settings{'sort-order'} =~ /^(abc|alphabet)$/){
180             @sorted=sort { $a cmp $b } @$name;
181             } elsif($$settings{'sort-order'}=~/^rand(om)?/){
182             @sorted=shuffle(@$name);
183             } elsif($$settings{'sort-order'} eq 'input-order'){
184             @sorted=@$name;
185             } else {
186             die "ERROR: I don't understand sort-order $$settings{'sort-order'}";
187             }
188             return @sorted;
189             }
190              
191             # Create tree file with Quicktree but bioperl
192             # as a backup.
193             sub createTreeFromPhylip{
194             my($phylip,$outdir,$settings)=@_;
195              
196             my $treeObj;
197              
198             my $quicktreePath=`which quicktree 2>/dev/null`;
199             # bioperl if there was an error with which quicktree
200             if($?){
201             logmsg "Creating tree with BioPerl";
202             my $dfactory = Bio::Tree::DistanceFactory->new(-method=>"NJ");
203             my $matrix = Bio::Matrix::IO->new(-format=>"phylip", -file=>$phylip)->next_matrix;
204             $treeObj = $dfactory->make_tree($matrix);
205             open(TREE,">","$outdir/tree.dnd") or die "ERROR: could not open $outdir/tree.dnd: $!";
206             print TREE $treeObj->as_text("newick");
207             print TREE "\n";
208             close TREE;
209             }
210             # quicktree
211             else {
212             logmsg "Creating tree with QuickTree";
213             system("quicktree -in m $phylip > $outdir/tree.dnd.tmp");
214             die "ERROR with quicktree" if $?;
215             $treeObj=Bio::TreeIO->new(-file=>"$outdir/tree.dnd.tmp")->next_tree;
216             my $outtree=Bio::TreeIO->new(-file=>">$outdir/tree.dnd", -format=>"newick");
217             $outtree->write_tree($treeObj);
218              
219             unlink("$outdir/tree.dnd.tmp");
220             }
221              
222             return $treeObj;
223              
224             }
225              
226             # Lee's implementation of a tree distance. The objective
227             # is to return zero if two trees are the same.
228             sub treeDist{
229             my($treeObj1,$treeObj2)=@_;
230              
231             # If the tree objects are really strings, then make Bio::Tree::Tree objects
232             if(!ref($treeObj1)){
233             if(-e $treeObj1){ # if this is a file, get the contents
234             $treeObj1=`cat $treeObj1`;
235             }
236             $treeObj1=Bio::TreeIO->new(-string=>$treeObj1)->next_tree;
237             }
238             if(!ref($treeObj2)){
239             if(-e $treeObj2){ # if this is a file, get the contents
240             $treeObj2=`cat $treeObj2`;
241             }
242             $treeObj2=Bio::TreeIO->new(-string=>$treeObj2)->next_tree;
243             }
244             for($treeObj1,$treeObj2){
245             #$_->force_binary;
246             }
247            
248             # Get all leaf nodes so that they can be compared
249             my @nodes1=sort {$a->id cmp $b->id} grep{$_->is_Leaf} $treeObj1->get_nodes;
250             my @nodes2=sort {$a->id cmp $b->id} grep{$_->is_Leaf} $treeObj2->get_nodes;
251             my $numNodes=@nodes1;
252              
253             # Test 1: are these the same nodes?
254             my $nodeString1=join(" ",map{$_->id} @nodes1);
255             my $nodeString2=join(" ",map{$_->id} @nodes2);
256             if($nodeString1 ne $nodeString2){
257             # TODO print out the differing nodes?
258             logmsg "ERROR: nodes are not the same in both trees!\n $nodeString1\n $nodeString2";
259             return ~0; #largest int
260             }
261              
262             # Find the number of branches it takes to get to each node.
263             # Turn it into a Euclidean distance
264             my $euclideanDistance=0;
265             for(my $i=0;$i<$numNodes;$i++){
266             for(my $j=$i+1;$j<$numNodes;$j++){
267             my ($numBranches1,$numBranches2);
268              
269             my $lca1=$treeObj1->get_lca($nodes1[$i],$nodes1[$j]);
270             my $lca2=$treeObj2->get_lca($nodes2[$i],$nodes2[$j]);
271            
272             # Distance in tree1
273             my $distance1=0;
274             my @ancestory1=reverse $treeObj1->get_lineage_nodes($nodes1[$i]);
275             my @ancestory2=reverse $treeObj1->get_lineage_nodes($nodes1[$j]);
276             for my $currentNode(@ancestory1){
277             $distance1++;
278             last if($currentNode eq $lca1);
279             }
280             for my $currentNode(@ancestory2){
281             $distance1++;
282             last if($currentNode eq $lca1);
283             }
284            
285             # Distance in tree2
286             my $distance2=0;
287             my @ancestory3=reverse $treeObj2->get_lineage_nodes($nodes2[$i]);
288             my @ancestory4=reverse $treeObj2->get_lineage_nodes($nodes2[$j]);
289             for my $currentNode(@ancestory3){
290             $distance2++;
291             last if($currentNode eq $lca2);
292             }
293             for my $currentNode(@ancestory4){
294             $distance2++;
295             last if($currentNode eq $lca2);
296             }
297              
298             if($distance1 != $distance2){
299             logmsg "These two nodes do not have the same distance between trees: ".$nodes1[$i]->id." and ".$nodes1[$j]->id;
300             }
301              
302             # Add up the Euclidean distance
303             $euclideanDistance+=($distance1 - $distance2) ** 2;
304             }
305             }
306             $euclideanDistance=sqrt($euclideanDistance);
307             return $euclideanDistance;
308             }
309              
310             1;
311              
312              
313             __END__
314              
315             =head1 NAME
316              
317             Mashtree - Create a tree using Mash distances.
318              
319             =cut
320