File Coverage

blib/lib/Bio/Tools/Run/Phylo/Phylip/ProtDist.pm
Criterion Covered Total %
statement 58 160 36.2
branch 8 72 11.1
condition 0 18 0.0
subroutine 15 21 71.4
pod 6 6 100.0
total 87 277 31.4


line stmt bran cond sub pod time code
1             # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist
2             #
3             # Created by
4             #
5             # Shawn Hoon
6             #
7             # You may distribute this module under the same terms as perl itself
8              
9             # POD documentation - main docs before the code
10              
11             =head1 NAME
12              
13             Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip
14             program protdist
15              
16             =head1 SYNOPSIS
17              
18             #Create a SimpleAlign object
19             @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
20             $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
21             $inputfilename = 't/data/cysprot.fa';
22             $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object.
23              
24              
25             # Create the Distance Matrix using a default PAM matrix and id name
26             # lengths limit of 30 note to use id name length greater than the
27             # standard 10 in protdist, you will need to modify the protdist source
28             # code
29              
30             @params = ('MODEL' => 'PAM');
31             $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
32              
33             my ($matrix) = $protdist_factory->run($aln); # an array of Bio::Matrix::PhylipDist matrix
34              
35             #finding the distance between two sequences
36             my $distance = $matrix->get_entry('protein_name_1','protein_name_2');
37             my @column = $matrix->get_column('protein_name_1');
38             my @row = $martrix->get_row('protein_name_1');
39             my @diag = $matrix->get_diagonal();
40             print $matrix->print_matrix;
41              
42              
43             #Alternatively, one can create the matrix by passing in a file
44             #name containing a multiple alignment in phylip format
45             $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
46             my ($matrix) = $protdist_factory->run('/home/shawnh/prot.phy');
47              
48             # To prevent PHYLIP from truncating sequence names:
49             # Step 1. Shelf the original names:
50             my ($aln_safe, $ref_name)= # $aln_safe has serial names
51             $aln->set_displayname_safe(); # $ref_name holds original names
52             # Step 2. Run ProtDist and Neighbor:
53             ($matrix) = $protdist_factory->
54             create_distance_matrix($aln_safe); # Use $aln_safe instead of $aln
55             ($tree) = $neighbor_factory->run($matrix);
56             # Step 3. Retrieve orgininal OTU names:
57             use Bio::Tree::Tree;
58             my @nodes=$tree->get_nodes();
59             foreach my $nd (@nodes){
60             $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf;
61             }
62              
63             =head1 DESCRIPTION
64              
65             Wrapper for protdist Joseph Felsentein for creating a distance matrix
66             comparing protein sequences from a multiple alignment file or a
67             L object and returns a L object;
68              
69             VERSION Support
70              
71             This wrapper currently supports v3.5 of phylip. There is also support
72             for v3.6.
73              
74             =head1 PARAMETERS FOR PROTDIST COMPUTATION
75              
76             =head2 MODEL
77              
78             Title : MODEL
79             Description : (optional)
80              
81             This sets the model of amino acid substitution used
82             in the calculation of the distances. 3 different
83             models are supported:
84             PAM Dayhoff PAM Matrix(default)
85             KIMURA Kimura's Distance CAT
86              
87             Categories Distance Usage: @params =
88             ('model'=>'X');#where X is one of the values above
89              
90             Defaults to PAM For more information on the usage of
91             the different models, please refer to the
92             documentation
93             defaults to Equal
94             (0.25,0.25,0.25,0.25) found in the phylip package.
95              
96             Additional models in PHYLIP 3.6
97             PMB - Henikoff/Tillier PMB matrix
98             JTT - Jones/Taylor/Thornton
99              
100             =head2 MULTIPLE
101              
102             Title : MULTIPLE
103             Description: (optional)
104              
105             This allows multiple distance matrices to be generated from multiple
106             MSA.
107              
108             Usage: @params = ('MULTIPLE'=>100) where the value specifyies the
109             number of aligments given.
110              
111             =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH
112              
113             THE Categories Distance MODEL*
114              
115             =head2 GENCODE
116              
117             Title : GENCODE
118             Description : (optional)
119              
120             This option allows the user to select among various
121             nuclear and mitochondrial genetic codes.
122              
123             Acceptable Values:
124             U Universal
125             M Mitochondrial
126             V Vertebrate mitochondrial
127             F Fly mitochondrial
128             Y Yeast mitochondrial
129             Usage : @params = ('gencode'=>'X');
130             where X is one of the letters above
131             Defaults to U
132              
133             =head2 CATEGORY
134              
135             Title : CATEGORY
136             Description : (optional)
137              
138             This option sets the categorization of amino acids
139             all have groups: (Glu Gln Asp Asn), (Lys Arg His),
140             (Phe Tyr Trp) plus:
141             G George/Hunt/Barker:
142             (Cys), (Met Val Leu Ileu),
143             (Gly Ala Ser Thr Pro)
144             C Chemical:
145             (Cys Met), (Val Leu Ileu Gly Ala Ser Thr),
146             (Pro)
147             H Hall:
148             (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr),
149             (Pro)
150              
151             Usage : @params = ('category'=>'X');
152             where X is one of the letters above
153             Defaults to G
154              
155             =head2 PROBCHANGE
156              
157             Title : PROBCHANGE
158             Description : (optional)
159             This option sets the ease of changing category of amino
160             acid. (1.0 if no difficulty of changing,less if less
161             easy. Can't be negative)
162              
163             Usage : @params = ('probchange'=>X) where 0<=X<=1
164             Defaults to 0.4570
165              
166             =head2 TRANS
167              
168             Title : TRANS
169             Description : (optional)
170             This option sets transition/transversion ratio can be
171             any positive number
172              
173             Usage : @params = ('trans'=>X) where X >= 0
174             Defaults to 2
175              
176             =head2 FREQ
177              
178             Title : FREQ
179             Description : (optional)
180             This option sets the frequency of each base (A,C,G,T)
181             The sum of the frequency must sum to 1.
182             For example A,C,G,T = (0.25,0.5,0.125,0.125)
183              
184             Usage : @params = ('freq'=>('W','X','Y','Z')
185             where W + X + Y + Z = 1
186             Defaults to Equal (0.25,0.25,0.25,0.25)
187              
188              
189             =head1 FEEDBACK
190              
191             =head2 Mailing Lists
192              
193             User feedback is an integral part of the evolution of this and other
194             Bioperl modules. Send your comments and suggestions preferably to one
195             of the Bioperl mailing lists. Your participation is much appreciated.
196              
197             bioperl-l@bioperl.org - General discussion
198             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
199              
200             =head2 Support
201              
202             Please direct usage questions or support issues to the mailing list:
203              
204             I
205              
206             rather than to the module maintainer directly. Many experienced and
207             reponsive experts will be able look at the problem and quickly
208             address it. Please include a thorough description of the problem
209             with code and data examples if at all possible.
210              
211             =head2 Reporting Bugs
212              
213             Report bugs to the Bioperl bug tracking system to help us keep track
214             the bugs and their resolution. Bug reports can be submitted via the
215             web:
216              
217             http://redmine.open-bio.org/projects/bioperl/
218              
219             =head1 AUTHOR - Shawn Hoon
220              
221             Email shawnh@fugu-sg.org
222              
223             =head1 APPENDIX
224              
225             The rest of the documentation details each of the object
226             methods. Internal methods are usually preceded with a _
227              
228             =cut
229              
230             #'
231              
232            
233             package Bio::Tools::Run::Phylo::Phylip::ProtDist;
234              
235 2         139 use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
236             @PROTDIST_PARAMS @OTHER_SWITCHES
237 2     2   204196 %OK_FIELD);
  2         2  
238 2     2   6 use strict;
  2         2  
  2         30  
239 2     2   1268 use Bio::SimpleAlign;
  2         162155  
  2         76  
240 2     2   1061 use Bio::AlignIO;
  2         8198  
  2         47  
241 2     2   778 use Bio::TreeIO;
  2         26913  
  2         50  
242 2     2   795 use Bio::Tools::Run::Phylo::Phylip::Base;
  2         4  
  2         53  
243 2     2   9 use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu);
  2         1  
  2         209  
244 2     2   861 use Bio::Tools::Phylo::Phylip::ProtDist;
  2         5286  
  2         44  
245 2     2   8 use Cwd;
  2         3  
  2         210  
246              
247              
248             # inherit from Phylip::Base which has some methods for dealing with
249             # Phylip specifics
250             @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base);
251              
252             # You will need to enable the protdist program. This
253             # can be done in (at least) 3 ways:
254             #
255             # 1. define an environmental variable PHYLIPDIR:
256             # export PHYLIPDIR=/home/shawnh/PHYLIP/bin
257             #
258             # 2. include a definition of an environmental variable CLUSTALDIR in
259             # every script that will use Clustal.pm.
260             # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin';
261             #
262             # 3. You can set the path to the program through doing:
263             # my @params('program'=>'/usr/local/bin/protdist');
264             # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
265             #
266              
267              
268             BEGIN {
269 2     2   5 @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS WEIGHTS FREQ MULTIPLE);
270 2         4 @OTHER_SWITCHES = qw(QUIET);
271 2         4 foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) {
272 18         2175 $OK_FIELD{$attr}++;
273             }
274             }
275              
276             =head2 program_name
277              
278             Title : program_name
279             Usage : >program_name()
280             Function: holds the program name
281             Returns: string
282             Args : None
283              
284             =cut
285              
286             sub program_name {
287 6     6 1 24 return 'protdist';
288             }
289              
290             =head2 program_dir
291              
292             Title : program_dir
293             Usage : ->program_dir()
294             Function: returns the program directory, obtained from ENV variable.
295             Returns: string
296             Args :
297              
298             =cut
299              
300             sub program_dir {
301 3 50   3 1 12 return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR};
302             }
303              
304             sub new {
305 1     1 1 110 my ($class,@args) = @_;
306 1         14 my $self = $class->SUPER::new(@args);
307 1         41 my ($attr, $value);
308 1         5 while (@args) {
309 7         7 $attr = shift @args;
310 7         4 $value = shift @args;
311 7 50       14 next if( $attr =~ /^-/ ); # don't want named parameters
312 7 50       10 if ($attr =~/PROGRAM/i) {
313 0         0 $self->executable($value);
314 0         0 next;
315             }
316 7 100       11 if ($attr =~ /IDLENGTH/i){
317 1         3 $self->idlength($value);
318 1         2 next;
319             }
320 6         34 $self->$attr($value);
321             }
322 1         2 return $self;
323             }
324              
325             sub AUTOLOAD {
326 6     6   5 my $self = shift;
327 6         5 my $attr = $AUTOLOAD;
328 6         13 $attr =~ s/.*:://;
329 6         6 $attr = uc $attr;
330 6 50       12 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
331 6 50       12 $self->{$attr} = shift if @_;
332 6         11 return $self->{$attr};
333             }
334              
335             =head2 idlength
336              
337             Title : idlength
338             Usage : $obj->idlength ($newval)
339             Function:
340             Returns : value of idlength
341             Args : newvalue (optional)
342              
343              
344             =cut
345              
346             sub idlength{
347 1     1 1 1 my $self = shift;
348 1 50       4 if( @_ ) {
349 1         2 my $value = shift;
350 1         1 $self->{'idlength'} = $value;
351             }
352 1         2 return $self->{'idlength'};
353              
354             }
355              
356              
357             =head2 run
358              
359             Title : run
360             Usage :
361             $inputfilename = 't/data/prot.phy';
362             $matrix= $prodistfactory->run($inputfilename);
363             or
364             $seq_array_ref = \@seq_array; @seq_array is array of Seq objs
365             $aln = $protdistfactory->align($seq_array_ref);
366             $matrix = $protdistfactory->run($aln);
367              
368             Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file
369             Example :
370             Returns : L
371             Args : Name of a file containing a multiple alignment in Phylip format
372             or an SimpleAlign object
373              
374             Throws an exception if argument is not either a string (eg a
375             filename) or a Bio::SimpleAlign object. If
376             argument is string, throws exception if file corresponding to string
377             name can not be found.
378              
379             =cut
380              
381             sub run{
382              
383 0     0 1   my ($self,$input) = @_;
384 0           my ($infilename);
385              
386             # Create input file pointer
387 0           $infilename = $self->_setinput($input);
388 0 0         if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");}
  0            
389             # Create parameter string to pass to protdist program
390 0           my $param_string = $self->_setparams();
391             # run protdist
392 0           my @mat = $self->_run($infilename,$param_string);
393 0 0         return wantarray ? @mat:\@mat;
394             }
395              
396             #################################################
397              
398             =head2 _run
399              
400             Title : _run
401             Usage : Internal function, not to be called directly
402             Function: makes actual system call to protdist program
403             Example :
404             Returns : Bio::Tree object
405             Args : Name of a file containing a set of multiple alignments in Phylip format
406             and a parameter string to be passed to protdist
407              
408              
409             =cut
410              
411             sub _run {
412 0     0     my ($self,$infile,$param_string) = @_;
413 0           my $instring;
414 0           my $curpath = cwd;
415 0 0         unless( File::Spec->file_name_is_absolute($infile) ) {
416 0           $infile = $self->io->catfile($curpath,$infile);
417             }
418 0           $instring = $infile."\n$param_string";
419 0           $self->debug( "Program ".$self->executable." $instring\n");
420            
421 0           chdir($self->tempdir);
422             #open a pipe to run protdist to bypass interactive menus
423 0 0 0       if ($self->quiet() || $self->verbose() < 0) {
424 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
425 0           open(PROTDIST,"|".$self->executable .">$null");
426             }
427             else {
428 0           open(PROTDIST,"|".$self->executable);
429             }
430 0           print PROTDIST $instring;
431 0           close(PROTDIST);
432            
433             # get the results
434 0           my $outfile = $self->io->catfile($self->tempdir,$self->outfile);
435 0           chdir($curpath);
436 0 0         $self->throw("protdist did not create matrix correctly ($outfile)")
437             unless (-e $outfile);
438              
439             #Create the distance matrix here
440 0           my $parser = Bio::Tools::Phylo::Phylip::ProtDist->new(-file=>$outfile);
441 0           my @matrix;
442 0           while (my $mat = $parser->next_matrix){
443 0           push @matrix, $mat;
444             }
445              
446             # Clean up the temporary files created along the way...
447 0 0         unlink $outfile unless $self->save_tempfiles;
448            
449 0           return @matrix;
450             }
451              
452             =head2 create_distance_matrix
453              
454             Title : create_distance_matrix
455             Usage : my $file = $app->create_distance_matrix($treefile);
456             Function: This method is deprecated. Please use run method.
457             Returns : L
458             Args : Name of a file containing a multiple alignment in Phylip format
459             or an SimpleAlign object
460              
461             Throws an exception if argument is not either a string (eg a
462             filename) or a Bio::SimpleAlign object. If
463             argument is string, throws exception if file corresponding to string
464             name can not be found.
465              
466             =cut
467              
468             sub create_distance_matrix{
469 0     0 1   return shift->run(@_);
470             }
471              
472             =head2 _setinput()
473              
474             Title : _setinput
475             Usage : Internal function, not to be called directly
476             Function: Create input file for protdist program
477             Example :
478             Returns : name of file containing a multiple alignment in Phylip format
479             Args : SimpleAlign object reference or input file name
480              
481              
482             =cut
483              
484             sub _setinput {
485 0     0     my ($self, $input) = @_;
486 0           my ($alnfilename,$tfh);
487              
488             # suffix is used to distinguish alignment files from an align obkect
489             #If $input is not a reference it better be the name of a file with the sequence/
490              
491             # a phy formatted alignment file
492 0 0         unless (ref $input) {
493             # check that file exists or throw
494 0           $alnfilename= $input;
495 0 0         unless (-e $input) {return 0;}
  0            
496 0           return $alnfilename;
497             }
498 0 0         my @input = ref $input eq 'ARRAY' ? @{$input} : ($input);
  0            
499              
500             # $input may be a SimpleAlign Object
501 0           ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir);
502 0           my $alnIO = Bio::AlignIO->new(-fh => $tfh,
503             -format=>'phylip',
504             -idlength=>$self->idlength());
505 0           my $input_count = 0;
506 0           foreach my $input(@input){
507 0 0         if ($input->isa("Bio::SimpleAlign")){
508             # Open temporary file for both reading & writing of BioSeq array
509 0           $alnIO->write_aln($input);
510             }
511 0           $input_count++;
512             }
513 0           $alnIO->close();
514 0           close($tfh);
515 0           $tfh = undef;
516 0           $self->_input_nbr($input_count);
517 0           return $alnfilename;
518             }
519              
520             sub _input_nbr {
521 0     0     my ($self,$val) = @_;
522 0 0         if($val){
523 0           $self->{'_input_nbr'} = $val;
524             }
525 0           return $self->{'_input_nbr'};
526             }
527              
528             =head2 _setparams()
529              
530             Title : _setparams
531             Usage : Internal function, not to be called directly
532             Function: Create parameter inputs for protdist program
533             Example :
534             Returns : parameter string to be passed to protdist
535             Args : name of calling object
536              
537             =cut
538              
539             sub _setparams {
540 0     0     my ($attr, $value, $self);
541              
542             #do nothing for now
543 0           $self = shift;
544 0           my $param_string = "";
545 0           my $cat = 0;
546 0           my %menu = %{$Menu{$self->version}->{'PROTDIST'}};
  0            
547              
548 0           foreach my $attr ( @PROTDIST_PARAMS) {
549 0           $value = $self->$attr();
550 0 0         next unless (defined $value);
551 0 0         if ($attr =~/MODEL/i){
552 0 0         if ($value=~/CAT/i){
553 0           $cat = 1;
554             }
555 0           $param_string .= $menu{'MODEL'}{$value};
556             }
557 0 0         if($attr=~/MULTIPLE/i){
558 0           $param_string.=$menu{'MULTIPLE'}."$value\n";
559             }
560 0 0         if ($cat == 1){
561 0 0         if($attr =~ /GENCODE/i){
562 0           my $allowed = $menu{'GENCODE'}{'ALLOWED'};
563 0 0         $self->throw("Unallowed value for genetic code") unless ($value =~ /[$allowed]/);
564 0           $param_string .= $menu{'GENCODE'}{'OPTION'}."$value\n";
565             }
566 0 0         if ($attr =~/CATEGORY/i){
567 0           my $allowed = $menu{'CATEGORY'}{'ALLOWED'};
568 0 0         $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[$allowed]/);
569 0           $param_string .= $menu{'CATEGORY'}{'OPTION'}."$value\n";
570             }
571 0 0         if ($attr =~/PROBCHANGE/i){
572 0 0 0       if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){
      0        
573 0           $param_string .= $menu{'PROBCHANGE'}."$value\n";
574             }
575             else {
576 0           $self->throw("Unallowed value for probability change category");
577             }
578             }
579 0 0         if ($attr =~/TRANS/i){
580 0 0 0       if (($value=~/\d+/) && ($value >=0)){
581 0           $param_string .=$menu{'TRANS'}."$value\n";
582             }
583             }
584 0 0         if ($attr =~ /FREQ/i){
585 0           my @freq = split(",",$value);
586 0 0         if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25)
    0          
587 0           $param_string .=$menu{'FREQ'}.$freq[0]."\n";
588             }
589             elsif ($#freq == 3) {#must have 4 digits for each base
590 0           $param_string .=$menu{'FREQ'};
591 0           foreach my $f (@freq){
592 0           $param_string.="$f\n";
593             }
594             }
595             else {
596 0           $self->throw("Unallowed value for base frequencies");
597             }
598             }
599             }
600             }
601             #set multiple option is not set and there are more than one sequence
602 0 0 0       if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){
      0        
603 0           $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n";
604             }
605 0           $param_string .=$menu{'SUBMIT'};
606              
607 0           return $param_string;
608             }
609              
610              
611              
612             =head1 Bio::Tools::Run::Wrapper methods
613              
614             =cut
615              
616             =head2 no_param_checks
617              
618             Title : no_param_checks
619             Usage : $obj->no_param_checks($newval)
620             Function: Boolean flag as to whether or not we should
621             trust the sanity checks for parameter values
622             Returns : value of no_param_checks
623             Args : newvalue (optional)
624              
625              
626             =cut
627              
628             =head2 save_tempfiles
629              
630             Title : save_tempfiles
631             Usage : $obj->save_tempfiles($newval)
632             Function:
633             Returns : value of save_tempfiles
634             Args : newvalue (optional)
635              
636              
637             =cut
638              
639             =head2 outfile_name
640              
641             Title : outfile_name
642             Usage : my $outfile = $protdist->outfile_name();
643             Function: Get/Set the name of the output file for this run
644             (if you wanted to do something special)
645             Returns : string
646             Args : [optional] string to set value to
647              
648              
649             =cut
650              
651              
652             =head2 tempdir
653              
654             Title : tempdir
655             Usage : my $tmpdir = $self->tempdir();
656             Function: Retrieve a temporary directory name (which is created)
657             Returns : string which is the name of the temporary directory
658             Args : none
659              
660              
661             =cut
662              
663             =head2 cleanup
664              
665             Title : cleanup
666             Usage : $codeml->cleanup();
667             Function: Will cleanup the tempdir directory after a ProtDist run
668             Returns : none
669             Args : none
670              
671              
672             =cut
673              
674             =head2 io
675              
676             Title : io
677             Usage : $obj->io($newval)
678             Function: Gets a L object
679             Returns : L
680             Args : none
681              
682              
683             =cut
684              
685             1; # Needed to keep compiler happy