File Coverage

lib/Bio/Roary/SortFasta.pm
Criterion Covered Total %
statement 63 63 100.0
branch 23 24 95.8
condition 8 9 88.8
subroutine 12 12 100.0
pod 0 2 0.0
total 106 110 96.3


line stmt bran cond sub pod time code
1             package Bio::Roary::SortFasta;
2             $Bio::Roary::SortFasta::VERSION = '3.10.2';
3             # ABSTRACT: sort a fasta file by name
4              
5              
6 4     4   241960 use Moose;
  4         1159462  
  4         31  
7 4     4   27396 use File::Copy;
  4         6312  
  4         283  
8 4     4   1105 use Bio::SeqIO;
  4         156508  
  4         2482  
9              
10             has 'input_filename' => ( is => 'ro', isa => 'Str', required => 1 );
11             has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
12             has 'make_multiple_of_three' => ( is => 'ro', isa => 'Bool', default => 0 );
13             has 'remove_nnn_from_end' => ( is => 'ro', isa => 'Bool', default => 0 );
14             has 'similarity' => ( is => 'rw', isa => 'Num', default => 1 );
15             has 'sequences_unaligned' => ( is => 'rw', isa => 'Bool', default => 0 );
16              
17             has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
18             has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' );
19              
20             sub _build_output_filename {
21 5     5   8 my ($self) = @_;
22 5         86 return $self->input_filename . ".sorted.fa";
23             }
24              
25             sub _build__input_seqio {
26 6     6   14 my ($self) = @_;
27 6         119 return Bio::SeqIO->new( -file => $self->input_filename, -format => 'Fasta' );
28             }
29              
30             sub _build__output_seqio {
31 4     4   7 my ($self) = @_;
32 4         69 return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
33             }
34              
35             sub _add_padding_to_make_sequence_length_multiple_of_three {
36 18     18   29 my ( $self, $input_seq ) = @_;
37              
38 18         31 my $seq_length = $input_seq->length();
39 18 100       195 if ( $seq_length % 3 == 1 ) {
    100          
40 6         11 $input_seq->seq( $input_seq->seq() . "NN" );
41             }
42             elsif ( $seq_length % 3 == 2 ) {
43 6         11 $input_seq->seq( $input_seq->seq() . "N" );
44             }
45              
46 18         915 return $input_seq;
47             }
48              
49             sub _remove_nnn_from_all_sequences {
50 2     2   6 my ( $self, $input_sequences ) = @_;
51              
52 2         4 for my $sequence_name ( sort keys %{$input_sequences} ) {
  2         11  
53 4         185 my $sequence = $input_sequences->{$sequence_name}->seq();
54 4         90 $sequence =~ s/NNN$//i;
55 4         10 $input_sequences->{$sequence_name}->seq($sequence);
56             }
57 2         70 return $input_sequences;
58             }
59              
60             sub sort_fasta {
61 6     6 0 1166 my ($self) = @_;
62              
63 6         11 my %input_sequences;
64              
65 6         11 my $nnn_at_end_of_all_sequences = 1;
66 6         7 my $sequence;
67 6         9 my $variation_detected = 0;
68 6         165 while ( my $input_seq = $self->_input_seqio->next_seq() ) {
69 27 100       3537 $sequence = $input_seq->seq if(!defined($sequence));
70 27 100       629 $self->_add_padding_to_make_sequence_length_multiple_of_three($input_seq) if ( $self->make_multiple_of_three );
71 27 100 100     61 $nnn_at_end_of_all_sequences = 0 if ( $nnn_at_end_of_all_sequences == 1 && !( $input_seq->seq() =~ /NNN$/i ) );
72 27         161 $input_sequences{ $input_seq->display_id } = $input_seq;
73            
74 27         278 my $factor = $self->_percentage_similarity($sequence, $input_seq->seq);
75 27 100       495 if($factor < $self->similarity)
76             {
77 2         31 $self->similarity($factor);
78             }
79             }
80              
81 5 100 100     251 $self->_remove_nnn_from_all_sequences( \%input_sequences ) if ( $self->remove_nnn_from_end && $nnn_at_end_of_all_sequences );
82              
83 5         9 my $sequence_length = 0;
84 5         12 my $sequences_unaligned = 0;
85 5         25 for my $sequence_name ( sort keys %input_sequences ) {
86 27 100       3476 $sequence_length = $input_sequences{$sequence_name}->length if($sequence_length == 0);
87 27 100       92 $self->sequences_unaligned(1) if($input_sequences{$sequence_name}->length != $sequence_length);
88 27         588 $self->_output_seqio->write_seq( $input_sequences{$sequence_name} );
89             }
90 5         615 return $self;
91             }
92              
93             sub replace_input_with_output_file {
94 1     1 0 3 my ($self) = @_;
95 1         25 move( $self->output_filename, $self->input_filename );
96 1         303 return $self;
97             }
98              
99             sub _percentage_similarity
100             {
101 31     31   270 my ($self, $string1, $string2) = @_;
102 31         32 my $num_differences = 0;
103 31         33 my $string1_length = length($string1);
104 31   66     102 for(my $i = 0; $i < $string1_length && $i< length($string2); $i++)
105             {
106 91 100       234 $num_differences++ if( substr($string1, $i, 1) ne substr($string2, $i, 1));
107             }
108 31 100       67 return 1 if($num_differences == 0);
109 8 50       10 return 0 if($string1_length == 0);
110 8         22 return (1.0 - ($num_differences/$string1_length));
111             }
112              
113 4     4   35 no Moose;
  4         8  
  4         33  
114             __PACKAGE__->meta->make_immutable;
115              
116             1;
117              
118             __END__
119              
120             =pod
121              
122             =encoding UTF-8
123              
124             =head1 NAME
125              
126             Bio::Roary::SortFasta - sort a fasta file by name
127              
128             =head1 VERSION
129              
130             version 3.10.2
131              
132             =head1 SYNOPSIS
133              
134             sort a fasta file by name
135             use Bio::Roary::SortFasta;
136              
137             my $obj = Bio::Roary::SortFasta->new(
138             input_filename => 'infasta.fa',
139             );
140             $obj->sort_fasta->replace_input_with_output_file;
141              
142             =head1 AUTHOR
143              
144             Andrew J. Page <ap13@sanger.ac.uk>
145              
146             =head1 COPYRIGHT AND LICENSE
147              
148             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
149              
150             This is free software, licensed under:
151              
152             The GNU General Public License, Version 3, June 2007
153              
154             =cut