line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Roary::SortFasta; |
2
|
|
|
|
|
|
|
$Bio::Roary::SortFasta::VERSION = '3.10.1'; |
3
|
|
|
|
|
|
|
# ABSTRACT: sort a fasta file by name |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
234749
|
use Moose; |
|
4
|
|
|
|
|
1294776
|
|
|
4
|
|
|
|
|
31
|
|
7
|
4
|
|
|
4
|
|
31516
|
use File::Copy; |
|
4
|
|
|
|
|
5402
|
|
|
4
|
|
|
|
|
263
|
|
8
|
4
|
|
|
4
|
|
1056
|
use Bio::SeqIO; |
|
4
|
|
|
|
|
155128
|
|
|
4
|
|
|
|
|
3042
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# --- Public attributes ------------------------------------------------------

# Path of the FASTA file to read (mandatory).
has 'input_filename' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Path of the sorted FASTA file; built lazily from input_filename.
has 'output_filename' => (
    is      => 'ro',
    isa     => 'Str',
    lazy    => 1,
    builder => '_build_output_filename',
);

# When true, sort_fasta pads each sequence with N's to a multiple of three.
has 'make_multiple_of_three' => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# When true, sort_fasta strips a trailing NNN, but only if every sequence
# ends with one.
has 'remove_nnn_from_end' => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# Starts at 1; sort_fasta lowers it to the smallest similarity it computes.
has 'similarity' => (
    is      => 'rw',
    isa     => 'Num',
    default => 1,
);

# Set to 1 by sort_fasta when the sequences do not all have the same length.
has 'sequences_unaligned' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# --- Private attributes -----------------------------------------------------

# Lazily built Bio::SeqIO reader over input_filename.
has '_input_seqio' => (
    is      => 'ro',
    isa     => 'Bio::SeqIO',
    lazy    => 1,
    builder => '_build__input_seqio',
);

# Lazily built Bio::SeqIO writer for output_filename.
has '_output_seqio' => (
    is      => 'ro',
    isa     => 'Bio::SeqIO',
    lazy    => 1,
    builder => '_build__output_seqio',
);
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Builder for the lazy 'output_filename' attribute.
# Returns the input filename with the suffix ".sorted.fa" appended.
sub _build_output_filename {
    my $self = shift;

    my $sorted_name = $self->input_filename . ".sorted.fa";
    return $sorted_name;
}
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Builder for the lazy '_input_seqio' attribute.
# Returns a Bio::SeqIO object created for the input file in 'Fasta' format.
sub _build__input_seqio {
    my $self = shift;

    my @reader_args = (
        -file   => $self->input_filename,
        -format => 'Fasta',
    );
    return Bio::SeqIO->new(@reader_args);
}
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Builder for the lazy '_output_seqio' attribute.
# Returns a Bio::SeqIO object created with the output filename prefixed by
# ">" and the 'Fasta' format.
sub _build__output_seqio {
    my $self = shift;

    my $write_target = ">" . $self->output_filename;
    my @writer_args  = (
        -file   => $write_target,
        -format => 'Fasta',
    );
    return Bio::SeqIO->new(@writer_args);
}
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Pads a sequence object in place with trailing N's so that its length
# becomes a multiple of three.
#
#   $input_seq - object providing length() and a seq() getter/setter
#
# Returns the same object: "NN" is appended when length % 3 is 1, "N" when it
# is 2, and nothing when the length is already a multiple of three.
sub _add_padding_to_make_sequence_length_multiple_of_three {
    my ( $self, $input_seq ) = @_;

    my $remainder = $input_seq->length() % 3;
    return $input_seq if $remainder == 0;

    my $padding = $remainder == 1 ? "NN" : "N";
    $input_seq->seq( $input_seq->seq() . $padding );

    return $input_seq;
}
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Strips one trailing "NNN" (matched case-insensitively) from every sequence
# object in the given hash.
#
#   $input_sequences - hash reference mapping names to objects that provide a
#                      seq() getter/setter
#
# The objects are modified in place; the same hash reference is returned.
sub _remove_nnn_from_all_sequences {
    my ( $self, $input_sequences ) = @_;

    foreach my $name ( sort keys %{$input_sequences} ) {
        my $seq_obj = $input_sequences->{$name};
        ( my $trimmed = $seq_obj->seq() ) =~ s/NNN$//i;
        $seq_obj->seq($trimmed);
    }

    return $input_sequences;
}
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Reads every record from the input reader, optionally adjusts the sequences,
# and writes the records to the output writer ordered by display id.
#
# While reading:
#   * the first sequence seen is kept as the reference for similarity checks;
#   * each sequence is padded to a multiple of three when
#     make_multiple_of_three is set;
#   * 'similarity' is lowered to the smallest value returned by
#     _percentage_similarity (reference vs. each sequence).
# Records are stored by display id, so a later record with the same id
# replaces an earlier one.
#
# After reading, a trailing NNN is removed from all sequences when
# remove_nnn_from_end is set and every sequence ended with NNN.
#
# While writing, 'sequences_unaligned' is set to 1 if any sequence length
# differs from the length of the first sequence in sorted order.
#
# Returns $self so that calls can be chained.
sub sort_fasta {
    my ($self) = @_;

    my %input_sequences;
    my $nnn_at_end_of_all_sequences = 1;
    my $sequence;

    while ( my $input_seq = $self->_input_seqio->next_seq() ) {
        # Reference sequence is captured before any padding is applied.
        $sequence = $input_seq->seq if ( !defined($sequence) );
        $self->_add_padding_to_make_sequence_length_multiple_of_three($input_seq) if ( $self->make_multiple_of_three );
        $nnn_at_end_of_all_sequences = 0 if ( $nnn_at_end_of_all_sequences == 1 && !( $input_seq->seq() =~ /NNN$/i ) );
        $input_sequences{ $input_seq->display_id } = $input_seq;

        my $factor = $self->_percentage_similarity( $sequence, $input_seq->seq );
        $self->similarity($factor) if ( $factor < $self->similarity );
    }

    $self->_remove_nnn_from_all_sequences( \%input_sequences ) if ( $self->remove_nnn_from_end && $nnn_at_end_of_all_sequences );

    # undef (not 0) marks "no reference length yet": a zero-length first
    # record is a legitimate reference and must not be overwritten by the
    # length of the next record.
    my $sequence_length;
    for my $sequence_name ( sort keys %input_sequences ) {
        my $current_seq    = $input_sequences{$sequence_name};
        my $current_length = $current_seq->length;

        $sequence_length = $current_length if ( !defined($sequence_length) );
        $self->sequences_unaligned(1) if ( $current_length != $sequence_length );
        $self->_output_seqio->write_seq($current_seq);
    }

    return $self;
}
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Moves the sorted output file over the original input file.
#
# A failed move is reported with a warning that includes the system error,
# instead of being silently ignored. It is deliberately not fatal, so callers
# that previously continued after a failed move keep working.
#
# Returns $self so that calls can be chained.
sub replace_input_with_output_file {
    my ($self) = @_;

    my $source      = $self->output_filename;
    my $destination = $self->input_filename;

    # File::Copy::move returns 1 on success and 0 on failure, setting $!.
    move( $source, $destination )
      or warn "Could not move '$source' to '$destination': $!\n";

    return $self;
}
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Computes a similarity score between two strings by comparing them character
# by character over their common prefix length (the shorter of the two).
#
#   $string1 - reference string; its length is the denominator of the score
#   $string2 - string compared against the reference
#
# Returns 1 when no compared position differs (including when nothing is
# compared), otherwise 1 - (differing positions / length of $string1).
# Characters beyond the shorter string are not counted as differences.
sub _percentage_similarity {
    my ( $self, $string1, $string2 ) = @_;

    my $reference_length = length($string1);
    my $other_length     = length($string2);
    my $overlap          = $reference_length < $other_length ? $reference_length : $other_length;

    my $mismatches = 0;
    foreach my $position ( 0 .. $overlap - 1 ) {
        next if substr( $string1, $position, 1 ) eq substr( $string2, $position, 1 );
        $mismatches++;
    }

    # A mismatch implies at least one compared position, so the reference
    # length is non-zero whenever the division below is reached.
    return 1 unless $mismatches;
    return 1.0 - ( $mismatches / $reference_length );
}
112
|
|
|
|
|
|
|
|
113
|
4
|
|
|
4
|
|
39
|
no Moose; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
34
|
|
114
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
1; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
__END__ |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=pod |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=encoding UTF-8 |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head1 NAME |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
Bio::Roary::SortFasta - sort a fasta file by name |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 VERSION |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
version 3.10.1 |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 SYNOPSIS |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sort a fasta file by name |
135
|
|
|
|
|
|
|
use Bio::Roary::SortFasta; |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
my $obj = Bio::Roary::SortFasta->new( |
138
|
|
|
|
|
|
|
input_filename => 'infasta.fa', |
139
|
|
|
|
|
|
|
); |
140
|
|
|
|
|
|
|
$obj->sort_fasta->replace_input_with_output_file; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head1 AUTHOR |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
This is free software, licensed under: |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=cut |