line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Roary::FilterUnknownsFromFasta; |
2
|
|
|
|
|
|
|
$Bio::Roary::FilterUnknownsFromFasta::VERSION = '3.10.2'; |
3
|
|
|
|
|
|
|
# ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
28
|
use Moose; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
30
|
|
7
|
4
|
|
|
4
|
|
24388
|
use Bio::SeqIO; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
76
|
|
8
|
4
|
|
|
4
|
|
54
|
use Cwd; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
241
|
|
9
|
4
|
|
|
4
|
|
22
|
use Bio::Roary::Exceptions; |
|
4
|
|
|
|
|
27
|
|
|
4
|
|
|
|
|
83
|
|
10
|
4
|
|
|
4
|
|
20
|
use File::Basename; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
1573
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); |
13
|
|
|
|
|
|
|
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 ); |
14
|
|
|
|
|
|
|
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
has 'filtered_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_filtered_fasta_files' ); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
has 'input_fasta_to_output_fasta' => ( is => 'ro', isa => 'HashRef', default => sub {{}} ); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub _build_filtered_fasta_files |
21
|
|
|
|
|
|
|
{ |
22
|
19
|
|
|
19
|
|
50
|
my ($self) = @_; |
23
|
|
|
|
|
|
|
|
24
|
19
|
|
|
|
|
31
|
my @output_file_names; |
25
|
19
|
|
|
|
|
30
|
for my $fasta_file (@{$self->fasta_files}) |
|
19
|
|
|
|
|
553
|
|
26
|
|
|
|
|
|
|
{ |
27
|
44
|
|
|
|
|
6528
|
my ( $filename, $directories, $suffix ) = fileparse($fasta_file); |
28
|
44
|
|
|
|
|
226
|
push(@output_file_names, $self->_filter_fasta_sequences_and_return_new_file($filename,$fasta_file )); |
29
|
|
|
|
|
|
|
} |
30
|
19
|
|
|
|
|
3691
|
return \@output_file_names; |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub _does_sequence_contain_too_many_unknowns |
34
|
|
|
|
|
|
|
{ |
35
|
183
|
|
|
183
|
|
446
|
my ($self, $sequence_obj) = @_; |
36
|
183
|
|
|
|
|
565
|
my $maximum_number_of_Xs = int(($sequence_obj->length()*$self->maximum_percentage_of_unknowns)/100); |
37
|
183
|
|
|
|
|
617
|
my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g; |
38
|
183
|
100
|
|
|
|
2794
|
if($number_of_Xs_found > $maximum_number_of_Xs) |
39
|
|
|
|
|
|
|
{ |
40
|
2
|
|
|
|
|
5
|
return 1; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
else |
43
|
|
|
|
|
|
|
{ |
44
|
181
|
|
|
|
|
484
|
return 0; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub _filter_fasta_sequences_and_return_new_file |
50
|
|
|
|
|
|
|
{ |
51
|
44
|
|
|
44
|
|
107
|
my ($self, $output_file, $input_file) = @_; |
52
|
44
|
|
|
|
|
140
|
my $output_filename = $output_file.'.tmp.filtered.fa'; |
53
|
44
|
|
|
|
|
312
|
my $out_fasta_obj = Bio::SeqIO->new( -file => ">".$output_filename, -format => 'Fasta'); |
54
|
44
|
|
|
|
|
32236
|
my $fasta_obj = Bio::SeqIO->new( -file => $input_file, -format => 'Fasta'); |
55
|
|
|
|
|
|
|
|
56
|
44
|
|
|
|
|
30712
|
$self->input_fasta_to_output_fasta->{$input_file} = $output_filename; |
57
|
|
|
|
|
|
|
|
58
|
44
|
|
|
|
|
154
|
while(my $seq = $fasta_obj->next_seq()) |
59
|
|
|
|
|
|
|
{ |
60
|
183
|
100
|
|
|
|
62527
|
if($self->_does_sequence_contain_too_many_unknowns($seq)) |
61
|
|
|
|
|
|
|
{ |
62
|
2
|
|
|
|
|
7
|
next; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
#Â strip out extra details put in by fastatranslate |
65
|
181
|
|
|
|
|
566
|
$seq->description(undef); |
66
|
181
|
|
|
|
|
2437
|
$out_fasta_obj->write_seq($seq); |
67
|
|
|
|
|
|
|
} |
68
|
44
|
|
|
|
|
10165
|
return $output_filename; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
73
|
4
|
|
|
4
|
|
30
|
no Moose; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
52
|
|
74
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
1; |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
__END__ |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=pod |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=encoding UTF-8 |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head1 NAME |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Bio::Roary::FilterUnknownsFromFasta - Take in fasta files, remove sequences with too many unknowns and return a list of the new files |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head1 VERSION |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
version 3.10.2 |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=head1 SYNOPSIS |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Take in fasta files, remove sequences with too many unknowns and return a list of the new files |
95
|
|
|
|
|
|
|
use Bio::Roary::FilterUnknownsFromFasta; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
my $obj = Bio::Roary::FilterUnknownsFromFasta->new( |
98
|
|
|
|
|
|
|
fasta_files => [], |
99
|
|
|
|
|
|
|
); |
100
|
|
|
|
|
|
|
$obj->filtered_fasta_files(); |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head1 AUTHOR |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
This is free software, licensed under: |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |