File Coverage

lib/Bio/Roary/ExtractProteomeFromGFF.pm
Criterion Covered Total %
statement 62 119 52.1
branch 4 16 25.0
condition n/a
subroutine 17 24 70.8
pod n/a
total 83 159 52.2


line stmt bran cond sub pod time code
1             package Bio::Roary::ExtractProteomeFromGFF;
2             $Bio::Roary::ExtractProteomeFromGFF::VERSION = '3.11.0';
3             # ABSTRACT: Take in a GFF file and create protein sequences in FASTA format
4              
5              
6 6     6   36 use Moose;
  6         13  
  6         38  
7 6     6   34637 use Bio::SeqIO;
  6         162101  
  6         206  
8 6     6   45 use Cwd;
  6         11  
  6         380  
9 6     6   323 use Bio::Roary::Exceptions;
  6         11  
  6         111  
10 6     6   30 use File::Basename;
  6         11  
  6         330  
11 6     6   33 use File::Temp;
  6         23  
  6         394  
12 6     6   38 use File::Copy;
  6         10  
  6         256  
13 6     6   2066 use Bio::Tools::GFF;
  6         204233  
  6         6134  
14             with 'Bio::Roary::JobRunner::Role';
15             with 'Bio::Roary::BedFromGFFRole';
16              
17             has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
18             has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
19             has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );
20             has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
21             has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
22             has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
23             has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
24             has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
25              
26             sub _build_fasta_file {
27 2     2   7 my ($self) = @_;
28 2         18 $self->_extract_nucleotide_regions;
29 2         25 $self->_convert_nucleotide_to_protein;
30 0         0 $self->_cleanup_fasta;
31 0         0 $self->_cleanup_intermediate_files;
32 0         0 $self->_filter_fasta_sequences( join('/',($self->output_directory,$self->output_filename)) );
33 0         0 return join('/',($self->output_directory,$self->output_filename));
34             }
35              
36             sub _build__working_directory_name {
37 0     0   0 my ($self) = @_;
38 0         0 return $self->_working_directory->dirname();
39             }
40              
41             sub _build_output_filename {
42 0     0   0 my ($self) = @_;
43 0         0 my ( $filename, $directories, $suffix ) = fileparse( $self->gff_file, qr/\.[^.]*/ );
44 0         0 return join( '/', ( $self->_working_directory_name, $filename . '.faa' ) );
45             }
46              
47              
48              
49             sub _cleanup_intermediate_files {
50 0     0   0 my ($self) = @_;
51 0         0 unlink( $self->_unfiltered_output_filename );
52 0         0 unlink( $self->_fastatranslate_filename );
53             }
54              
55             sub _nucleotide_fasta_file_from_gff_filename {
56 8     8   35 my ($self) = @_;
57 8         454 return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.fa' ) )));
58             }
59              
60             sub _extracted_nucleotide_fasta_file_from_bed_filename {
61 4     4   11 my ($self) = @_;
62 4         99 return join('/',($self->output_directory,join( '.', ( $self->output_filename,'intermediate.extracted.fa' ) )));
63             }
64              
65             sub _unfiltered_output_filename {
66 2     2   8 my $self = shift;
67 2         56 return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'unfiltered.fa' ) )));
68             }
69              
70              
71             sub _create_nucleotide_fasta_file_from_gff {
72 2     2   6 my ($self) = @_;
73            
74 2         78 open(my $input_fh, $self->gff_file);
75 2         19 open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
76 2         22 my $at_sequence = 0;
77 2         62 while(<$input_fh>)
78             {
79 542         580 my $line = $_;
80 542 100       712 if($line =~/^>/)
81             {
82 2         2 $at_sequence = 1;
83             }
84            
85 542 100       717 if($at_sequence == 1)
86             {
87 504         449 print {$output_fh} $line;
  504         1014  
88             }
89             }
90 2         18 close($input_fh);
91 2         43 close($output_fh);
92             }
93              
94             sub _extract_nucleotide_regions {
95 2     2   9 my ($self) = @_;
96              
97 2         10 $self->_create_nucleotide_fasta_file_from_gff;
98 2         21 $self->_create_bed_file_from_gff;
99              
100 2         449 my $cmd =
101             'bedtools getfasta -s -fi '
102             . $self->_nucleotide_fasta_file_from_gff_filename
103             . ' -bed '
104             . $self->_bed_output_filename . ' -fo '
105             . $self->_extracted_nucleotide_fasta_file_from_bed_filename
106             . ' -name > /dev/null 2>&1';
107              
108 2         55 $self->logger->debug($cmd);
109 2         9358 system($cmd);
110 2         71 unlink( $self->_nucleotide_fasta_file_from_gff_filename );
111 2         29 unlink( $self->_bed_output_filename );
112 2         26 unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
113             }
114              
115             sub _cleanup_fasta {
116 0     0   0 my $self = shift;
117 0         0 my $infile = $self->_unfiltered_output_filename;
118 0         0 my $outfile = join('/',($self->output_directory,$self->output_filename));
119 0 0       0 return unless ( -e $infile );
120              
121 0         0 open( my $in, '<', $infile );
122 0         0 open( my $out, '>', $outfile );
123 0         0 while ( my $line = <$in> ) {
124 0         0 chomp $line;
125 0 0       0 $line =~ s/"//g if ( $line =~ /^>/ );
126            
127 0 0       0 if($line =~ /^(>[^:]+)/)
128             {
129 0         0 $line = $1;
130             }
131 0         0 print $out "$line\n";
132             }
133 0         0 close $in;
134 0         0 close $out;
135             }
136              
137             sub _fastatranslate_filename {
138 0     0   0 my ($self) = @_;
139 0         0 return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) )));
140             }
141              
142             sub _fastatranslate {
143 2     2   11 my ( $self, $inputfile, $outputfile ) = @_;
144              
145 2         62 my $input_fasta_file_obj = Bio::SeqIO->new( -file => $inputfile, -format => 'Fasta' );
146 0         0 my $output_protein_file_obj = Bio::SeqIO->new( -file => ">" . $outputfile, -format => 'Fasta', -alphabet => 'protein' );
147              
148 0         0 my %protein_sequence_objs;
149 0         0 while ( my $seq = $input_fasta_file_obj->next_seq ) {
150 0         0 $seq->desc(undef);
151 0         0 my $protseq = $seq->translate( -codontable_id => $self->translation_table );
152 0         0 $output_protein_file_obj->write_seq($protseq);
153             }
154 0         0 return 1;
155             }
156              
157             sub _convert_nucleotide_to_protein {
158 2     2   8 my ($self) = @_;
159 2         9 $self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename );
160 0           unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename );
161             }
162              
163             sub _does_sequence_contain_too_many_unknowns {
164 0     0     my ( $self, $sequence_obj ) = @_;
165 0           my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );
166 0           my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
167 0 0         if ( $number_of_Xs_found > $maximum_number_of_Xs ) {
168 0           return 1;
169             }
170             else {
171 0           return 0;
172             }
173             }
174              
175             sub _filter_fasta_sequences {
176 0     0     my ( $self, $filename ) = @_;
177 0           my $temp_output_file = $filename . '.tmp.filtered.fa';
178 0           my $out_fasta_obj = Bio::SeqIO->new( -file => ">" . $temp_output_file, -format => 'Fasta' );
179 0           my $fasta_obj = Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
180              
181 0           my $sequence_found = 0;
182              
183 0           while ( my $seq = $fasta_obj->next_seq() ) {
184 0 0         if ( $self->_does_sequence_contain_too_many_unknowns($seq) ) {
185 0           next;
186             }
187 0           $seq->desc(undef);
188 0           $out_fasta_obj->write_seq($seq);
189 0           $sequence_found = 1;
190             }
191              
192 0 0         if ( $sequence_found == 0 ) {
193 0           $self->logger->error( "Could not extract any protein sequences from "
194             . $self->gff_file
195             . ". Does the file contain the assembly as well as the annotation?" );
196             }
197              
198             # Replace the original file.
199 0           move( $temp_output_file, $filename );
200 0           return 1;
201             }
202              
203 6     6   62 no Moose;
  6         14  
  6         57  
204             __PACKAGE__->meta->make_immutable;
205              
206             1;
207              
208             __END__
209              
210             =pod
211              
212             =encoding UTF-8
213              
214             =head1 NAME
215              
216             Bio::Roary::ExtractProteomeFromGFF - Take in a GFF file and create protein sequences in FASTA format
217              
218             =head1 VERSION
219              
220             version 3.11.0
221              
222             =head1 SYNOPSIS
223              
224             Take in GFF files and create protein sequences in FASTA format
225             use Bio::Roary::ExtractProteomeFromGFF;
226              
227             my $obj = Bio::Roary::ExtractProteomeFromGFF->new(
228             gff_file => $fasta_file,
229             );
230             $obj->fasta_file();
231              
232             =head1 AUTHOR
233              
234             Andrew J. Page <ap13@sanger.ac.uk>
235              
236             =head1 COPYRIGHT AND LICENSE
237              
238             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
239              
240             This is free software, licensed under:
241              
242             The GNU General Public License, Version 3, June 2007
243              
244             =cut