File Coverage

lib/Bio/Roary/ExtractProteomeFromGFF.pm

Criterion	Covered	Total	%
statement	62	119	52.1
branch	4	16	25.0
condition			n/a
subroutine	17	24	70.8
pod			n/a
total	83	159	52.2

line	stmt	bran	sub	time	code
1					package Bio::Roary::ExtractProteomeFromGFF;
2					$Bio::Roary::ExtractProteomeFromGFF::VERSION = '3.11.0';
3					# ABSTRACT: Take in a GFF file and create protein sequences in FASTA format
4
5
6	6		6	36	use Moose;
	6			13
	6			38
7	6		6	34637	use Bio::SeqIO;
	6			162101
	6			206
8	6		6	45	use Cwd;
	6			11
	6			380
9	6		6	323	use Bio::Roary::Exceptions;
	6			11
	6			111
10	6		6	30	use File::Basename;
	6			11
	6			330
11	6		6	33	use File::Temp;
	6			23
	6			394
12	6		6	38	use File::Copy;
	6			10
	6			256
13	6		6	2066	use Bio::Tools::GFF;
	6			204233
	6			6134
14					with 'Bio::Roary::JobRunner::Role';
15					with 'Bio::Roary::BedFromGFFRole';
16
17					has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
18					has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
19					has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );
20					has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
21					has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
22					has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
23					has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
24					has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
25
26					sub _build_fasta_file {
27	2		2	7	my ($self) = @_;
28	2			18	$self->_extract_nucleotide_regions;
29	2			25	$self->_convert_nucleotide_to_protein;
30	0			0	$self->_cleanup_fasta;
31	0			0	$self->_cleanup_intermediate_files;
32	0			0	$self->_filter_fasta_sequences( join('/',($self->output_directory,$self->output_filename)) );
33	0			0	return join('/',($self->output_directory,$self->output_filename));
34					}
35
36					sub _build__working_directory_name {
37	0		0	0	my ($self) = @_;
38	0			0	return $self->_working_directory->dirname();
39					}
40
41					sub _build_output_filename {
42	0		0	0	my ($self) = @_;
43	0			0	my ( $filename, $directories, $suffix ) = fileparse( $self->gff_file, qr/\.[^.]*/ );
44	0			0	return join( '/', ( $self->_working_directory_name, $filename . '.faa' ) );
45					}
46
47
48
49					sub _cleanup_intermediate_files {
50	0		0	0	my ($self) = @_;
51	0			0	unlink( $self->_unfiltered_output_filename );
52	0			0	unlink( $self->_fastatranslate_filename );
53					}
54
55					sub _nucleotide_fasta_file_from_gff_filename {
56	8		8	35	my ($self) = @_;
57	8			454	return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.fa' ) )));
58					}
59
60					sub _extracted_nucleotide_fasta_file_from_bed_filename {
61	4		4	11	my ($self) = @_;
62	4			99	return join('/',($self->output_directory,join( '.', ( $self->output_filename,'intermediate.extracted.fa' ) )));
63					}
64
65					sub _unfiltered_output_filename {
66	2		2	8	my $self = shift;
67	2			56	return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'unfiltered.fa' ) )));
68					}
69
70
71					sub _create_nucleotide_fasta_file_from_gff {
72	2		2	6	my ($self) = @_;
73
74	2			78	open(my $input_fh, $self->gff_file);
75	2			19	open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
76	2			22	my $at_sequence = 0;
77	2			62	while(<$input_fh>)
78					{
79	542			580	my $line = $_;
80	542	100		712	if($line =~/^>/)
81					{
82	2			2	$at_sequence = 1;
83					}
84
85	542	100		717	if($at_sequence == 1)
86					{
87	504			449	print {$output_fh} $line;
	504			1014
88					}
89					}
90	2			18	close($input_fh);
91	2			43	close($output_fh);
92					}
93
94					sub _extract_nucleotide_regions {
95	2		2	9	my ($self) = @_;
96
97	2			10	$self->_create_nucleotide_fasta_file_from_gff;
98	2			21	$self->_create_bed_file_from_gff;
99
100	2			449	my $cmd =
101					'bedtools getfasta -s -fi '
102					. $self->_nucleotide_fasta_file_from_gff_filename
103					. ' -bed '
104					. $self->_bed_output_filename . ' -fo '
105					. $self->_extracted_nucleotide_fasta_file_from_bed_filename
106					. ' -name > /dev/null 2>&1';
107
108	2			55	$self->logger->debug($cmd);
109	2			9358	system($cmd);
110	2			71	unlink( $self->_nucleotide_fasta_file_from_gff_filename );
111	2			29	unlink( $self->_bed_output_filename );
112	2			26	unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
113					}
114
115					sub _cleanup_fasta {
116	0		0	0	my $self = shift;
117	0			0	my $infile = $self->_unfiltered_output_filename;
118	0			0	my $outfile = join('/',($self->output_directory,$self->output_filename));
119	0	0		0	return unless ( -e $infile );
120
121	0			0	open( my $in, '<', $infile );
122	0			0	open( my $out, '>', $outfile );
123	0			0	while ( my $line = <$in> ) {
124	0			0	chomp $line;
125	0	0		0	$line =~ s/"//g if ( $line =~ /^>/ );
126
127	0	0		0	if($line =~ /^(>[^:]+)/)
128					{
129	0			0	$line = $1;
130					}
131	0			0	print $out "$line\n";
132					}
133	0			0	close $in;
134	0			0	close $out;
135					}
136
137					sub _fastatranslate_filename {
138	0		0	0	my ($self) = @_;
139	0			0	return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) )));
140					}
141
142					sub _fastatranslate {
143	2		2	11	my ( $self, $inputfile, $outputfile ) = @_;
144
145	2			62	my $input_fasta_file_obj = Bio::SeqIO->new( -file => $inputfile, -format => 'Fasta' );
146	0			0	my $output_protein_file_obj = Bio::SeqIO->new( -file => ">" . $outputfile, -format => 'Fasta', -alphabet => 'protein' );
147
148	0			0	my %protein_sequence_objs;
149	0			0	while ( my $seq = $input_fasta_file_obj->next_seq ) {
150	0			0	$seq->desc(undef);
151	0			0	my $protseq = $seq->translate( -codontable_id => $self->translation_table );
152	0			0	$output_protein_file_obj->write_seq($protseq);
153					}
154	0			0	return 1;
155					}
156
157					sub _convert_nucleotide_to_protein {
158	2		2	8	my ($self) = @_;
159	2			9	$self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename );
160	0				unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename );
161					}
162
163					sub _does_sequence_contain_too_many_unknowns {
164	0		0		my ( $self, $sequence_obj ) = @_;
165	0				my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );
166	0				my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
167	0	0			if ( $number_of_Xs_found > $maximum_number_of_Xs ) {
168	0				return 1;
169					}
170					else {
171	0				return 0;
172					}
173					}
174
175					sub _filter_fasta_sequences {
176	0		0		my ( $self, $filename ) = @_;
177	0				my $temp_output_file = $filename . '.tmp.filtered.fa';
178	0				my $out_fasta_obj = Bio::SeqIO->new( -file => ">" . $temp_output_file, -format => 'Fasta' );
179	0				my $fasta_obj = Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
180
181	0				my $sequence_found = 0;
182
183	0				while ( my $seq = $fasta_obj->next_seq() ) {
184	0	0			if ( $self->_does_sequence_contain_too_many_unknowns($seq) ) {
185	0				next;
186					}
187	0				$seq->desc(undef);
188	0				$out_fasta_obj->write_seq($seq);
189	0				$sequence_found = 1;
190					}
191
192	0	0			if ( $sequence_found == 0 ) {
193	0				$self->logger->error( "Could not extract any protein sequences from "
194					. $self->gff_file
195					. ". Does the file contain the assembly as well as the annotation?" );
196					}
197
198					# Replace the original file.
199	0				move( $temp_output_file, $filename );
200	0				return 1;
201					}
202
203	6		6	62	no Moose;
	6			14
	6			57
204					__PACKAGE__->meta->make_immutable;
205
206					1;
207
208					__END__
209
210					=pod
211
212					=encoding UTF-8
213
214					=head1 NAME
215
216					Bio::Roary::ExtractProteomeFromGFF - Take in a GFF file and create protein sequences in FASTA format
217
218					=head1 VERSION
219
220					version 3.11.0
221
222					=head1 SYNOPSIS
223
224					Take in GFF files and create protein sequences in FASTA format
225					use Bio::Roary::ExtractProteomeFromGFF;
226
227					my $obj = Bio::Roary::ExtractProteomeFromGFF->new(
228					gff_file => $fasta_file,
229					);
230					$obj->fasta_file();
231
232					=head1 AUTHOR
233
234					Andrew J. Page <ap13@sanger.ac.uk>
235
236					=head1 COPYRIGHT AND LICENSE
237
238					This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
239
240					This is free software, licensed under:
241
242					The GNU General Public License, Version 3, June 2007
243
244					=cut