File Coverage

lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm

Criterion	Covered	Total	%
statement	71	108	65.7
branch	5	18	27.7
condition	0	3	0.0
subroutine	20	23	86.9
pod	0	1	0.0
total	96	153	62.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::Output::GroupsMultifastaNucleotide;
2							$Bio::Roary::Output::GroupsMultifastaNucleotide::VERSION = '3.10.1';
3							# ABSTRACT: Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
4
5
6	3			3		21	use Moose;
	3					7
	3					20
7	3			3		22258	use Bio::SeqIO;
	3					9481
	3					136
8	3			3		27	use File::Path qw(make_path);
	3					12
	3					215
9	3			3		24	use File::Basename;
	3					6
	3					186
10	3			3		22	use File::Copy;
	3					6
	3					179
11	3			3		22	use File::Temp qw/ tempfile /;
	3					8
	3					219
12	3			3		20	use Bio::Roary::Exceptions;
	3					40
	3					81
13	3			3		19	use Bio::Roary::AnalyseGroups;
	3					7
	3					86
14	3			3		16	use Bio::Tools::GFF;
	3					7
	3					3659
15							with 'Bio::Roary::BedFromGFFRole';
16
17							has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
18							has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 );
19							has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
20							has 'pan_reference_groups_seen' => ( is => 'rw', isa => 'HashRef', required => 1 );
21							has 'number_of_gff_files' => ( is => 'ro', isa => 'Int', required => 1 );
22							has 'pan_reference_filename' => ( is => 'ro', isa => 'Str',default => 'pan_genome_reference.fa' );
23							has 'dont_delete_files' => ( is => 'ro', isa => 'Bool',default => 0 );
24							has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
25
26							has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
27							has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
28
29							has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
30							has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
31
32							has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
33
34							sub _build_output_filename
35							{
36	1			1		3	my ($self) = @_;
37	1					22	my ( $filename, $directories, $suffix ) = fileparse($self->gff_file);
38	1					26	return join('/',($self->output_directory, $filename.'.tmp_nuc_sequences.fa' ));
39							}
40
41							sub _build__input_seqio {
42	1			1		2	my ($self) = @_;
43	1					28	return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
44							}
45
46							sub _bed_output_filename {
47	3			3		7	my ($self) = @_;
48	3					90	return join( '.', ( $self->output_filename, 'intermediate.bed' ) );
49							}
50
51							sub populate_files {
52	1			1	0	3	my ($self) = @_;
53	1					32	while ( my $input_seq = $self->_input_seqio->next_seq() )
54							{
55	0	0				0	if ( $self->annotate_groups->_ids_to_groups->{$input_seq->display_id} )
56							{
57	0					0	my $current_group = $self->annotate_groups->_ids_to_groups->{$input_seq->display_id};
58	0					0	my $gene_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$current_group};
59
60	0	0				0	if(! defined($self->pan_reference_groups_seen->{$current_group}))
61							{
62	0					0	my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group);
63	0	0				0	$pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => ($gene_name ? $gene_name : $current_group), -seq => $input_seq->seq ) );
64	0					0	$self->pan_reference_groups_seen->{$current_group} = 1;
65							}
66
67	0					0	my $number_of_genes = @{$self->annotate_groups->_groups_to_id_names->{$current_group}};
	0					0
68							# Theres no need to align noncore files
69	0	0	0			0	next if($self->dont_delete_files == 0 && $number_of_genes < ($self->core_definition * $self->number_of_gff_files ));
70
71	0					0	my $output_seq = $self->_group_seq_io_obj($current_group,$number_of_genes);
72	0					0	$output_seq->write_seq($input_seq);
73							}
74							}
75
76	0					0	unlink($self->fasta_file);
77	0					0	1;
78							}
79
80							sub _group_file_name
81							{
82	0			0		0	my ($self,$group_name,$num_group_genes) = @_;
83	0					0	my $annotated_group_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$group_name};
84	0					0	$annotated_group_name =~ s!\W!_!gi;
85	0					0	my $filename = $annotated_group_name.'.fa';
86	0					0	my $group_file_name = join('/',($self->output_directory, $filename ));
87	0					0	return $group_file_name;
88							}
89
90
91							sub _pan_genome_reference_io_obj
92							{
93	0			0		0	my ($self) = @_;
94	0					0	return Bio::SeqIO->new( -file => ">>".$self->pan_reference_filename, -format => 'Fasta' );
95							}
96
97
98							sub _group_seq_io_obj
99							{
100	0			0		0	my ($self,$group_name,$num_group_genes) = @_;
101	0					0	my $filename = $self->_group_file_name($group_name,$num_group_genes);
102	0					0	return Bio::SeqIO->new( -file => ">>".$filename, -format => 'Fasta' );
103							}
104
105
106							sub _extracted_nucleotide_fasta_file_from_bed_filename {
107	2			2		5	my ($self) = @_;
108	2					62	return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) );
109							}
110
111							sub _create_nucleotide_fasta_file_from_gff {
112	1			1		2	my ($self) = @_;
113
114	1					28	open(my $input_fh, $self->gff_file);
115	1					9	open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
116	1					3	my $at_sequence = 0;
117	1					22	while(<$input_fh>)
118							{
119	271					308	my $line = $_;
120	271	100				359	if($line =~/^>/)
121							{
122	1					5	$at_sequence = 1;
123							}
124
125	271	100				342	if($at_sequence == 1)
126							{
127	252					238	print {$output_fh} $line;
	252					503
128							}
129							}
130	1					7	close($input_fh);
131	1					21	close($output_fh);
132							}
133
134							sub _nucleotide_fasta_file_from_gff_filename {
135	4			4		13	my ($self) = @_;
136	4					164	return join( '.', ( $self->output_filename, 'intermediate.fa' ) );
137							}
138
139							sub _extract_nucleotide_regions {
140	1			1		1	my ($self) = @_;
141
142	1					5	$self->_create_nucleotide_fasta_file_from_gff;
143	1					6	$self->_create_bed_file_from_gff;
144
145	1					168	my $cmd =
146							'bedtools getfasta -s -fi '
147							. $self->_nucleotide_fasta_file_from_gff_filename
148							. ' -bed '
149							. $self->_bed_output_filename . ' -fo '
150							. $self->_extracted_nucleotide_fasta_file_from_bed_filename
151							. ' -name > /dev/null 2>&1';
152	1					3530	system($cmd);
153	1					23	unlink( $self->_nucleotide_fasta_file_from_gff_filename );
154	1					13	unlink( $self->_bed_output_filename );
155	1					7	unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
156	1					6	return $self->_extracted_nucleotide_fasta_file_from_bed_filename;
157							}
158
159							sub _cleanup_fasta {
160	1			1		4	my ($self,$infile) = @_;
161
162	1					69	my($fh, $outfile) = tempfile();
163	1	50				598	return unless ( -e $infile );
164
165	0					0	open( my $in, '<', $infile );
166	0					0	open( my $out, '>', $outfile );
167	0					0	while ( my $line = <$in> ) {
168	0					0	chomp $line;
169	0	0				0	$line =~ s/"//g if ( $line =~ /^>/ );
170
171	0	0				0	if($line =~ /^(>[^:]+)/)
172							{
173	0					0	$line = $1;
174							}
175	0					0	print $out "$line\n";
176							}
177	0					0	close $in;
178	0					0	close $out;
179
180	0					0	move( $outfile, $infile);
181	0					0	return $infile;
182							}
183
184
185							sub _build_fasta_file {
186	1			1		3	my ($self) = @_;
187	1					3	my $fasta_filename = $self->_extract_nucleotide_regions;
188	1					10	return $self->_cleanup_fasta($fasta_filename);
189							}
190
191	3			3		27	no Moose;
	3					7
	3					21
192							__PACKAGE__->meta->make_immutable;
193
194							1;
195
196							__END__
197
198							=pod
199
200							=encoding UTF-8
201
202							=head1 NAME
203
204							Bio::Roary::Output::GroupsMultifastaNucleotide - Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
205
206							=head1 VERSION
207
208							version 3.10.1
209
210							=head1 SYNOPSIS
211
212							Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
213							use Bio::Roary::Output::GroupsMultifastas;
214
215							my $obj = Bio::Roary::Output::GroupsMultifastasNucleotide->new(
216							group_names => ['aaa','bbb'],
217							);
218							$obj->populate_files();
219
220							=head1 AUTHOR
221
222							Andrew J. Page <ap13@sanger.ac.uk>
223
224							=head1 COPYRIGHT AND LICENSE
225
226							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
227
228							This is free software, licensed under:
229
230							The GNU General Public License, Version 3, June 2007
231
232							=cut