File Coverage

lib/Bio/Roary/MergeMultifastaAlignments.pm

Criterion	Covered	Total	%
statement	55	55	100.0
branch	6	8	75.0
condition			n/a
subroutine	13	13	100.0
pod	0	2	0.0
total	74	78	94.8

line	stmt	bran	sub	pod	time	code
1						package Bio::Roary::MergeMultifastaAlignments;
2						$Bio::Roary::MergeMultifastaAlignments::VERSION = '3.11.0';
3						# ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.
4
5
6	1		1		6	use Moose;
	1				3
	1				4
7	1		1		5320	use Bio::SeqIO;
	1				2
	1				21
8	1		1		276	use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;
	1				3
	1				558
9
10						has 'multifasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
11						has 'sample_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
12						has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', required => 1 );
13						has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment.aln' );
14						has 'output_header_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment_header.embl' );
15						has '_output_seqio_obj' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio_obj' );
16						has '_gene_lengths' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__gene_lengths' );
17						has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18						has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );
19
20						sub BUILD {
21	2		2	0	3	my ($self) = @_;
22	2				49	$self->_gene_lengths;
23						}
24
25						sub _input_seq_io_obj {
26	4		4		8	my ( $self, $filename ) = @_;
27	4				31	return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
28						}
29
30						sub _build__output_seqio_obj {
31	2		2		3	my ($self) = @_;
32	2				51	return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
33						}
34
35						sub _build__gene_lengths {
36	2		2		4	my ($self) = @_;
37	2				4	my %gene_lengths;
38	2				3	for my $filename ( @{ $self->_sorted_multifasta_files } ) {
	2				46
39	4				250	my $seq_io = $self->_input_seq_io_obj($filename);
40	4	50			5142	next unless ( defined($seq_io) );
41	4				10	while ( my $seq_record = $seq_io->next_seq ) {
42
43						# Save all of the gene sequences to memory, massive speedup but a bit naughty.
44	8				1382	$self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;
45	8	100			125	$gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );
46						}
47						}
48
49	2				295	return \%gene_lengths;
50						}
51
52						sub _build__sorted_multifasta_files {
53	2		2		3	my ($self) = @_;
54	2				2	my @sorted_gene_files = sort @{ $self->multifasta_files };
	2				44
55	2				49	return \@sorted_gene_files;
56						}
57
58						sub _sequence_for_sample_from_gene_file {
59	9		9		14	my ( $self, $sample_name, $gene_file ) = @_;
60
61						# loop over this to get the geneIDs
62	9				12	for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
	9				241
63	14	100			314	if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
64	8				184	return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
65						}
66						}
67	1				4	return $self->_padded_string_for_gene_file($gene_file);
68						}
69
70						sub _padded_string_for_gene_file {
71	1		1		3	my ( $self, $gene_file ) = @_;
72	1	50			21	return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
73	1				21	return '-' x ( $self->_gene_lengths->{$gene_file} );
74						}
75
76						sub _create_merged_sequence_for_sample {
77	5		5		10	my ( $self, $sample_name ) = @_;
78	5				9	my $merged_sequence = '';
79	5				6	for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {
	5				128
80	9				22	$merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );
81						}
82	5				11	return $merged_sequence;
83						}
84
85						sub merge_files {
86	2		2	0	4	my ($self) = @_;
87
88	2				3	for my $sample_name ( @{ $self->sample_names } ) {
	2				43
89	5				597	my $sequence = $self->_create_merged_sequence_for_sample($sample_name);
90	5				24	my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );
91	5				1225	$self->_output_seqio_obj->write_seq($seq_io);
92						}
93
94						# Create a header file which gives the coordinates of each gene in the multifasta
95						Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(
96	2				379	multifasta_files => $self->_sorted_multifasta_files,
97						gene_lengths => $self->_gene_lengths,
98						output_filename => $self->output_header_filename
99						)->create_file();
100
101	2				86	return 1;
102						}
103
104	1		1		8	no Moose;
	1				1
	1				5
105						__PACKAGE__->meta->make_immutable;
106
107						1;
108
109						__END__
110
111						=pod
112
113						=encoding UTF-8
114
115						=head1 NAME
116
117						Bio::Roary::MergeMultifastaAlignments - Merge multifasta alignment files with equal numbers of sequences.
118
119						=head1 VERSION
120
121						version 3.11.0
122
123						=head1 SYNOPSIS
124
125						Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together. It is assumed the
126						sequences are in the correct order.
127						use Bio::Roary::MergeMultifastaAlignments;
128
129						my $obj = Bio::Roary::MergeMultifastaAlignments->new(
130						multifasta_files => [],
131						output_filename => 'output_merged.aln'
132						);
133						$obj->merge_files;
134
135						=head1 AUTHOR
136
137						Andrew J. Page <ap13@sanger.ac.uk>
138
139						=head1 COPYRIGHT AND LICENSE
140
141						This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
142
143						This is free software, licensed under:
144
145						The GNU General Public License, Version 3, June 2007
146
147						=cut