File Coverage

lib/Bio/Roary/MergeMultifastaAlignments.pm
Criterion Covered Total %
statement 55 55 100.0
branch 6 8 75.0
condition n/a
subroutine 13 13 100.0
pod 0 2 0.0
total 74 78 94.8


line stmt bran cond sub pod time code
1             package Bio::Roary::MergeMultifastaAlignments;
2             $Bio::Roary::MergeMultifastaAlignments::VERSION = '3.11.0';
3             # ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.
4              
5              
6 1     1   6 use Moose;
  1         3  
  1         4  
7 1     1   5320 use Bio::SeqIO;
  1         2  
  1         21  
8 1     1   276 use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;
  1         3  
  1         558  
9              
10             has 'multifasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
11             has 'sample_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
12             has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', required => 1 );
13             has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment.aln' );
14             has 'output_header_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment_header.embl' );
15             has '_output_seqio_obj' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio_obj' );
16             has '_gene_lengths' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__gene_lengths' );
17             has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18             has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );
19              
20             sub BUILD {
21 2     2 0 3 my ($self) = @_;
22 2         49 $self->_gene_lengths;
23             }
24              
25             sub _input_seq_io_obj {
26 4     4   8 my ( $self, $filename ) = @_;
27 4         31 return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
28             }
29              
30             sub _build__output_seqio_obj {
31 2     2   3 my ($self) = @_;
32 2         51 return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
33             }
34              
35             sub _build__gene_lengths {
36 2     2   4 my ($self) = @_;
37 2         4 my %gene_lengths;
38 2         3 for my $filename ( @{ $self->_sorted_multifasta_files } ) {
  2         46  
39 4         250 my $seq_io = $self->_input_seq_io_obj($filename);
40 4 50       5142 next unless ( defined($seq_io) );
41 4         10 while ( my $seq_record = $seq_io->next_seq ) {
42              
43             # Save all of the gene sequences to memory, massive speedup but a bit naughty.
44 8         1382 $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;
45 8 100       125 $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );
46             }
47             }
48              
49 2         295 return \%gene_lengths;
50             }
51              
52             sub _build__sorted_multifasta_files {
53 2     2   3 my ($self) = @_;
54 2         2 my @sorted_gene_files = sort @{ $self->multifasta_files };
  2         44  
55 2         49 return \@sorted_gene_files;
56             }
57              
58             sub _sequence_for_sample_from_gene_file {
59 9     9   14 my ( $self, $sample_name, $gene_file ) = @_;
60              
61             # loop over this to get the geneIDs
62 9         12 for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
  9         241  
63 14 100       314 if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
64 8         184 return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
65             }
66             }
67 1         4 return $self->_padded_string_for_gene_file($gene_file);
68             }
69              
70             sub _padded_string_for_gene_file {
71 1     1   3 my ( $self, $gene_file ) = @_;
72 1 50       21 return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
73 1         21 return '-' x ( $self->_gene_lengths->{$gene_file} );
74             }
75              
76             sub _create_merged_sequence_for_sample {
77 5     5   10 my ( $self, $sample_name ) = @_;
78 5         9 my $merged_sequence = '';
79 5         6 for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {
  5         128  
80 9         22 $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );
81             }
82 5         11 return $merged_sequence;
83             }
84              
85             sub merge_files {
86 2     2 0 4 my ($self) = @_;
87              
88 2         3 for my $sample_name ( @{ $self->sample_names } ) {
  2         43  
89 5         597 my $sequence = $self->_create_merged_sequence_for_sample($sample_name);
90 5         24 my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );
91 5         1225 $self->_output_seqio_obj->write_seq($seq_io);
92             }
93              
94             # Create a header file which gives the coordinates of each gene in the multifasta
95             Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(
96 2         379 multifasta_files => $self->_sorted_multifasta_files,
97             gene_lengths => $self->_gene_lengths,
98             output_filename => $self->output_header_filename
99             )->create_file();
100              
101 2         86 return 1;
102             }
103              
104 1     1   8 no Moose;
  1         1  
  1         5  
105             __PACKAGE__->meta->make_immutable;
106              
107             1;
108              
109             __END__
110              
111             =pod
112              
113             =encoding UTF-8
114              
115             =head1 NAME
116              
117             Bio::Roary::MergeMultifastaAlignments - Merge multifasta alignment files with equal numbers of sequences.
118              
119             =head1 VERSION
120              
121             version 3.11.0
122              
123             =head1 SYNOPSIS
124              
125             Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together. It is assumed the
126             sequences are in the correct order.
127             use Bio::Roary::MergeMultifastaAlignments;
128              
129             my $obj = Bio::Roary::MergeMultifastaAlignments->new(
130             multifasta_files => [],
131             output_filename => 'output_merged.aln'
132             );
133             $obj->merge_files;
134              
135             =head1 AUTHOR
136              
137             Andrew J. Page <ap13@sanger.ac.uk>
138              
139             =head1 COPYRIGHT AND LICENSE
140              
141             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
142              
143             This is free software, licensed under:
144              
145             The GNU General Public License, Version 3, June 2007
146              
147             =cut