File Coverage

lib/Bio/Roary/GroupStatistics.pm

Criterion	Covered	Total	%
statement	138	144	95.8
branch	14	18	77.7
condition	4	6	66.6
subroutine	23	23	100.0
pod	0	3	0.0
total	179	194	92.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::GroupStatistics;
2							$Bio::Roary::GroupStatistics::VERSION = '3.10.1';
3							# ABSTRACT: Add labels to the groups
4
5
6	10			10		101230	use Moose;
	10					463086
	10					85
7	10			10		71407	use POSIX;
	10					48806
	10					68
8	10			10		23267	use Text::CSV;
	10					53049
	10					407
9	10			10		69	use File::Basename;
	10					24
	10					756
10	10			10		2810	use Bio::SeqIO;
	10					272096
	10					328
11	10			10		1600	use Bio::Roary::Exceptions;
	10					30
	10					256
12	10			10		1441	use Bio::Roary::AnalyseGroups;
	10					44
	10					387
13	10			10		2910	use Bio::Roary::AnnotateGroups;
	10					31
	10					369
14	10			10		3832	use Bio::Roary::PresenceAbsenceMatrix;
	10					36
	10					12975
15
16							has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
17							has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
							# The two attributes above are the only required constructor arguments.
							# Output paths for create_spreadsheet (CSV) and create_rtab (Rtab).
18							has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'gene_presence_absence.csv' );
19							has 'output_rtab_filename' => ( is => 'ro', isa => 'Str', default => 'gene_presence_absence.Rtab' );
							# Optional hash ref keyed by annotated group name; _row reads the keys
							# label, comment, order, accessory_label and accessory_order from each value.
20							has 'groups_to_contigs' => ( is => 'ro', isa => 'Maybe[HashRef]');
							# Private attributes, each built lazily by the _build_* sub of the same name.
21							has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
22							has '_text_csv_obj' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__text_csv_obj' );
23							has '_sorted_file_names' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_file_names' );
24							has '_groups_to_files' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_files' );
25							has '_files_to_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__files_to_groups' );
26							has '_num_files_in_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__num_files_in_groups' );
							# When true, _header appends an 'Inference' column and _row fills it in.
27							has '_verbose' => ( is => 'ro', isa => 'Bool', default => 0 );
28
29
30							sub _build__output_fh {
							# Lazy builder for _output_fh: opens output_filename for writing ('>' mode)
							# and returns the lexical file handle.
							# Throws Bio::Roary::Exceptions::CouldntWriteToFile when open() fails.
							# NOTE(review): the error text does not include $!, so the OS-level reason
							# for the failure is not reported.
31	24			24		118	my ($self) = @_;
32	24	50				1128	open( my $fh, '>', $self->output_filename )
33							or Bio::Roary::Exceptions::CouldntWriteToFile->throw(
34							error => "Couldnt write output file:" . $self->output_filename );
35	24					906	return $fh;
36							}
37
38							sub _build__text_csv_obj {
							# Lazy builder for _text_csv_obj: a Text::CSV writer in binary mode that
							# quotes every field and terminates each record with CRLF.
39	24			24		139	my ($self) = @_;
40	24					776	return Text::CSV->new( { binary => 1, always_quote => 1, eol => "\r\n" } );
41							}
42
43							sub fixed_headers {
							# Returns a reference to a freshly built array of the 14 fixed column titles.
							# _header places these before the per-sample columns, and _row fills its
							# first 14 cells in the same order, so the two lists must stay in step.
44	435			435	0	547	my ($self) = @_;
45	435					1396	my @header =
46							( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate', 'Genome Fragment','Order within Fragment', 'Accessory Fragment','Accessory Order with Fragment', 'QC','Min group size nuc', 'Max group size nuc', 'Avg group size nuc' );
47	435					1213	return \@header;
48							}
49
50							sub _sample_headers
51							{
							# Returns an array ref with one column title per entry of _sorted_file_names:
							# the file's basename with '.gff.proteome.faa' removed.
							# NOTE(review): the substitution is unanchored, so it strips the first
							# occurrence anywhere in the basename, not only a trailing suffix.
52	25			25		68	my ($self) = @_;
53	25					57	my @header;
54	25					48	for my $filename ( @{ $self->_sorted_file_names } ) {
	25					1028
55	76					3489	my $filename_cpy = basename($filename);
56	76					311	$filename_cpy =~ s!\.gff\.proteome\.faa!!;
57	76					238	push( @header, $filename_cpy );
58							}
59	25					200	return \@header;
60							}
61
62							sub _header {
							# Builds the full header row as an array ref: the fixed headers, then one
							# column per sample, then 'Inference' when _verbose is set.
63	24			24		113	my ($self) = @_;
							# Copy the list so the pushes below do not touch the array fixed_headers returned.
64	24					53	my @header = @{ $self->fixed_headers };
	24					117
65	24					88	push( @header, @{$self->_sample_headers});
	24					128
66	24	100				1148	push( @header, 'Inference' ) if ( $self->_verbose );
67	24					1052	return \@header;
68							}
69
70							sub _build__sorted_file_names {
							# Lazy builder for _sorted_file_names: a sorted copy of
							# analyse_groups_obj->fasta_files, using Perl's default string comparison.
							# This order fixes the order of the per-sample columns in every row.
71	25			25		155	my ($self) = @_;
72	25					60	my @sorted_file_names = sort( @{ $self->analyse_groups_obj->fasta_files } );
	25					960
73	25					950	return \@sorted_file_names;
74							}
75
76							sub _non_unique_name_for_group {
							# Returns the value for the 'Non-unique Gene name' column.
							# The result is '' unless $annotated_group_name matches the default group
							# prefix AND the consensus gene name for the group does not; in that case
							# the consensus gene name is returned.
							# NOTE(review): $prefix is interpolated into both patterns unescaped and
							# unanchored, so it matches anywhere in the name and any regex
							# metacharacters in the prefix are active - consider /^\Q$prefix\E/.
77	63			63		205	my ( $self, $annotated_group_name ) = @_;
78	63					158	my $duplicate_gene_name = '';
79	63					1967	my $prefix = $self->annotate_groups_obj->_group_default_prefix;
80	63	100				422	if ( $annotated_group_name =~ /$prefix/ ) {
81	51					1696	my $non_unique_name_for_group =
82							$self->annotate_groups_obj->_consensus_gene_name_for_group($annotated_group_name);
83	51	50				324	if ( !( $non_unique_name_for_group =~ /$prefix/ ) ) {
84	0					0	$duplicate_gene_name = $non_unique_name_for_group;
85							}
86							}
87	63					198	return $duplicate_gene_name;
88							}
89
90							sub _build__groups_to_files {
							# Lazy builder for _groups_to_files.
							# Returns { group => { filename => [ gene ids ] } }: for every group, its
							# gene ids are bucketed by the file analyse_groups_obj->_genes_to_file
							# maps them to.
							# NOTE(review): a gene id missing from _genes_to_file would give an undef
							# $filename used as a hash key - confirm that cannot happen.
91	22			22		66	my ($self) = @_;
92	22					54	my %groups_to_files;
93	22					48	for my $group ( @{ $self->annotate_groups_obj->_groups } ) {
	22					749
94	70					2386	my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group};
95	70					146	my %filenames;
96	70					119	for my $gene_name ( @{$genes} ) {
	70					170
97	121					4053	my $filename = $self->analyse_groups_obj->_genes_to_file->{$gene_name};
98	121					237	push( @{ $filenames{$filename} }, $gene_name );
	121					606
99							}
100	70					411	$groups_to_files{$group} = \%filenames;
101							}
102
103	22					790	return \%groups_to_files;
104							}
105
106							sub _build__files_to_groups
107							{
							# Lazy builder for _files_to_groups: inverts _groups_to_files into
							# { filename => [ groups with at least one gene from that file ] }.
							# Both loops iterate hash keys, so the order of groups within each list
							# is unspecified.
108	1			1		4	my ($self) = @_;
109	1					3	my %files_to_groups;
110
111	1					2	for my $group (keys %{$self->_groups_to_files})
	1					41
112							{
113	7					13	for my $filename (keys %{$self->_groups_to_files->{$group}})
	7					213
114							{
115	12					25	push(@{$files_to_groups{$filename}}, $group);
	12					34
116							}
117							}
118
119	1					38	return \%files_to_groups;
120							}
121
122							sub _build__num_files_in_groups
123							{
							# Lazy builder for _num_files_in_groups.
							# Returns { group => count }, where count is whatever
							# analyse_groups_obj->_count_num_files_in_group returns for the group's
							# gene ids. create_spreadsheet sorts groups by this count.
124	24			24		69	my ($self) = @_;
125	24					61	my %num_files_in_groups;
126	24					59	for my $group (@{ $self->annotate_groups_obj->_groups })
	24					1004
127							{
128	63					2221	my $num_files = $self->analyse_groups_obj->_count_num_files_in_group( $self->annotate_groups_obj->_groups_to_id_names->{$group});
129	63					253	$num_files_in_groups{$group} = $num_files;
130							}
131	24					901	return \%num_files_in_groups;
132							}
133
134							sub _row {
							# Builds one spreadsheet row for $group and returns it as an array ref.
							# Layout: 14 fixed cells (same order as fixed_headers), then one cell per
							# entry of _sorted_file_names, then an inference cell when _verbose is set.
135	63			63		174	my ( $self, $group ) = @_;
136	63					1877	my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group};
137
							# NOTE(review): this recomputes the count that _num_files_in_groups
							# already holds for the group.
138	63					1943	my $num_isolates_in_group = $self->analyse_groups_obj->_count_num_files_in_group($genes);
139	63					137	my $num_sequences_in_group = $#{$genes} + 1;
	63					158
							# Ratio rounded UP to two decimal places (ceil of the value times 100).
							# NOTE(review): divides by zero if the isolate count is ever 0 - confirm
							# _count_num_files_in_group cannot return 0 for a real group.
140	63					500	my $avg_sequences_per_isolate = ceil( ( $num_sequences_in_group / $num_isolates_in_group ) * 100 ) / 100;
141
142	63					2246	my $annotation = $self->annotate_groups_obj->consensus_product_for_id_names($genes);
143	63					2086	my $annotated_group_name = $self->annotate_groups_obj->_groups_to_consensus_gene_names->{$group};
144
145	63					246	my $duplicate_gene_name = $self->_non_unique_name_for_group($annotated_group_name);
146
							# Fragment/QC cells stay empty unless groups_to_contigs has an entry
							# for this annotated group name.
147	63					159	my $genome_number = '';
148	63					123	my $qc_comment = '';
149	63					102	my $order_within_fragement = '';
150	63					112	my $accessory_order_within_fragement = '';
151	63					117	my $accessory_genome_number = '';
152	63	50	66			2047	if(defined($self->groups_to_contigs) && defined($self->groups_to_contigs->{$annotated_group_name}))
153							{
154	0					0	$genome_number = $self->groups_to_contigs->{$annotated_group_name}->{label};
155	0					0	$qc_comment = $self->groups_to_contigs->{$annotated_group_name}->{comment};
156	0					0	$order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{order};
157
158	0					0	$accessory_genome_number = $self->groups_to_contigs->{$annotated_group_name}->{accessory_label};
159	0					0	$accessory_order_within_fragement = $self->groups_to_contigs->{$annotated_group_name}->{accessory_order};
160							}
161
							# Hash ref read for its min, max and average keys.
162	63					1899	my $group_size = $self->annotate_groups_obj->group_nucleotide_lengths->{$group};
163
164							my @row = (
165							$annotated_group_name, $duplicate_gene_name, $annotation,
166							$num_isolates_in_group, $num_sequences_in_group, $avg_sequences_per_isolate,$genome_number,$order_within_fragement,$accessory_genome_number,$accessory_order_within_fragement,$qc_comment,$group_size->{min}, $group_size->{max}, $group_size->{average}
167	63					375	);
168
							# Replace undefined fixed cells with '' before the row is written.
169	63					258	for(my $i =0; $i < @row; $i++)
170							{
171	882	100				2103	if(!defined($row[$i]))
172							{
173	135					363	$row[$i] = '';
174							}
175							}
176
							# One cell per sample file: the group's gene ids from that file joined
							# with tabs, or '' when the file contributes no gene to the group.
177	63					121	for my $filename ( @{ $self->_sorted_file_names } ) {
	63					2103
178	196					6122	my $group_to_file_genes = $self->_groups_to_files->{$group}->{$filename};
179
180	196	100	66			566	if ( defined($group_to_file_genes) && @{$group_to_file_genes} > 0 ) {
	109					401
181
182	109					243	push( @row, join( "\t", @{$group_to_file_genes} ) );
	109					420
							# NOTE(review): this 'next' is redundant - the else branch is the only
							# other code in the loop body.
183	109					347	next;
184							}
185							else {
186	87					314	push( @row, '' );
187							}
188							}
189
190							## ADD INFERENCE AND FULL ANNOTATION IF VERBOSE REQUESTED ##
191	63	100				1828	if ( $self->_verbose ){
							# NOTE(review): $full_annotation and $inference are declared but never used.
192	7					15	my ( $full_annotation, $inference );
							# Index 2 is the 'Annotation' column; verbose mode overwrites it.
193	7					162	$row[2] = $self->annotate_groups_obj->full_annotation($group);
194	7					201	push( @row, $self->annotate_groups_obj->inference($group) );
195							}
196
197	63					1215	return \@row;
198							}
199
200							sub create_rtab
201							{
							# Writes the presence/absence matrix to output_rtab_filename by handing
							# this object's lookups to Bio::Roary::PresenceAbsenceMatrix and calling
							# its create_matrix_file method.
							# Returns $self.
202	1			1	0	4	my ($self) = @_;
203	1					50	my $presence_absence_matrix_obj = Bio::Roary::PresenceAbsenceMatrix->new(
204							output_filename => $self->output_rtab_filename,
205							annotate_groups_obj => $self->annotate_groups_obj,
206							sorted_file_names => $self->_sorted_file_names,
207							groups_to_files => $self->_groups_to_files,
208							num_files_in_groups => $self->_num_files_in_groups,
209							sample_headers => $self->_sample_headers,
210							);
211	1					6	$presence_absence_matrix_obj->create_matrix_file;
212	1					29	return $self;
213							}
214
215							sub create_spreadsheet {
							# Writes the CSV: the header row, then one row per group.
							# Groups are ordered by descending _num_files_in_groups count, with ties
							# broken by ascending group name.
							# NOTE(review): the sub's value is the result of close(), which is not
							# checked; create_rtab returns $self instead.
216	24			24	0	90	my ($self) = @_;
217
218	24					1182	$self->_text_csv_obj->print( $self->_output_fh, $self->_header );
219
							# NOTE(review): the backslash-escaped pipes below look like table escaping
							# of the || operator added when the report was rendered - confirm against
							# the .pm source.
220	24	50				501	for my $group (sort {$self->_num_files_in_groups->{$b}<=>$self->_num_files_in_groups->{$a} \|\| $a cmp $b} keys %{$self->_num_files_in_groups}){
	72					2493
	24					1000
221	63					2580	$self->_text_csv_obj->print( $self->_output_fh, $self->_row($group) );
222							}
223	24					1092	close( $self->_output_fh );
224							}
225
226	10			10		104	no Moose;
	10					26
	10					62
							# With the Moose sugar unimported above, finalise the class definition.
227							__PACKAGE__->meta->make_immutable;
228
							# A module file must end by returning a true value.
229							1;
230
231							__END__
232
233							=pod
234
235							=encoding UTF-8
236
237							=head1 NAME
238
239							Bio::Roary::GroupStatistics - Add labels to the groups
240
241							=head1 VERSION
242
243							version 3.10.1
244
245							=head1 SYNOPSIS
246
247							Add labels to the groups
248							use Bio::Roary::GroupStatistics;
249
250							my $obj = Bio::Roary::GroupStatistics->new(
251							output_filename => 'group_statistics.csv',
252							annotate_groups_obj => $annotate_groups_obj,
253							analyse_groups_obj => $analyse_groups_obj
254							);
255							$obj->create_spreadsheet;
256
257							=head1 AUTHOR
258
259							Andrew J. Page <ap13@sanger.ac.uk>
260
261							=head1 COPYRIGHT AND LICENSE
262
263							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
264
265							This is free software, licensed under:
266
267							The GNU General Public License, Version 3, June 2007
268
269							=cut