File Coverage

lib/Bio/Roary/ReformatInputGFFs.pm

Criterion	Covered	Total	%
statement	120	129	93.0
branch	24	38	63.1
condition	3	3	100.0
subroutine	16	16	100.0
pod	0	1	0.0
total	163	187	87.1

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::ReformatInputGFFs;
2							$Bio::Roary::ReformatInputGFFs::VERSION = '3.10.2';
3							# ABSTRACT: Take in gff files and add suffix where a gene id is seen twice
4
5
6	2			2		85468	use Moose;
	2					383896
	2					17
7	2			2		17924	use Bio::Roary::Exceptions;
	2					7
	2					64
8	2			2		16	use Cwd;
	2					5
	2					169
9	2			2		289	use File::Copy;
	2					1749
	2					165
10	2			2		525	use Log::Log4perl qw(:easy);
	2					31052
	2					23
11	2			2		2321	use Bio::Tools::GFF;
	2					76200
	2					88
12	2			2		17	use File::Path qw(make_path);
	2					6
	2					122
13	2			2		16	use File::Basename;
	2					5
	2					168
14	2			2		941	use Digest::MD5::File qw(file_md5_hex);
	2					59127
	2					17
15
16							has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
17							has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
18							has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS\|ncRNA\|tRNA\|tmRNA\|rRNA)' );
19							has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' );
20							has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 );
21
22							has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
23
24							sub _build_logger {
25	3			3		7	my ($self) = @_;
26	3					23	Log::Log4perl->easy_init( $ERROR );
27	3					6824	my $logger = get_logger();
28	3					211	return $logger;
29							}
30
31							sub fix_duplicate_gene_ids {
32	4			4	0	684	my ($self) = @_;
33
34	4					9	my %gene_ids_seen_before;
35
36							my %file_md5s;
37
38	4					10	for my $file ( @{ $self->gff_files } ) {
	4					136
39	9					50	my $digest = file_md5_hex($file);
40
41	9	100				1862	if(defined($file_md5s{$digest}))
42							{
43							$self->logger->warn(
44	1					24	"Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file
45							);
46	1					8	next;
47							}
48							else
49							{
50	8					31	$file_md5s{$digest} = $file;
51							}
52
53	8					13	my $ids_seen = 0;
54	8					33	my $ids_from_file = $self->_get_ids_for_gff_file($file);
55
56	8	50				718	if ( @{$ids_from_file} < 1 ) {
	8					33
57	0					0	$self->logger->error(
58							"Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
59							);
60							}
61							else {
62	8					16	for my $gene_id ( @{$ids_from_file} ) {
	8					20
63	33	100				64	if ( $gene_ids_seen_before{$gene_id} ) {
64	2					56	$self->logger->error(
65							"Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file "
66							);
67	2					825	my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest);
68	2	50				6	push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
	2					55
69	2					3	$ids_seen = 1;
70	2					4	last;
71							}
72	31					69	$gene_ids_seen_before{$gene_id}++;
73							}
74
75							# We know its a valid GFF file since we could open it and extract IDs.
76							# We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but
77							# if they didnt, then we have to double check and repair if necessary.
78	8	100				27	if ( $ids_seen == 0 ) {
79
80
81	6					10	push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) );
	6					203
82							}
83							}
84							}
85	4					36	return 1;
86							}
87
88							sub _fix_gff_file_extension
89							{
90	6			6		21	my ( $self, $input_file ) = @_;
91
92	6					350	my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
93	6	50				49	return $input_file if($suffix eq '.gff');
94
95
96	0	0				0	make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
97	0					0	my $output_file = $self->output_directory . '/' . $filename . '.gff';
98	0	0				0	copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file");
99	0					0	return $output_file;
100							}
101
102
103							sub _add_suffix_to_gene_ids_and_return_new_file {
104	3			3		10	my ( $self, $input_file, $digest ) = @_;
105	3					91	my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
106	3	50				84	make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
107	3					81	my $output_file = $self->output_directory . '/' . $filename . '.gff';
108
109	3					80	open( my $input_gff_fh, $input_file );
110	3					143	open( my $out_gff_fh, '>', $output_file );
111
112							# There is a chance that there can be a collision here, but its remote.
113	3					11	my $random_locus_tag = "".$digest;
114
115	3					75	$self->logger->warn(
116							"Renamed GFF file from: $input_file -> $output_file" );
117	3					73	$self->logger->warn(
118							"Locus tag used is '$random_locus_tag' for file: $input_file" );
119
120	3					20	my $found_fasta = 0;
121	3					5	my $gene_counter = 1;
122	3					56	while (<$input_gff_fh>) {
123	663					728	my $line = $_;
124
125	663	100				881	if ( $line =~ /^\#\#FASTA/ ) {
126	3					4	$found_fasta = 1;
127							}
128
129	663	100	100			1332	if ( $line =~ /\#/ \|\| $found_fasta == 1 ) {
130	645					571	print {$out_gff_fh} $line;
	645					845
131	645					1102	next;
132							}
133
134	18					61	my @cells = split( /\t/, $line );
135	18					46	my @tags = split( /;/, $cells[8] );
136	18					21	my $found_id = 0;
137	18					35	for ( my $i = 0 ; $i < @tags ; $i++ ) {
138	52	100				127	if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
139	10					18	my $current_id = $2;
140	10					204	$current_id .= '___' . $self->suffix_counter;
141	10					29	$tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3;
142	10					12	$gene_counter++;
143	10					11	$found_id++;
144	10					12	last;
145							}
146							}
147	18	100				32	if ( $found_id == 0 ) {
148	8					18	unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter );
149	8					9	$gene_counter++;
150							}
151	18					43	$cells[8] = join( ';', @tags );
152	18					21	print {$out_gff_fh} join( "\t", @cells );
	18					80
153							}
154
155	3	50				8	if ( $found_fasta == 0 ) {
156	0					0	$self->logger->warn(
157							"Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
158	0					0	return undef;
159							}
160	3					51	close($out_gff_fh);
161	3					9	close($input_gff_fh);
162	3					20	return $output_file;
163							}
164
165							sub _get_ids_for_gff_file {
166	10			10		705	my ( $self, $file ) = @_;
167	10					16	my @gene_ids;
168	10					306	my $tags_regex = $self->_tags_to_filter;
169	10					94	my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
170	10					7127	while ( my $feature = $gffio->next_feature() ) {
171	100	100				53521	next if !( $feature->primary_tag =~ /$tags_regex/ );
172	51					638	my $gene_id = $self->_get_feature_id($feature);
173	51	50				243	push( @gene_ids, $gene_id ) if ( defined($gene_id) );
174							}
175	10					56806	return \@gene_ids;
176							}
177
178							sub _get_feature_id {
179	51			51		95	my ( $self, $feature ) = @_;
180	51					77	my ( $gene_id, @junk );
181	51	50				122	if ( $feature->has_tag('ID') ) {
		0
182	51					329	( $gene_id, @junk ) = $feature->get_tag_values('ID');
183							}
184							elsif ( $feature->has_tag('locus_tag') ) {
185	0					0	( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
186							}
187							else {
188	0					0	return undef;
189							}
190	51					550	$gene_id =~ s!["']!!g;
191	51	50				117	return undef if ( $gene_id eq "" );
192	51					103	return $gene_id;
193							}
194
195	2			2		3303	no Moose;
	2					5
	2					24
196							__PACKAGE__->meta->make_immutable;
197
198							1;
199
200							__END__
201
202							=pod
203
204							=encoding UTF-8
205
206							=head1 NAME
207
208							Bio::Roary::ReformatInputGFFs - Take in gff files and add suffix where a gene id is seen twice
209
210							=head1 VERSION
211
212							version 3.10.2
213
214							=head1 SYNOPSIS
215
216							Take in gff files and add suffix where a gene id is seen twice
217							use Bio::Roary::ReformatInputGFFs;
218
219							my $obj = Bio::Roary::PrepareInputFiles->new(
220							gff_files => ['abc.gff','ddd.faa'],
221							);
222							$obj->fix_duplicate_gene_ids;
223							$obj->fixed_gff_files;
224
225							=head1 AUTHOR
226
227							Andrew J. Page <ap13@sanger.ac.uk>
228
229							=head1 COPYRIGHT AND LICENSE
230
231							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
232
233							This is free software, licensed under:
234
235							The GNU General Public License, Version 3, June 2007
236
237							=cut