File Coverage

lib/Bio/Roary/ReformatInputGFFs.pm
Criterion Covered Total %
statement 120 129 93.0
branch 24 38 63.1
condition 3 3 100.0
subroutine 16 16 100.0
pod 0 1 0.0
total 163 187 87.1


line stmt bran cond sub pod time code
1             package Bio::Roary::ReformatInputGFFs;
2             $Bio::Roary::ReformatInputGFFs::VERSION = '3.10.2';
3             # ABSTRACT: Take in gff files and add suffix where a gene id is seen twice
4              
5              
6 2     2   85468 use Moose;
  2         383896  
  2         17  
7 2     2   17924 use Bio::Roary::Exceptions;
  2         7  
  2         64  
8 2     2   16 use Cwd;
  2         5  
  2         169  
9 2     2   289 use File::Copy;
  2         1749  
  2         165  
10 2     2   525 use Log::Log4perl qw(:easy);
  2         31052  
  2         23  
11 2     2   2321 use Bio::Tools::GFF;
  2         76200  
  2         88  
12 2     2   17 use File::Path qw(make_path);
  2         6  
  2         122  
13 2     2   16 use File::Basename;
  2         5  
  2         168  
14 2     2   941 use Digest::MD5::File qw(file_md5_hex);
  2         59127  
  2         17  
15              
16             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
17             has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
18             has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );
19             has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' );
20             has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 );
21              
22             has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
23              
24             sub _build_logger {
25 3     3   7 my ($self) = @_;
26 3         23 Log::Log4perl->easy_init( $ERROR );
27 3         6824 my $logger = get_logger();
28 3         211 return $logger;
29             }
30              
31             sub fix_duplicate_gene_ids {
32 4     4 0 684 my ($self) = @_;
33              
34 4         9 my %gene_ids_seen_before;
35            
36             my %file_md5s;
37            
38 4         10 for my $file ( @{ $self->gff_files } ) {
  4         136  
39 9         50 my $digest = file_md5_hex($file);
40            
41 9 100       1862 if(defined($file_md5s{$digest}))
42             {
43             $self->logger->warn(
44 1         24 "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file
45             );
46 1         8 next;
47             }
48             else
49             {
50 8         31 $file_md5s{$digest} = $file;
51             }
52            
53 8         13 my $ids_seen = 0;
54 8         33 my $ids_from_file = $self->_get_ids_for_gff_file($file);
55              
56 8 50       718 if ( @{$ids_from_file} < 1 ) {
  8         33  
57 0         0 $self->logger->error(
58             "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
59             );
60             }
61             else {
62 8         16 for my $gene_id ( @{$ids_from_file} ) {
  8         20  
63 33 100       64 if ( $gene_ids_seen_before{$gene_id} ) {
64 2         56 $self->logger->error(
65             "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file "
66             );
67 2         825 my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest);
68 2 50       6 push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
  2         55  
69 2         3 $ids_seen = 1;
70 2         4 last;
71             }
72 31         69 $gene_ids_seen_before{$gene_id}++;
73             }
74            
75             # We know its a valid GFF file since we could open it and extract IDs.
76             # We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but
77             # if they didnt, then we have to double check and repair if necessary.
78 8 100       27 if ( $ids_seen == 0 ) {
79            
80            
81 6         10 push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) );
  6         203  
82             }
83             }
84             }
85 4         36 return 1;
86             }
87              
88             sub _fix_gff_file_extension
89             {
90 6     6   21 my ( $self, $input_file ) = @_;
91            
92 6         350 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
93 6 50       49 return $input_file if($suffix eq '.gff');
94            
95            
96 0 0       0 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
97 0         0 my $output_file = $self->output_directory . '/' . $filename . '.gff';
98 0 0       0 copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file");
99 0         0 return $output_file;
100             }
101              
102              
103             sub _add_suffix_to_gene_ids_and_return_new_file {
104 3     3   10 my ( $self, $input_file, $digest ) = @_;
105 3         91 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
106 3 50       84 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
107 3         81 my $output_file = $self->output_directory . '/' . $filename . '.gff';
108              
109 3         80 open( my $input_gff_fh, $input_file );
110 3         143 open( my $out_gff_fh, '>', $output_file );
111            
112             # There is a chance that there can be a collision here, but its remote.
113 3         11 my $random_locus_tag = "".$digest;
114            
115 3         75 $self->logger->warn(
116             "Renamed GFF file from: $input_file -> $output_file" );
117 3         73 $self->logger->warn(
118             "Locus tag used is '$random_locus_tag' for file: $input_file" );
119              
120 3         20 my $found_fasta = 0;
121 3         5 my $gene_counter = 1;
122 3         56 while (<$input_gff_fh>) {
123 663         728 my $line = $_;
124              
125 663 100       881 if ( $line =~ /^\#\#FASTA/ ) {
126 3         4 $found_fasta = 1;
127             }
128              
129 663 100 100     1332 if ( $line =~ /\#/ || $found_fasta == 1 ) {
130 645         571 print {$out_gff_fh} $line;
  645         845  
131 645         1102 next;
132             }
133              
134 18         61 my @cells = split( /\t/, $line );
135 18         46 my @tags = split( /;/, $cells[8] );
136 18         21 my $found_id = 0;
137 18         35 for ( my $i = 0 ; $i < @tags ; $i++ ) {
138 52 100       127 if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
139 10         18 my $current_id = $2;
140 10         204 $current_id .= '___' . $self->suffix_counter;
141 10         29 $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3;
142 10         12 $gene_counter++;
143 10         11 $found_id++;
144 10         12 last;
145             }
146             }
147 18 100       32 if ( $found_id == 0 ) {
148 8         18 unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter );
149 8         9 $gene_counter++;
150             }
151 18         43 $cells[8] = join( ';', @tags );
152 18         21 print {$out_gff_fh} join( "\t", @cells );
  18         80  
153             }
154              
155 3 50       8 if ( $found_fasta == 0 ) {
156 0         0 $self->logger->warn(
157             "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
158 0         0 return undef;
159             }
160 3         51 close($out_gff_fh);
161 3         9 close($input_gff_fh);
162 3         20 return $output_file;
163             }
164              
165             sub _get_ids_for_gff_file {
166 10     10   705 my ( $self, $file ) = @_;
167 10         16 my @gene_ids;
168 10         306 my $tags_regex = $self->_tags_to_filter;
169 10         94 my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
170 10         7127 while ( my $feature = $gffio->next_feature() ) {
171 100 100       53521 next if !( $feature->primary_tag =~ /$tags_regex/ );
172 51         638 my $gene_id = $self->_get_feature_id($feature);
173 51 50       243 push( @gene_ids, $gene_id ) if ( defined($gene_id) );
174             }
175 10         56806 return \@gene_ids;
176             }
177              
178             sub _get_feature_id {
179 51     51   95 my ( $self, $feature ) = @_;
180 51         77 my ( $gene_id, @junk );
181 51 50       122 if ( $feature->has_tag('ID') ) {
    0          
182 51         329 ( $gene_id, @junk ) = $feature->get_tag_values('ID');
183             }
184             elsif ( $feature->has_tag('locus_tag') ) {
185 0         0 ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
186             }
187             else {
188 0         0 return undef;
189             }
190 51         550 $gene_id =~ s!["']!!g;
191 51 50       117 return undef if ( $gene_id eq "" );
192 51         103 return $gene_id;
193             }
194              
195 2     2   3303 no Moose;
  2         5  
  2         24  
196             __PACKAGE__->meta->make_immutable;
197              
198             1;
199              
200             __END__
201              
202             =pod
203              
204             =encoding UTF-8
205              
206             =head1 NAME
207              
208             Bio::Roary::ReformatInputGFFs - Take in gff files and add suffix where a gene id is seen twice
209              
210             =head1 VERSION
211              
212             version 3.10.2
213              
214             =head1 SYNOPSIS
215              
216             Take in gff files and add suffix where a gene id is seen twice
217             use Bio::Roary::ReformatInputGFFs;
218              
219             my $obj = Bio::Roary::PrepareInputFiles->new(
220             gff_files => ['abc.gff','ddd.faa'],
221             );
222             $obj->fix_duplicate_gene_ids;
223             $obj->fixed_gff_files;
224              
225             =head1 AUTHOR
226              
227             Andrew J. Page <ap13@sanger.ac.uk>
228              
229             =head1 COPYRIGHT AND LICENSE
230              
231             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
232              
233             This is free software, licensed under:
234              
235             The GNU General Public License, Version 3, June 2007
236              
237             =cut