line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2014-10-22 14:03:15 mtw> |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::AnnoC; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
71970
|
use 5.12.0; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
38
|
|
7
|
1
|
|
|
1
|
|
8
|
use version; our $VERSION = qv('0.07'); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
5
|
|
8
|
1
|
|
|
1
|
|
282
|
use Bio::ViennaNGS qw(sortbed); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Bio::Tools::GFF; |
10
|
|
|
|
|
|
|
use IPC::Cmd qw(can_run run); |
11
|
|
|
|
|
|
|
use Path::Class; |
12
|
|
|
|
|
|
|
use Carp; |
13
|
|
|
|
|
|
|
use Moose; |
14
|
|
|
|
|
|
|
use namespace::autoclean; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
has 'accession' => ( |
17
|
|
|
|
|
|
|
is => 'rw', |
18
|
|
|
|
|
|
|
isa => 'Str', |
19
|
|
|
|
|
|
|
predicate => 'has_accession', |
20
|
|
|
|
|
|
|
); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
has 'features' => ( |
23
|
|
|
|
|
|
|
is => 'ro', |
24
|
|
|
|
|
|
|
isa => 'HashRef', |
25
|
|
|
|
|
|
|
predicate => 'has_features', |
26
|
|
|
|
|
|
|
default => sub { {} }, |
27
|
|
|
|
|
|
|
); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
has 'nr_features' => ( |
30
|
|
|
|
|
|
|
is => 'ro', |
31
|
|
|
|
|
|
|
isa => 'Int', |
32
|
|
|
|
|
|
|
builder => '_get_nr_of_features', |
33
|
|
|
|
|
|
|
lazy => 1, |
34
|
|
|
|
|
|
|
); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
has 'featstat' => ( |
37
|
|
|
|
|
|
|
is => 'ro', |
38
|
|
|
|
|
|
|
isa => 'HashRef', |
39
|
|
|
|
|
|
|
builder => '_set_featstat', |
40
|
|
|
|
|
|
|
predicate => 'has_featstat', |
41
|
|
|
|
|
|
|
lazy => 1, |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
before 'featstat' => sub { |
45
|
|
|
|
|
|
|
my $self = shift; |
46
|
|
|
|
|
|
|
$self->_get_nr_of_features(); |
47
|
|
|
|
|
|
|
}; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub _set_featstat { |
50
|
|
|
|
|
|
|
my $self = shift; |
51
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
52
|
|
|
|
|
|
|
my %fs = (); |
53
|
|
|
|
|
|
|
confess "ERROR [$this_function] \$self->features not available" |
54
|
|
|
|
|
|
|
unless ($self->has_features); |
55
|
|
|
|
|
|
|
$fs{total} = 0; |
56
|
|
|
|
|
|
|
$fs{origin} = "$this_function ".$VERSION; |
57
|
|
|
|
|
|
|
$fs{count} = $self->nr_features; |
58
|
|
|
|
|
|
|
foreach my $uid ( keys %{$self->features} ){ |
59
|
|
|
|
|
|
|
my $gbkey = ${$self->features}{$uid}->{gbkey}; |
60
|
|
|
|
|
|
|
$fs{total} += 1; |
61
|
|
|
|
|
|
|
unless (exists $fs{$gbkey}){ |
62
|
|
|
|
|
|
|
$fs{$gbkey} = 0; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
$fs{$gbkey} += 1; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
return \%fs; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub _get_nr_of_features { |
70
|
|
|
|
|
|
|
my $self = shift; |
71
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
72
|
|
|
|
|
|
|
confess "ERROR [$this_function] \$self->features not available" |
73
|
|
|
|
|
|
|
unless ($self->has_features); |
74
|
|
|
|
|
|
|
return (keys %{$self->features}); |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub parse_gff { |
78
|
|
|
|
|
|
|
my ($self,$in_file) = @_; |
79
|
|
|
|
|
|
|
my ($i,$gffio,$header,$f,$gbkey); |
80
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
$gffio = Bio::Tools::GFF->new(-file => $in_file, |
83
|
|
|
|
|
|
|
-gff_version => 3, |
84
|
|
|
|
|
|
|
); |
85
|
|
|
|
|
|
|
$gffio->ignore_sequence(1); |
86
|
|
|
|
|
|
|
if ($header = $gffio->next_segment() ){ |
87
|
|
|
|
|
|
|
$self->accession( $header->display_id() ); |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
else{ carp "ERROR [$this_function] Cannot parse GFF header\n" } |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
while($f = $gffio->next_feature()) { |
92
|
|
|
|
|
|
|
my ($uid,$feat_name); |
93
|
|
|
|
|
|
|
my @name = my @id = my @gbkeys = (); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
next if ($f->primary_tag() eq "exon"); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# 1) determine gbkey of the current feature |
98
|
|
|
|
|
|
|
@gbkeys = $f->get_tag_values("gbkey"); |
99
|
|
|
|
|
|
|
$gbkey = $gbkeys[0]; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# 2) get a unique ID for each feature |
102
|
|
|
|
|
|
|
if ($f->has_tag('ID')){ |
103
|
|
|
|
|
|
|
@id = $f->get_tag_values('ID'); |
104
|
|
|
|
|
|
|
$uid = $id[0]; # ID=id101 |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
else { |
107
|
|
|
|
|
|
|
croak "ERROR [$this_function] Feature '$gbkey' at pos.\ |
108
|
|
|
|
|
|
|
$f->start does not have \'ID\' attribute\n"; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# 3) assign parent's unique ID in case a parent record exists |
112
|
|
|
|
|
|
|
if ($f->has_tag('Parent')){ |
113
|
|
|
|
|
|
|
@id = $f->get_tag_values('Parent'); |
114
|
|
|
|
|
|
|
$uid = $id[0]; # ID=id101 |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# 4) find a name for the current feature, use 'Name' or 'ID' attribute |
118
|
|
|
|
|
|
|
if ($f->has_tag('Name')){ |
119
|
|
|
|
|
|
|
@name = $f->get_tag_values('Name'); |
120
|
|
|
|
|
|
|
$feat_name = $name[0]; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
elsif ($f->has_tag('ID')){ |
123
|
|
|
|
|
|
|
@id = $f->get_tag_values('ID'); |
124
|
|
|
|
|
|
|
$feat_name = $id[0]; # ID=id101, use ID as feature name |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
else { |
127
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot set name for feature \ |
128
|
|
|
|
|
|
|
$f->gbkey at pos. $f->start\n"; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
unless (exists ${$self->features}{$uid}) { # gene / ribosome_entry_site / etc. |
132
|
|
|
|
|
|
|
${$self->features}{$uid}->{start} = $f->start; |
133
|
|
|
|
|
|
|
${$self->features}{$uid}->{end} = $f->end; |
134
|
|
|
|
|
|
|
${$self->features}{$uid}->{strand} = $f->strand; |
135
|
|
|
|
|
|
|
${$self->features}{$uid}->{length} = $f->length; |
136
|
|
|
|
|
|
|
${$self->features}{$uid}->{seqid} = $f->seq_id; |
137
|
|
|
|
|
|
|
${$self->features}{$uid}->{score} = $f->score || 0; |
138
|
|
|
|
|
|
|
${$self->features}{$uid}->{gbkey} = $gbkey; |
139
|
|
|
|
|
|
|
${$self->features}{$uid}->{name} = $feat_name; |
140
|
|
|
|
|
|
|
${$self->features}{$uid}->{uid} = $uid; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
else { # CDS / tRNA / rRNA / etc. |
143
|
|
|
|
|
|
|
${$self->features}{$uid}->{gbkey} = $gbkey; # gbkey for tRNA/ rRNA/ CDS etc |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
$gffio->close(); |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub features2bed { |
150
|
|
|
|
|
|
|
my ($self,$gbkey,$dest,$bn,$log) = @_; |
151
|
|
|
|
|
|
|
my ($chrom,$chrom_start,$chrom_end,$name,$score,$strand,$thick_start); |
152
|
|
|
|
|
|
|
my ($thick_end,$reserved,$block_count,$block_sizes,$block_starts); |
153
|
|
|
|
|
|
|
my @ft = (); |
154
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
155
|
|
|
|
|
|
|
my $bedtools = can_run('bedtools') or |
156
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find 'bedtools' utility"; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
croak "ERROR [$this_function] $self->features not available" |
159
|
|
|
|
|
|
|
unless ($self->has_features); |
160
|
|
|
|
|
|
|
croak "ERROT [$this_function] $self->featstat not available" |
161
|
|
|
|
|
|
|
unless ($self->has_featstat); |
162
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist" |
163
|
|
|
|
|
|
|
unless (-d $dest); |
164
|
|
|
|
|
|
|
if (defined $log){open(LOG, ">>", $log) or croak $!;} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
if (defined $gbkey){ # dump features of just one genbank key |
167
|
|
|
|
|
|
|
confess "ERROR [$this_function] genbank key \'$gbkey\' N/A in hash " |
168
|
|
|
|
|
|
|
unless (exists ${$self->featstat}{$gbkey}); |
169
|
|
|
|
|
|
|
$ft[0] = $gbkey; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
else{ # dump features for all genbank keys |
172
|
|
|
|
|
|
|
foreach my $gbk (keys %{$self->featstat}) { |
173
|
|
|
|
|
|
|
next if ($gbk eq 'total' || $gbk eq 'Src' || $gbk eq 'accession' || |
174
|
|
|
|
|
|
|
$gbk eq 'origin' || $gbk eq 'count'); |
175
|
|
|
|
|
|
|
push @ft,$gbk; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
foreach my $f (@ft){ |
180
|
|
|
|
|
|
|
my $bedname = file($dest,"$bn.$f.bed"); |
181
|
|
|
|
|
|
|
my $bedname_u = file($dest,"$bn.$f.u.bed"); |
182
|
|
|
|
|
|
|
open (BEDOUT, "> $bedname_u") or croak $!; |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# dump unsorted gene annotation from DS to BED12 |
185
|
|
|
|
|
|
|
foreach my $uid (keys %{$self->features}){ |
186
|
|
|
|
|
|
|
next unless (${$self->features}{$uid}->{gbkey} eq $f); |
187
|
|
|
|
|
|
|
my @bedline = (); |
188
|
|
|
|
|
|
|
$chrom = ${$self->features}{$uid}->{seqid}; |
189
|
|
|
|
|
|
|
$chrom_start = ${$self->features}{$uid}->{start}; |
190
|
|
|
|
|
|
|
$chrom_start--; # BED is 0-based |
191
|
|
|
|
|
|
|
$chrom_end = ${$self->features}{$uid}->{end}; |
192
|
|
|
|
|
|
|
$name = ${$self->features}{$uid}->{name}; |
193
|
|
|
|
|
|
|
$score = ${$self->features}{$uid}->{score}; |
194
|
|
|
|
|
|
|
$strand = ${$self->features}{$uid}->{strand} == -1 ? '-' : '+'; #default to + |
195
|
|
|
|
|
|
|
$thick_start = $chrom_start; |
196
|
|
|
|
|
|
|
$thick_end = $chrom_end; |
197
|
|
|
|
|
|
|
$reserved = 0; |
198
|
|
|
|
|
|
|
$block_count = 1; |
199
|
|
|
|
|
|
|
$block_sizes = ${$self->features}{$uid}->{length}.","; |
200
|
|
|
|
|
|
|
$block_starts = "0,"; |
201
|
|
|
|
|
|
|
@bedline = join ("\t", ($chrom,$chrom_start,$chrom_end, |
202
|
|
|
|
|
|
|
$name,$score,$strand,$thick_start, |
203
|
|
|
|
|
|
|
$thick_end,$reserved,$block_count, |
204
|
|
|
|
|
|
|
$block_sizes, $block_starts)); |
205
|
|
|
|
|
|
|
print BEDOUT "@bedline\n"; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
close (BEDOUT); |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sortbed($bedname_u,".",$bedname,1,undef); # sort bed file |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
} # end foreach |
212
|
|
|
|
|
|
|
if (defined $log){close(LOG)}; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
sub feature_summary { |
216
|
|
|
|
|
|
|
my ($self, $dest) = @_; |
217
|
|
|
|
|
|
|
my ($fn,$fh); |
218
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist\n" |
221
|
|
|
|
|
|
|
unless (-d $dest); |
222
|
|
|
|
|
|
|
croak "ERROR [$this_function] $self->accession not available\n" |
223
|
|
|
|
|
|
|
unless ($self->has_accession); |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
$fn = dir($dest,$self->accession.".summary.txt"); |
226
|
|
|
|
|
|
|
open $fh, ">", $fn or croak $!; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
print $fh "Accession\t ".$self->accession."\n"; |
229
|
|
|
|
|
|
|
print $fh "Origin \t ${$self->featstat}{origin}\n"; |
230
|
|
|
|
|
|
|
foreach my $ft (sort keys %{$self->featstat}){ |
231
|
|
|
|
|
|
|
next if ($ft =~ /total/ || $ft =~ /accession/ || $ft =~ /origin/); |
232
|
|
|
|
|
|
|
print $fh "$ft\t${$self->featstat}{$ft}\n"; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
print $fh "Total\t${$self->featstat}{total}\n"; |
235
|
|
|
|
|
|
|
close $fh; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
1; |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
__END__ |