| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
|
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2014-10-24 10:31:25 mtw> |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::AnnoC; |
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
15409
|
use 5.12.0; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
32
|
|
|
7
|
1
|
|
|
1
|
|
408
|
use version; our $VERSION = qv('0.08'); |
|
|
1
|
|
|
|
|
1342
|
|
|
|
1
|
|
|
|
|
5
|
|
|
8
|
1
|
|
|
1
|
|
253
|
use Bio::ViennaNGS qw(sortbed); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Bio::Tools::GFF; |
|
10
|
|
|
|
|
|
|
use Path::Class; |
|
11
|
|
|
|
|
|
|
use Carp; |
|
12
|
|
|
|
|
|
|
use Moose; |
|
13
|
|
|
|
|
|
|
use namespace::autoclean; |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# Accession of the annotated sequence. Read/write; parse_gff() stores the
# display id of the GFF header segment here.
has accession => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_accession',
);

# Feature table: a hash reference keyed by unique feature ID. Every new
# object starts with its own empty hash.
has features => (
    is        => 'ro',
    isa       => 'HashRef',
    predicate => 'has_features',
    default   => sub { return {} },
);

# Number of entries in the feature table. Built on first access by
# _get_nr_of_features().
has nr_features => (
    is      => 'ro',
    isa     => 'Int',
    lazy    => 1,
    builder => '_get_nr_of_features',
);

# Feature statistics (hash reference). Built on first access by
# _set_featstat().
has featstat => (
    is        => 'ro',
    isa       => 'HashRef',
    lazy      => 1,
    builder   => '_set_featstat',
    predicate => 'has_featstat',
);
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Method modifier executed ahead of every call to the 'featstat' accessor.
# It invokes _get_nr_of_features() and ignores the value it returns; the
# only visible effect is that method's own availability check on the
# feature table.
before featstat => sub {
    my ($self) = @_;
    $self->_get_nr_of_features;
};
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Builder for the 'featstat' attribute.
#
# Returns a hash reference holding:
#   total   - number of entries visited in the feature table
#   origin  - name of this subroutine followed by the module version
#   count   - value of the 'nr_features' attribute
#   <gbkey> - one counter per distinct 'gbkey' value found in the table
#
# Confesses (dies with a stack trace) if the 'features' predicate is false.
sub _set_featstat {
    my ($self) = @_;
    my $this_function = (caller(0))[3];

    confess "ERROR [$this_function] \$self->features not available"
        unless ($self->has_features);

    my %stat;
    $stat{total}  = 0;
    $stat{origin} = "$this_function ".$VERSION;
    $stat{count}  = $self->nr_features;

    my $table = $self->features;
    for my $uid ( keys %{$table} ) {
        my $gbkey = $table->{$uid}->{gbkey};
        $stat{total} += 1;
        # first sighting of this key: start its counter at zero
        $stat{$gbkey} = 0 if not exists $stat{$gbkey};
        $stat{$gbkey} += 1;
    }

    return \%stat;
}
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Builder for the 'nr_features' attribute.
#
# Returns the number of entries in the feature table as a single integer.
# Confesses (dies with a stack trace) if the 'features' predicate is false.
sub _get_nr_of_features {
    my ($self) = @_;
    my $this_function = (caller(0))[3];

    confess "ERROR [$this_function] \$self->features not available"
        unless ($self->has_features);

    # 'keys' only yields a count in scalar context; force it so that a
    # caller in list context gets the count too, not the list of IDs.
    return scalar keys %{ $self->features };
}
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Parse a GFF3 file and fill the 'accession' and 'features' attributes.
#
# Arguments:
#   $in_file - path of the GFF3 file, handed to Bio::Tools::GFF
#
# The display id of the header segment becomes the accession; a warning is
# issued via carp if no header segment can be read. Features whose primary
# tag is "exon" are skipped. Every other feature is filed under its
# 'Parent' ID when it has one, otherwise under its own 'ID'. The first
# feature seen for a key creates the record (start, end, strand, length,
# seqid, score, gbkey, name, uid); later features with the same key only
# overwrite the record's gbkey.
#
# Croaks if a feature has no 'ID' attribute.
# Returns whatever $gffio->close() returns.
sub parse_gff {
    my ($self, $in_file) = @_;
    my $this_function = (caller(0))[3];

    my $gffio = Bio::Tools::GFF->new(
        -file        => $in_file,
        -gff_version => 3,
    );
    $gffio->ignore_sequence(1);

    if ( my $header = $gffio->next_segment() ) {
        $self->accession( $header->display_id() );
    }
    else {
        carp "ERROR [$this_function] Cannot parse GFF header\n";
    }

    my $features = $self->features;

    while ( my $f = $gffio->next_feature() ) {
        next if ( $f->primary_tag() eq "exon" );

        # 1) determine gbkey of the current feature
        # NOTE(review): presumably get_tag_values() throws when the tag is
        # absent, so features without 'gbkey' abort parsing -- confirm.
        my @gbkeys = $f->get_tag_values("gbkey");
        my $gbkey  = $gbkeys[0];

        # 2) get a unique ID for each feature
        # The position is concatenated because method calls are not
        # interpolated inside double-quoted strings.
        croak "ERROR [$this_function] Feature '$gbkey' at pos. "
            . $f->start . " does not have 'ID' attribute\n"
            unless $f->has_tag('ID');
        my @id     = $f->get_tag_values('ID');
        my $own_id = $id[0];    # ID=id101
        my $uid    = $own_id;

        # 3) assign parent's unique ID in case a parent record exists
        if ( $f->has_tag('Parent') ) {
            my @parent = $f->get_tag_values('Parent');
            $uid = $parent[0];
        }

        # 4) find a name for the current feature, use 'Name' or 'ID'
        #    attribute; 'ID' is guaranteed to exist by step 2
        my $feat_name = $own_id;
        if ( $f->has_tag('Name') ) {
            my @name = $f->get_tag_values('Name');
            $feat_name = $name[0];
        }

        if ( exists $features->{$uid} ) {    # CDS / tRNA / rRNA / etc.
            # record already created by an earlier feature: only the
            # gbkey is replaced
            $features->{$uid}->{gbkey} = $gbkey;
        }
        else {    # gene / ribosome_entry_site / etc.
            my %record;
            $record{start}  = $f->start;
            $record{end}    = $f->end;
            $record{strand} = $f->strand;
            $record{length} = $f->length;
            $record{seqid}  = $f->seq_id;
            $record{score}  = $f->score || 0;
            $record{gbkey}  = $gbkey;
            $record{name}   = $feat_name;
            $record{uid}    = $uid;
            $features->{$uid} = \%record;
        }
    }

    return $gffio->close();
}
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Dump features as BED12 files, one file per genbank key.
#
# Arguments:
#   $gbkey - genbank key to dump; if undef, every key found in 'featstat'
#            is dumped except the bookkeeping entries total, Src,
#            accession, origin and count
#   $dest  - existing output directory
#   $bn    - basename used for the output file names
#   $log   - optional log file; it is opened for appending and closed
#            again, but nothing in this method writes to it
#
# For each key an unsorted file "$bn.$key.u.bed" is written to $dest and
# then handed to sortbed() together with the target "$bn.$key.bed".
#
# Croaks if the 'features' or 'featstat' predicate is false, if $dest is
# not a directory, or if a file cannot be opened or closed. Confesses if
# $gbkey is given but has no entry in 'featstat'.
#
# NOTE(review): 'featstat' is declared lazy in this file, so its predicate
# presumably stays false until the accessor has been called once; callers
# seem to be expected to touch ->featstat beforehand -- confirm.
sub features2bed {
    my ($self, $gbkey, $dest, $bn, $log) = @_;
    my $this_function = (caller(0))[3];

    # '$self' is escaped so the message names the accessor instead of
    # embedding the stringified object.
    croak "ERROR [$this_function] \$self->features not available"
        unless ($self->has_features);
    croak "ERROR [$this_function] \$self->featstat not available"
        unless ($self->has_featstat);
    croak "ERROR [$this_function] $dest does not exist"
        unless (-d $dest);

    my $log_fh;
    if ( defined $log ) {
        open( $log_fh, ">>", $log ) or croak $!;
    }

    my @ft = ();
    if ( defined $gbkey ) {    # dump features of just one genbank key
        confess "ERROR [$this_function] genbank key \'$gbkey\' N/A in hash "
            unless ( exists ${$self->featstat}{$gbkey} );
        $ft[0] = $gbkey;
    }
    else {                     # dump features for all genbank keys
        my %is_bookkeeping =
            map { $_ => 1 } qw(total Src accession origin count);
        @ft = grep { !$is_bookkeeping{$_} } keys %{ $self->featstat };
    }

    my $features = $self->features;

    foreach my $f (@ft) {
        my $bedname   = file($dest, "$bn.$f.bed");
        my $bedname_u = file($dest, "$bn.$f.u.bed");

        # three-argument open: the path is never parsed for a mode
        open( my $bed_fh, ">", "$bedname_u" ) or croak $!;

        # dump unsorted gene annotation from DS to BED12
        foreach my $uid ( keys %{$features} ) {
            my $feat = $features->{$uid};
            next unless ( $feat->{gbkey} eq $f );

            my $chrom_start = $feat->{start} - 1;    # BED is 0-based
            my $chrom_end   = $feat->{end};
            my $strand = $feat->{strand} == -1 ? '-' : '+';   # default to +

            my $bedline = join(
                "\t",
                $feat->{seqid},            # chrom
                $chrom_start,
                $chrom_end,
                $feat->{name},
                $feat->{score},
                $strand,
                $chrom_start,              # thickStart
                $chrom_end,                # thickEnd
                0,                         # reserved
                1,                         # blockCount
                $feat->{length} . ",",     # blockSizes
                "0,",                      # blockStarts
            );
            print {$bed_fh} "$bedline\n";
        }

        # buffered write errors only surface when the handle is closed
        close($bed_fh) or croak $!;

        sortbed($bedname_u, ".", $bedname, 1, undef);    # sort bed file
    }

    if ( defined $log ) { close($log_fh) }
}
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# Write a plain-text summary of the feature statistics.
#
# Arguments:
#   $dest - existing output directory
#
# Creates "<accession>.summary.txt" in $dest containing the accession, the
# 'origin' entry of 'featstat', one "key<TAB>value" line per remaining
# 'featstat' entry (sorted by key; the entries total, accession and origin
# are left out of this listing) and a closing "Total" line.
#
# Croaks if $dest is not a directory, if the 'accession' predicate is
# false, or if the file cannot be opened or closed.
sub feature_summary {
    my ($self, $dest) = @_;
    my $this_function = (caller(0))[3];

    croak "ERROR [$this_function] $dest does not exist\n"
        unless (-d $dest);
    # '$self' is escaped so the message names the accessor instead of
    # embedding the stringified object.
    croak "ERROR [$this_function] \$self->accession not available\n"
        unless ($self->has_accession);

    # file() rather than dir(): the target is a file, not a directory
    my $fn = file( $dest, $self->accession . ".summary.txt" );
    open( my $fh, ">", "$fn" ) or croak $!;

    my $featstat = $self->featstat;

    print {$fh} "Accession\t " . $self->accession . "\n";
    print {$fh} "Origin \t $featstat->{origin}\n";

    foreach my $ft ( sort keys %{$featstat} ) {
        # exact comparison: a pattern match would also drop feature keys
        # that merely contain one of these words (e.g. "rep_origin")
        next if ( $ft eq 'total' || $ft eq 'accession' || $ft eq 'origin' );
        print {$fh} "$ft\t$featstat->{$ft}\n";
    }
    print {$fh} "Total\t$featstat->{total}\n";

    # buffered write errors only surface when the handle is closed
    close($fh) or croak $!;
}
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
1; |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
__END__ |