line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::GFF3::LowLevel; |
2
|
|
|
|
|
|
|
BEGIN { |
3
|
4
|
|
|
4
|
|
24765
|
$Bio::GFF3::LowLevel::AUTHORITY = 'cpan:RBUELS'; |
4
|
|
|
|
|
|
|
} |
5
|
|
|
|
|
|
|
{ |
6
|
|
|
|
|
|
|
$Bio::GFF3::LowLevel::VERSION = '2.0'; |
7
|
|
|
|
|
|
|
} |
8
|
|
|
|
|
|
|
# ABSTRACT: fast, low-level functions for parsing and formatting GFF3 |
9
|
|
|
|
|
|
|
|
10
|
4
|
|
|
4
|
|
26
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
117
|
|
11
|
|
|
|
|
|
|
|
12
|
4
|
|
|
4
|
|
23
|
use Scalar::Util (); |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
54
|
|
13
|
4
|
|
|
4
|
|
3385
|
use URI::Escape (); |
|
4
|
|
|
|
|
5894
|
|
|
4
|
|
|
|
|
367
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
require Exporter; |
17
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
18
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
19
|
|
|
|
|
|
|
gff3_parse_feature |
20
|
|
|
|
|
|
|
gff3_parse_attributes |
21
|
|
|
|
|
|
|
gff3_parse_directive |
22
|
|
|
|
|
|
|
gff3_format_feature |
23
|
|
|
|
|
|
|
gff3_format_attributes |
24
|
|
|
|
|
|
|
gff3_escape |
25
|
|
|
|
|
|
|
gff3_unescape |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my @gff3_field_names = qw( |
29
|
|
|
|
|
|
|
seq_id |
30
|
|
|
|
|
|
|
source |
31
|
|
|
|
|
|
|
type |
32
|
|
|
|
|
|
|
start |
33
|
|
|
|
|
|
|
end |
34
|
|
|
|
|
|
|
score |
35
|
|
|
|
|
|
|
strand |
36
|
|
|
|
|
|
|
phase |
37
|
|
|
|
|
|
|
attributes |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub gff3_parse_feature { |
42
|
5102
|
|
|
5102
|
1
|
6423
|
my ( $line ) = @_; |
43
|
4
|
|
|
4
|
|
36
|
no warnings 'uninitialized'; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
1319
|
|
44
|
|
|
|
|
|
|
|
45
|
5102
|
|
|
|
|
27679
|
my @f = split /\t/, $line; |
46
|
5102
|
|
|
|
|
10453
|
for( @f ) { |
47
|
46097
|
100
|
|
|
|
104757
|
if( $_ eq '.' ) { |
48
|
6518
|
|
|
|
|
8650
|
$_ = undef; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# unescape only the ref and source columns |
53
|
5102
|
|
|
|
|
8405
|
$f[0] =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
3
|
|
|
|
|
9
|
|
54
|
5102
|
|
|
|
|
6490
|
$f[1] =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
|
0
|
|
|
|
|
0
|
|
55
|
|
|
|
|
|
|
|
56
|
5102
|
|
|
|
|
9179
|
$f[8] = gff3_parse_attributes( $f[8] ); |
57
|
5102
|
|
|
|
|
7130
|
my %parsed; |
58
|
5102
|
|
|
|
|
35032
|
@parsed{@gff3_field_names} = @f; |
59
|
5102
|
|
|
|
|
23765
|
return \%parsed; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub gff3_parse_attributes { |
64
|
5102
|
|
|
5102
|
1
|
7015
|
my ( $attr_string ) = @_; |
65
|
|
|
|
|
|
|
|
66
|
5102
|
100
|
66
|
|
|
22513
|
return {} if !defined $attr_string || $attr_string eq '.'; |
67
|
|
|
|
|
|
|
|
68
|
5101
|
|
|
|
|
19173
|
$attr_string =~ s/\r?\n$//; |
69
|
|
|
|
|
|
|
|
70
|
5101
|
|
|
|
|
5893
|
my %attrs; |
71
|
5101
|
|
|
|
|
12346
|
for my $a ( split ';', $attr_string ) { |
72
|
4
|
|
|
4
|
|
31
|
no warnings 'uninitialized'; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
2451
|
|
73
|
9666
|
|
|
|
|
21700
|
my ( $name, $values ) = split '=', $a, 2; |
74
|
9666
|
100
|
|
|
|
20088
|
next unless defined $values; |
75
|
9665
|
|
|
|
|
9447
|
push @{$attrs{$name}}, map { s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; $_ } split ',', $values; |
|
9665
|
|
|
|
|
31333
|
|
|
9850
|
|
|
|
|
15387
|
|
|
992
|
|
|
|
|
9421
|
|
|
9850
|
|
|
|
|
31940
|
|
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
5101
|
|
|
|
|
12710
|
return \%attrs; |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub gff3_parse_directive { |
83
|
65
|
|
|
65
|
1
|
92
|
my ( $line ) = @_; |
84
|
|
|
|
|
|
|
|
85
|
65
|
50
|
|
|
|
405
|
my ( $name, $contents ) = $line =~ /^ \s* \#\# \s* (\S+) \s* (.*) $/x |
86
|
|
|
|
|
|
|
or return; |
87
|
|
|
|
|
|
|
|
88
|
65
|
|
|
|
|
185
|
my $parsed = { directive => $name }; |
89
|
65
|
100
|
|
|
|
153
|
if( length $contents ) { |
90
|
54
|
|
|
|
|
89
|
$contents =~ s/\r?\n$//; |
91
|
54
|
|
|
|
|
120
|
$parsed->{value} = $contents; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# do a little additional parsing for sequence-region and genome-build directives |
95
|
65
|
100
|
|
|
|
205
|
if( $name eq 'sequence-region' ) { |
|
|
50
|
|
|
|
|
|
96
|
12
|
|
|
|
|
46
|
my ( $seqid, $start, $end ) = split /\s+/, $contents, 3; |
97
|
12
|
|
|
|
|
70
|
s/\D//g for $start, $end; |
98
|
12
|
|
|
|
|
25
|
@{$parsed}{qw( seq_id start end )} = ( $seqid, $start, $end ); |
|
12
|
|
|
|
|
54
|
|
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
elsif( $name eq 'genome-build' ) { |
101
|
0
|
|
|
|
|
0
|
my ( $source, $buildname ) = split /\s+/, $contents, 2; |
102
|
0
|
|
|
|
|
0
|
@{$parsed}{qw(source buildname)} = ( $source, $buildname ); |
|
0
|
|
|
|
|
0
|
|
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
65
|
|
|
|
|
253
|
return $parsed; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub gff3_format_feature { |
110
|
10
|
|
|
10
|
1
|
1855
|
my ( $f ) = @_; |
111
|
|
|
|
|
|
|
|
112
|
10
|
|
|
|
|
42
|
my $attr_string = $f->{attributes}; |
113
|
10
|
50
|
|
|
|
28
|
$attr_string = '.' unless defined $attr_string; |
114
|
|
|
|
|
|
|
|
115
|
10
|
50
|
33
|
|
|
80
|
$attr_string = gff3_format_attributes( $attr_string ) |
116
|
|
|
|
|
|
|
if ref( $attr_string ) eq 'HASH' |
117
|
|
|
|
|
|
|
&& ! Scalar::Util::blessed( $attr_string ); |
118
|
|
|
|
|
|
|
|
119
|
80
|
100
|
|
|
|
2066
|
return join( "\t", |
120
|
10
|
|
|
|
|
49
|
( map { defined $_ ? gff3_escape($_) : '.' } |
121
|
10
|
|
|
|
|
24
|
@{$f}{@gff3_field_names[0..7]} |
122
|
|
|
|
|
|
|
), |
123
|
|
|
|
|
|
|
$attr_string |
124
|
|
|
|
|
|
|
)."\n"; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
my %force_attr_first = ( |
129
|
|
|
|
|
|
|
ID => 1, |
130
|
|
|
|
|
|
|
Name => 2, |
131
|
|
|
|
|
|
|
Alias => 3, |
132
|
|
|
|
|
|
|
Parent => 4, |
133
|
|
|
|
|
|
|
); |
134
|
|
|
|
|
|
|
sub _cmp_attr_names { |
135
|
4
|
|
|
4
|
|
24
|
no warnings 'uninitialized'; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
629
|
|
136
|
11
|
|
|
11
|
|
26
|
my ( $fa, $fb ) = @force_attr_first{ $a, $b }; |
137
|
11
|
100
|
100
|
|
|
53
|
return $fa <=> $fb if $fa && $fb; |
138
|
|
|
|
|
|
|
|
139
|
9
|
100
|
66
|
|
|
50
|
return -1 if $fa && !$fb; |
140
|
4
|
100
|
66
|
|
|
24
|
return 1 if !$fa && $fb; |
141
|
|
|
|
|
|
|
|
142
|
1
|
|
|
|
|
4
|
return $a cmp $b; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub gff3_format_attributes { |
146
|
17
|
|
|
17
|
1
|
318
|
my ( $attr ) = @_; |
147
|
|
|
|
|
|
|
|
148
|
17
|
100
|
|
|
|
47
|
return '.' unless defined $attr; |
149
|
|
|
|
|
|
|
|
150
|
24
|
|
|
|
|
30
|
my $astring = join ';' => ( |
151
|
|
|
|
|
|
|
map { |
152
|
16
|
|
|
|
|
79
|
my $key = $_; |
153
|
24
|
|
|
|
|
41
|
my $val = $attr->{$key}; |
154
|
4
|
|
|
4
|
|
23
|
no warnings 'uninitialized'; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
636
|
|
155
|
24
|
100
|
|
|
|
86
|
$val = join( ',', map gff3_escape($_), ref $val eq 'ARRAY' ? @$val : $val ); |
156
|
24
|
100
|
|
|
|
2725
|
if( length $val ) { |
157
|
21
|
|
|
|
|
70
|
"$key=$val" |
158
|
|
|
|
|
|
|
} else { |
159
|
|
|
|
|
|
|
() |
160
|
3
|
|
|
|
|
7
|
} |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
sort _cmp_attr_names |
163
|
|
|
|
|
|
|
keys %$attr |
164
|
|
|
|
|
|
|
); |
165
|
|
|
|
|
|
|
|
166
|
16
|
100
|
|
|
|
70
|
return length $astring ? $astring : '.'; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub gff3_escape { |
171
|
86
|
|
|
86
|
1
|
591
|
URI::Escape::uri_escape( $_[0], '\n\r\t;=%&,\x00-\x1f\x7f-\xff' ) |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
*gff3_unescape = \&URI::Escape::uri_unescape; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
__END__ |