| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::GFF3::LowLevel; |
|
2
|
|
|
|
|
|
|
BEGIN { |
|
3
|
4
|
|
|
4
|
|
24765
|
$Bio::GFF3::LowLevel::AUTHORITY = 'cpan:RBUELS'; |
|
4
|
|
|
|
|
|
|
} |
|
5
|
|
|
|
|
|
|
{ |
|
6
|
|
|
|
|
|
|
$Bio::GFF3::LowLevel::VERSION = '2.0'; |
|
7
|
|
|
|
|
|
|
} |
|
8
|
|
|
|
|
|
|
# ABSTRACT: fast, low-level functions for parsing and formatting GFF3 |
|
9
|
|
|
|
|
|
|
|
|
10
|
4
|
|
|
4
|
|
26
|
use strict; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
117
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
4
|
|
|
4
|
|
23
|
use Scalar::Util (); |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
54
|
|
|
13
|
4
|
|
|
4
|
|
3385
|
use URI::Escape (); |
|
|
4
|
|
|
|
|
5894
|
|
|
|
4
|
|
|
|
|
367
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
require Exporter; |
|
17
|
|
|
|
|
|
|
# Export wiring.  The module inherits from Exporter and exports nothing
# by default: every public function has to be requested by name.
our @ISA = ( 'Exporter' );

our @EXPORT_OK = (
    'gff3_parse_feature',
    'gff3_parse_attributes',
    'gff3_parse_directive',
    'gff3_format_feature',
    'gff3_format_attributes',
    'gff3_escape',
    'gff3_unescape',
);
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Hash keys used for the nine tab-separated columns of a GFF3 feature
# line, listed in column order.  The parser assigns split columns to these
# keys positionally, and the formatter reads the first eight back out in
# the same order.
my @gff3_field_names = (
    'seq_id',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
);
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# gff3_parse_feature( $line )
#
# Parse a single line of GFF3 feature text and return a hashref keyed by
# the names in @gff3_field_names (seq_id, source, type, start, end, score,
# strand, phase, attributes).
#
#   * A column whose entire text is '.' is returned as undef.
#   * %XX escapes are decoded in the seq_id and source columns only.
#   * The attributes column is handed to gff3_parse_attributes() and is
#     therefore always a hashref (empty when the column is missing or '.').
#   * Columns that are absent from the line come back as undef.
sub gff3_parse_feature {
    my ( $line ) = @_;
    no warnings 'uninitialized';

    # Remove the line terminator before splitting.  Otherwise a line with
    # fewer than nine columns leaves "\n" / "\r\n" glued to its last
    # column, and a trailing '.' is not recognised as a missing value.
    $line =~ s/\r?\n$//;

    # The negative limit keeps empty trailing columns as '' instead of
    # silently dropping them.
    my @f = split /\t/, $line, -1;
    for my $field ( @f ) {
        $field = undef if $field eq '.';
    }

    # unescape only the ref and source columns
    $f[0] =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    $f[1] =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    $f[8] = gff3_parse_attributes( $f[8] );

    # Columns are matched to field names by position; anything beyond the
    # ninth column is ignored by the slice assignment.
    my %parsed;
    @parsed{@gff3_field_names} = @f;
    return \%parsed;
}
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# gff3_parse_attributes( $attr_string )
#
# Parse the text of a GFF3 attributes column (column 9) and return a
# hashref of the form { name => [ value, ... ], ... }.
#
#   * undef or '.' yields an empty hashref.
#   * One trailing "\n" or "\r\n" is removed first.
#   * Attributes are separated by ';', the name is split from its values
#     at the first '=', and values are separated by ','.
#   * %XX escapes are decoded in values only; names are left as written.
#   * A ';'-separated chunk that contains no '=' is skipped.
#   * A name that occurs more than once accumulates all of its values.
sub gff3_parse_attributes {
    my ( $attr_string ) = @_;

    return {} if !defined $attr_string || $attr_string eq '.';

    $attr_string =~ s/\r?\n$//;

    my %attrs;
    # The loop variable is deliberately not called $a: a lexical $a would
    # mask the package variable that sort comparison blocks rely on.
    for my $pair ( split /;/, $attr_string ) {
        my ( $name, $values ) = split /=/, $pair, 2;
        next unless defined $values;

        # Split on the literal commas first, then decode, so an escaped
        # comma (%2C) stays inside its value.
        my @decoded = split /,/, $values;
        s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg for @decoded;

        push @{ $attrs{$name} }, @decoded;
    }

    return \%attrs;
}
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# gff3_parse_directive( $line )
#
# Parse a GFF3 directive line ("##name contents") and return a hashref,
# or nothing (bare return) when the line does not look like a directive.
#
# The hashref always has:
#     directive => the first run of non-whitespace after '##'
# and, when anything follows the name:
#     value     => the remaining text, without its line terminator
#
# Two directives get extra keys:
#     sequence-region : seq_id, start, end  (start/end reduced to digits)
#     genome-build    : source, buildname
sub gff3_parse_directive {
    my ( $line ) = @_;

    my ( $name, $contents ) = $line =~ /^ \s* \#\# \s* (\S+) \s* (.*) $/x
        or return;

    # '.' never matches "\n" but does match "\r", so on CRLF input the
    # capture ends in a carriage return.  Strip any trailing CR/LF here so
    # it cannot leak into value or buildname.
    $contents =~ s/[\r\n]+$//;

    my $parsed = { directive => $name };
    $parsed->{value} = $contents if length $contents;

    # do a little additional parsing for sequence-region and genome-build directives
    if ( $name eq 'sequence-region' ) {
        my ( $seqid, $start, $end ) = split /\s+/, $contents, 3;
        s/\D//g for $start, $end;
        @{$parsed}{qw( seq_id start end )} = ( $seqid, $start, $end );
    }
    elsif ( $name eq 'genome-build' ) {
        my ( $source, $buildname ) = split /\s+/, $contents, 2;
        @{$parsed}{qw( source buildname )} = ( $source, $buildname );
    }

    return $parsed;
}
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# gff3_format_feature( \%feature )
#
# Turn a feature hashref (keys as in @gff3_field_names) into one line of
# GFF3 text: nine tab-separated columns followed by "\n".
#
#   * The first eight columns are passed through gff3_escape(); an
#     undefined column is written as '.'.
#   * attributes may be undef (written as '.'), an unblessed hashref
#     (rendered with gff3_format_attributes()), or anything else, which is
#     interpolated into the line as-is.
sub gff3_format_feature {
    my ( $f ) = @_;

    my $attr_string = $f->{attributes};
    if ( !defined $attr_string ) {
        $attr_string = '.';
    }
    elsif ( ref( $attr_string ) eq 'HASH'
        && !Scalar::Util::blessed( $attr_string ) )
    {
        $attr_string = gff3_format_attributes( $attr_string );
    }

    my @columns;
    for my $value ( @{$f}{ @gff3_field_names[ 0 .. 7 ] } ) {
        push @columns, defined $value ? gff3_escape( $value ) : '.';
    }

    return join( "\t", @columns, $attr_string ) . "\n";
}
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# Attribute names that are always written first, mapped to their rank
# (lower rank sorts earlier).
my %force_attr_first;
@force_attr_first{qw( ID Name Alias Parent )} = ( 1, 2, 3, 4 );

# Sort comparator for attribute names (reads sort's $a and $b).
# Names listed in %force_attr_first come before all others, ordered by
# rank; the remaining names are compared as plain strings.
sub _cmp_attr_names {
    no warnings 'uninitialized';

    my $rank_a = $force_attr_first{$a};
    my $rank_b = $force_attr_first{$b};

    if ( $rank_a ) {
        # $a is a priority name: it wins outright unless $b is one too.
        return $rank_b ? $rank_a <=> $rank_b : -1;
    }
    return 1 if $rank_b;

    return $a cmp $b;
}
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# gff3_format_attributes( \%attributes )
#
# Render an attributes hashref as the text of a GFF3 attributes column.
#
#   * undef yields '.'.
#   * Keys are ordered with _cmp_attr_names.
#   * Each value may be a single scalar or an arrayref of scalars; every
#     value is passed through gff3_escape() and multiple values are joined
#     with ','.  Keys are written as given.
#   * An attribute whose joined value text is empty is left out.
#   * If nothing is left to write, the result is '.'.
sub gff3_format_attributes {
    my ( $attr ) = @_;

    return '.' unless defined $attr;

    my @pairs;
    for my $key ( sort _cmp_attr_names keys %$attr ) {
        my $val = $attr->{$key};
        no warnings 'uninitialized';

        my @values  = ref $val eq 'ARRAY' ? @$val : ( $val );
        my $escaped = join( ',', map { gff3_escape( $_ ) } @values );

        next unless length $escaped;
        push @pairs, "$key=$escaped";
    }

    my $astring = join( ';', @pairs );
    return length( $astring ) ? $astring : '.';
}
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# gff3_escape( $string )
#
# Return $string percent-encoded via URI::Escape::uri_escape, restricted
# to the character set given below: newline, carriage return, tab,
# ';', '=', '%', '&', ',', and the ranges \x00-\x1f and \x7f-\xff.
sub gff3_escape {
    my ( $text ) = @_;
    return URI::Escape::uri_escape( $text, '\n\r\t;=%&,\x00-\x1f\x7f-\xff' );
}
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# gff3_unescape is not a wrapper: the name is installed in this package
# as a direct alias for URI::Escape::uri_unescape.
*Bio::GFF3::LowLevel::gff3_unescape = \&URI::Escape::uri_unescape;
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
__END__ |