| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $ |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# BioPerl module for Bio::SearchIO::gmap_f9 |
|
4
|
|
|
|
|
|
|
# |
|
5
|
|
|
|
|
|
|
# Cared for by George Hartzell |
|
6
|
|
|
|
|
|
|
# |
|
7
|
|
|
|
|
|
|
# Copyright George Hartzell |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format) |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Do not use this object directly - it is used as part of the |
|
20
|
|
|
|
|
|
|
# Bio::SearchIO system. |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Bio::SearchIO; |
|
23
|
|
|
|
|
|
|
my $searchio = Bio::SearchIO->new(-format => 'gmap_f9', |
|
24
|
|
|
|
|
|
|
-file => 't/data/her2.gmapz'); |
|
25
|
|
|
|
|
|
|
while( my $result = $searchio->next_result ) { |
|
26
|
|
|
|
|
|
|
while( my $hit = $result->next_hit ) { |
|
27
|
|
|
|
|
|
|
while( my $hsp = $hit->next_hsp ) { |
|
28
|
|
|
|
|
|
|
# ... |
|
29
|
|
|
|
|
|
|
} |
|
30
|
|
|
|
|
|
|
} |
|
31
|
|
|
|
|
|
|
} |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
This object encapsulates the necessary methods for generating events |
|
37
|
|
|
|
|
|
|
suitable for building Bio::Search objects from a GMAP "compressed" |
|
38
|
|
|
|
|
|
|
report (from gmap run with -Z flag). Read L<Bio::SearchIO> for more |
|
39
|
|
|
|
|
|
|
information about how to use this. |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 REVERSE STRAND AND BIOPERL COORDINATES |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
I believe that I'm doing the correct thing when reporting hits on the |
|
44
|
|
|
|
|
|
|
negative strand of the genome. In particular, I've compared the |
|
45
|
|
|
|
|
|
|
"exons" this code generates with the set returned by ncbi's megablast |
|
46
|
|
|
|
|
|
|
web service. NCBI's hsp's are ordered differently and have a |
|
47
|
|
|
|
|
|
|
different genomic location (off by ~18,000,000 bases, padding?) but |
|
48
|
|
|
|
|
|
|
the starts, ends, and lengths were similar and my strand handling |
|
49
|
|
|
|
|
|
|
matches theirs. E.g. |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
CDNA GENOME |
|
52
|
|
|
|
|
|
|
start end strand start end strand |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
blast |
|
55
|
|
|
|
|
|
|
1913 2989 1 86236731 86237808 -1 |
|
56
|
|
|
|
|
|
|
1 475 1 86260509 86260983 -1 |
|
57
|
|
|
|
|
|
|
1510 1727 1 86240259 86240476 -1 |
|
58
|
|
|
|
|
|
|
841 989 1 86243034 86243182 -1 |
|
59
|
|
|
|
|
|
|
1381 1514 1 86240630 86240763 -1 |
|
60
|
|
|
|
|
|
|
989 1122 1 86242457 86242590 -1 |
|
61
|
|
|
|
|
|
|
599 729 1 86247470 86247600 -1 |
|
62
|
|
|
|
|
|
|
473 608 1 86259972 86260107 -1 |
|
63
|
|
|
|
|
|
|
1255 1382 1 86240837 86240964 -1 |
|
64
|
|
|
|
|
|
|
730 842 1 86244040 86244152 -1 |
|
65
|
|
|
|
|
|
|
1813 1921 1 86238123 86238231 -1 |
|
66
|
|
|
|
|
|
|
1725 1814 1 86239747 86239836 -1 |
|
67
|
|
|
|
|
|
|
1167 1256 1 86241294 86241383 -1 |
|
68
|
|
|
|
|
|
|
1120 1188 1 86242319 86242387 -1 |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
gmap |
|
71
|
|
|
|
|
|
|
1 475 1 104330509 104330983 -1 |
|
72
|
|
|
|
|
|
|
476 600 1 104329980 104330104 -1 |
|
73
|
|
|
|
|
|
|
601 729 1 104317470 104317598 -1 |
|
74
|
|
|
|
|
|
|
730 841 1 104314041 104314152 -1 |
|
75
|
|
|
|
|
|
|
842 989 1 104313034 104313181 -1 |
|
76
|
|
|
|
|
|
|
990 1121 1 104312458 104312589 -1 |
|
77
|
|
|
|
|
|
|
1122 1187 1 104312320 104312385 -1 |
|
78
|
|
|
|
|
|
|
1188 1256 1 104311294 104311362 -1 |
|
79
|
|
|
|
|
|
|
1257 1382 1 104310837 104310962 -1 |
|
80
|
|
|
|
|
|
|
1383 1511 1 104310633 104310761 -1 |
|
81
|
|
|
|
|
|
|
1512 1726 1 104310260 104310474 -1 |
|
82
|
|
|
|
|
|
|
1727 1814 1 104309747 104309834 -1 |
|
83
|
|
|
|
|
|
|
1815 1917 1 104308127 104308229 -1 |
|
84
|
|
|
|
|
|
|
1918 2989 1 104306731 104307802 -1 |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
91
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
|
92
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
95
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
100
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via |
|
101
|
|
|
|
|
|
|
the web: |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 AUTHOR - George Hartzell |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Email hartzell@alerce.com |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Additional contributors names and emails here |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 APPENDIX |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
|
116
|
|
|
|
|
|
|
Internal methods are usually preceded with an underscore (_). |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=cut |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Let the code begin... |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
package Bio::SearchIO::gmap_f9; |
|
125
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
25
|
|
|
126
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
29
|
|
|
127
|
|
|
|
|
|
|
|
|
128
|
1
|
|
|
1
|
|
361
|
use Bio::Search::Hit::GenericHit; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
129
|
1
|
|
|
1
|
|
439
|
use Bio::Search::HSP::GenericHSP; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
59
|
|
|
130
|
|
|
|
|
|
|
|
|
131
|
1
|
|
|
1
|
|
11
|
use base qw(Bio::SearchIO ); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
113
|
|
|
132
|
|
|
|
|
|
|
|
|
133
|
1
|
|
|
1
|
|
6
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
1105
|
|
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=head2 next_result |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Title : next_result |
|
138
|
|
|
|
|
|
|
Usage : $result = stream->next_result |
|
139
|
|
|
|
|
|
|
Function: Reads the next ResultI object from the stream and returns it. |
|
140
|
|
|
|
|
|
|
Returns : A Bio::Search::Result::ResultI object |
|
141
|
|
|
|
|
|
|
Args : n/a |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=cut |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub next_result {
    my $self = shift;

    my $result;              # result under construction; undef until a '>' line is read
    my $hit;                 # the single hit that collects every HSP of this result
    my @hsp_info;            # column records of the HSP currently being accumulated
    my $previous_hit_pos;    # hit_pos of the previous column, used to spot jumps

    # Each line is read into a lexical rather than the global $_, so that a
    # caller iterating with an aliased $_ (for/map/grep) is not clobbered.
  LINE:
    while ( defined( my $line = $self->_readline ) ) {

        if ( $line =~ /^>/ ) {    # looking at the start of a result
            if ($result) {

                # A result is already in progress, so this header belongs
                # to the next one: hand it back and finish up.
                $self->_pushback($line);
                last LINE;
            }

            # Otherwise start a new result.  The trailing md5 group is kept
            # from the original pattern but its capture is not used here.
            my ( $id, $desc ) = $line =~ m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|;

            # NOTE(review): Bio::Search::Result::GenericResult is not loaded
            # by this file; presumably Bio::SearchIO pulls it in -- confirm.
            $result = Bio::Search::Result::GenericResult->new();
            $result->algorithm('gmap');
            $result->query_name($id);
            $result->query_accession($id);
            $result->query_description($desc);

            #$self->warn("Take care of MD5!\n");

            $hit ||= Bio::Search::Hit::GenericHit->new(
                -name => "NONE_SPECIFIED" );
            next LINE;
        }

        # Anything else adds another position (column) to the hit/hsp, e.g.
        # 468 H 1956 C -14:104307764 2298317517 C H
        # 468   1957 A -14:104307763 2298317516 A
        my %col;    # info about a column
        @col{
            qw(query_aa_pos query_aa query_pos query_base
              hit_strand hit_chromo hit_pos
              hit_concat_pos hit_base hit_aa)
          }
          = $line =~ m|
                     (\d+)[ ]?(.)?[\t]
                     (\d+)[ ]?(.)?[\t]
                     # TODO chromosome isn't a number... X, Y, MT....
                     (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.)
                     [\t]?(.)?
                    |xo;

        # A jump of more than one base on the hit closes the HSP that has
        # been accumulated so far and starts a new one.
        if ( $previous_hit_pos
            && abs( $col{hit_pos} - $previous_hit_pos ) > 1 )
        {
            $hit ||= Bio::Search::Hit::GenericHit->new(
                -name => "NONE_SPECIFIED" );
            $hit->add_hsp( $self->_hsp_from_info( \@hsp_info ) );
            @hsp_info = ();
        }

        push @hsp_info, \%col;
        $previous_hit_pos = $col{hit_pos};
    }

    if ($result) {

        # Flush the HSP that was still open when the input (or this result)
        # ended.
        $hit->add_hsp( $self->_hsp_from_info( \@hsp_info ) ) if @hsp_info;

        # The hit and query lengths are the sums over all HSPs.
        my ( $hit_length, $query_length );
        for my $hsp ( $hit->hsps ) {
            $hit_length   += $hsp->length();
            $query_length += $hsp->length('query');
        }
        $hit->length($hit_length);
        $hit->query_length($query_length);

        # Update the name now that we actually know something useful.  Only
        # done when columns were read, so an empty result does not
        # autovivify a bogus element in @hsp_info.
        $hit->name( $hsp_info[0]->{hit_chromo} ) if @hsp_info;

        $result->add_hit($hit) if ($hit);
    }

    return ($result);
}
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# Build a Bio::Search::HSP::GenericHSP from a run of alignment columns.
#
# Args   : $info - array ref of column hashes as collected by next_result;
#                  the keys read here are query_base, hit_base, query_pos
#                  and hit_pos.  Expected to be non-empty (both callers in
#                  this file only pass a populated list).
# Returns: the new Bio::Search::HSP::GenericHSP object.
sub _hsp_from_info {
    my $self = shift;
    my $info = shift;

    # Constructor arguments for the HSP.  (Deliberately not named $a, which
    # is one of sort's special variables.)
    my %args = ( -algorithm => 'GMAP' );

    # Start at 0 so that an HSP without a single matching column reports
    # zero identities instead of passing undef to the constructor.
    my $identical = 0;

    for my $c ( @{$info} ) {
        my $is_match = $c->{query_base} eq $c->{hit_base};

        $args{-query_seq}    .= $c->{query_base};
        $args{-hit_seq}      .= $c->{hit_base};
        $args{-homology_seq} .= $is_match ? $c->{hit_base} : ' ';
        $identical++ if $is_match;
    }

    # Switch to bioperl gaps: a blank base becomes '-'.
    $args{-query_seq} =~ tr/ /-/;
    $args{-hit_seq}   =~ tr/ /-/;

    $args{-conserved} = $args{-identical} = $identical;

    # use the coordinates from gmap's -f 9 output to
    # determine whether gmap revcomped the query sequence
    # to generate the alignment.  Note that this is not
    # the same as the cDNA's sense/anti-sense-ness.
    $args{-stranded} = 'both';

    $args{-query_start}  = $info->[0]->{query_pos};
    $args{-query_end}    = $info->[-1]->{query_pos};
    $args{-query_length} = $args{-query_end} - $args{-query_start} + 1;

    # hit can be either strand, -f 9 output tells us which.
    # we don't have to worry about it here, but telling the generichsp code
    # that this hit is 'stranded', it compares the start and end positions
    # sets it for us.
    $args{-hit_start} = $info->[0]->{hit_pos};
    $args{-hit_end}   = $info->[-1]->{hit_pos};

    # abs() because start > end when the hit lies on the reverse strand.
    $args{-hit_length} = abs( $args{-hit_end} - $args{-hit_start} ) + 1;

    # The HSP length is the longer of the query and hit spans.
    $args{-hsp_length} =
        $args{-query_length} > $args{-hit_length}
      ? $args{-query_length}
      : $args{-hit_length};

    return Bio::Search::HSP::GenericHSP->new(%args);
}
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# TODO: adjust regexp to swallow lines w/out md5 sigs. |
|
277
|
|
|
|
|
|
|
# Parse a gmap path header line ('>query db n/m ...') into a hash ref.
#
# Args   : $path_line - the header line to parse.  When omitted, the
#                       current value of $_ is parsed instead, which is
#                       what this routine always did before.
# Returns: hash ref with the keys query, db, path_num, path_total_num,
#          query_length, exon_count, trimmed_coverage, percent_identity,
#          query_start, query_end, whole_genome_start, whole_genome_end,
#          chromosome, chromo_start, chromo_end, strand, sense and md5.
# Throws : via $self->throw when any field other than md5 is missing or
#          false (which includes the case where the line does not match).
sub _parse_path_header {
    my $self      = shift;
    my $path_line = shift;

    # Parse the argument that was passed in; fall back to $_ only for
    # callers that relied on the old behaviour of ignoring the argument.
    $path_line = $_ unless defined $path_line;

    # Field names, in the order of the capture groups below.
    my @fields = qw(
      query db path_num path_total_num query_length exon_count
      trimmed_coverage percent_identity query_start query_end
      whole_genome_start whole_genome_end
      chromosome chromo_start chromo_end
      strand sense md5
    );

    my $path = {};
    @{$path}{@fields} = $path_line =~ m{
        >
        ([^ ]*)[ ]             # the query id, followed by a space
        ([^ ]*)[ ]             # the genome database, followed by a space
        (\d+)/(\d+)[ ]         # path_num/path_total_num (e.g. 3/12)
        (\d+)[ ]               # query length, followed by a space
        (\d+)[ ]               # hsp/exon count, followed by a space
        (\d+\.\d*)[ ]          # trimmed coverage
        (\d+\.\d*)[ ]          # percent identity
        (\d+)\.\.(\d+)[ ]      # query start .. query end, followed by space
        (\d+)\.\.(\d+)[ ]      # whole genome s..e, followed by space
        ([\dxXyY]+|MT):        # chromosome, same names as in next_result
        (\d+)\.\.(\d+)[ ]      # chromo s..e, followed by a space
        ([+-])[ ]              # strand, followed by a space
        dir:(.*)               # dir:sense or dir:antisense
        [ ]md5:([\da-f]+)      # md5 signature (hex digits)
    }x;

    # Every field except md5 is mandatory.
    for my $field ( grep { $_ ne 'md5' } @fields ) {
        $path->{$field}
          or $self->throw("$field was not found in path line.");
    }

    return $path;
}
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
# Parse one tab-led alignment (exon) line into a hash ref.
#
# Args   : $a_line - the alignment line to parse.  When omitted, the
#                    current value of $_ is parsed instead, which is what
#                    this routine always did before.
# Returns: hash ref with the keys chromo_start, chromo_end, query_start,
#          query_end, percent_identity, align_length and intron_length
#          (intron_length is undef when the line carries no intron field).
# Throws : via $self->throw when any field other than intron_length is
#          missing or false.
sub _parse_alignment_line {
    my $self   = shift;
    my $a_line = shift;

    # Parse the argument that was passed in; fall back to $_ only for
    # callers that relied on the old behaviour of ignoring the argument.
    $a_line = $_ unless defined $a_line;

    my $align = {};
    @{$align}{
        qw(chromo_start chromo_end query_start query_end
          percent_identity align_length intron_length)
      }
      = $a_line =~ m{
        [\t]
        (\d+)[ ]         # start in chromosome coord.
        (\d+)[ ]         # end in chromosome coord.
        (\d+)[ ]         # start in query coord.
        (\d+)[ ]         # end in query coord.
        (\d+)            # percent identity (as integer)
        [\t].*?[\t]      # skip the edit script; non-greedy, so the two
                         # numeric fields after it are left to be captured
        (\d+)            # length of alignment block.
        [\t]*(\d+)?      # length of following intron.
    }x;

    $align->{chromo_start}
      or $self->throw("chromo_start missing in alignment line.");
    $align->{chromo_end}
      or $self->throw("chromo_end was missing in alignment line.");
    $align->{query_start}
      or $self->throw("query_start was missing in alignment line.");
    $align->{query_end}
      or $self->throw("query_end was missing in alignment line.");
    $align->{percent_identity}
      or $self->throw("percent_identity was missing in alignment line.");
    $align->{align_length}
      or $self->throw("align_length was missing in alignment line.");

    return $align;
}
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
1; |