line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Cigar; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
16329
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
47
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
48
|
|
5
|
1
|
|
|
1
|
|
27
|
use 5.014; |
|
1
|
|
|
|
|
50
|
|
|
1
|
|
|
|
|
53
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=encoding utf-8 |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 NAME |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
Bio::Cigar - Parse CIGAR strings and translate coordinates to/from reference/query |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 SYNOPSIS |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use 5.014; |
16
|
|
|
|
|
|
|
use Bio::Cigar; |
17
|
|
|
|
|
|
|
my $cigar = Bio::Cigar->new("2M1D1M1I4M"); |
18
|
|
|
|
|
|
|
say "Query length is ", $cigar->query_length; |
19
|
|
|
|
|
|
|
say "Reference length is ", $cigar->reference_length; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my ($qpos, $op) = $cigar->rpos_to_qpos(3); |
22
|
|
|
|
|
|
|
say "Alignment operation at reference position 3 is $op"; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 DESCRIPTION |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Bio::Cigar is a small library to parse CIGAR strings ("Compact Idiosyncratic |
27
|
|
|
|
|
|
|
Gapped Alignment Report"), such as those used in the SAM file format. CIGAR |
28
|
|
|
|
|
|
|
strings are a run-length encoding which minimally describes the alignment of a |
29
|
|
|
|
|
|
|
query sequence to an (often longer) reference sequence. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
Parsing follows the L |
32
|
|
|
|
|
|
|
for the C column. |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
Parsed strings are represented by an object that provides a few utility |
35
|
|
|
|
|
|
|
methods. |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
All attributes are read-only. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 string |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
The CIGAR string for this object. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head2 reference_length |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
The length of the reference sequence segment aligned with the query sequence |
48
|
|
|
|
|
|
|
described by the CIGAR string. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 query_length |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
The length of the query sequence described by the CIGAR string. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head2 ops |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
An arrayref of C<[length, operation]> tuples describing the CIGAR string. |
57
|
|
|
|
|
|
|
Lengths are integers, L. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
our $VERSION = '1.01'; |
62
|
|
|
|
|
|
|
|
63
|
1
|
|
|
1
|
|
657311
|
use Moo; |
|
1
|
|
|
|
|
1554761
|
|
|
1
|
|
|
|
|
6
|
|
64
|
1
|
|
|
1
|
|
2018
|
use Types::Standard qw< ArrayRef Tuple Int Enum StrMatch >; |
|
1
|
|
|
|
|
77952
|
|
|
1
|
|
|
|
|
14
|
|
65
|
1
|
|
|
1
|
|
1265
|
use List::Util qw< sum >; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
122
|
|
66
|
1
|
|
|
1
|
|
7
|
use Carp qw< croak >; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
58
|
|
67
|
1
|
|
|
1
|
|
657
|
use namespace::clean; |
|
1
|
|
|
|
|
12943
|
|
|
1
|
|
|
|
|
6
|
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head3 CIGAR operations |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
The CIGAR operations are given in the following table, taken from the SAM v1 |
72
|
|
|
|
|
|
|
spec: |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Op Description |
75
|
|
|
|
|
|
|
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ |
76
|
|
|
|
|
|
|
M alignment match (can be a sequence match or mismatch) |
77
|
|
|
|
|
|
|
I insertion to the reference |
78
|
|
|
|
|
|
|
D deletion from the reference |
79
|
|
|
|
|
|
|
N skipped region from the reference |
80
|
|
|
|
|
|
|
S soft clipping (clipped sequences present in SEQ) |
81
|
|
|
|
|
|
|
H hard clipping (clipped sequences NOT present in SEQ) |
82
|
|
|
|
|
|
|
P padding (silent deletion from padded reference) |
83
|
|
|
|
|
|
|
= sequence match |
84
|
|
|
|
|
|
|
X sequence mismatch |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
• H can only be present as the first and/or last operation. |
87
|
|
|
|
|
|
|
• S may only have H operations between them and the ends of the string. |
88
|
|
|
|
|
|
|
• For mRNA-to-genome alignment, an N operation represents an intron. |
89
|
|
|
|
|
|
|
For other types of alignments, the interpretation of N is not defined. |
90
|
|
|
|
|
|
|
• Sum of the lengths of the M/I/S/=/X operations shall equal the length of SEQ. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=cut |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
our $CIGAR_REGEX = qr/^ |
95
|
|
|
|
|
|
|
(\d*H)?(\d*S)? |
96
|
|
|
|
|
|
|
(?\d*[MIDNP=X])* |
97
|
|
|
|
|
|
|
(\d*S)?(\d*H)? |
98
|
|
|
|
|
|
|
$/xa; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
has 'string', |
101
|
|
|
|
|
|
|
is => 'ro', |
102
|
|
|
|
|
|
|
isa => StrMatch[ $CIGAR_REGEX ], |
103
|
|
|
|
|
|
|
required => 1; |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
has 'query_length', |
106
|
|
|
|
|
|
|
lazy => 1, |
107
|
|
|
|
|
|
|
is => 'ro', |
108
|
|
|
|
|
|
|
isa => Int, |
109
|
|
|
|
|
|
|
default => sub { sum(map { $_ || 1 } $_[0]->string =~ /(\d*)[MIS=X]/ga) || 0 }; |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
has 'reference_length', |
112
|
|
|
|
|
|
|
lazy => 1, |
113
|
|
|
|
|
|
|
is => 'ro', |
114
|
|
|
|
|
|
|
isa => Int, |
115
|
|
|
|
|
|
|
default => sub { sum(map { $_ || 1 } $_[0]->string =~ /(\d*)[MDN=X]/ga) || 0 }; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
has 'ops', |
118
|
|
|
|
|
|
|
lazy => 1, |
119
|
|
|
|
|
|
|
is => 'ro', |
120
|
|
|
|
|
|
|
isa => ArrayRef[ Tuple[ Int, Enum[split '', 'MIDNSHP=X'] ] ], |
121
|
|
|
|
|
|
|
builder => '_parse'; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub BUILDARGS { |
124
|
13
|
|
|
13
|
0
|
13734
|
my $class = shift; |
125
|
13
|
100
|
66
|
|
|
88
|
if (@_ == 1 and not ref $_[0]) { |
126
|
12
|
|
|
|
|
353
|
return { string => $_[0] }; |
127
|
|
|
|
|
|
|
} else { |
128
|
1
|
|
|
|
|
272
|
croak sprintf "%s->new must be called with a string as the sole argument", __PACKAGE__; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub _parse { |
133
|
3
|
|
|
3
|
|
2380
|
my $self = shift; |
134
|
3
|
|
|
|
|
14
|
my $cigar = $self->string; |
135
|
3
|
|
|
|
|
9
|
my @ops; |
136
|
3
|
|
|
|
|
39
|
for my $op (grep defined, $cigar =~ /(\d*[MIDNSHP=X])/g) { |
137
|
12
|
|
|
|
|
41
|
my ($len, $type) = $op =~ /(\d*)(\D*)/a; |
138
|
12
|
|
100
|
|
|
64
|
push @ops, [ $len || 1, uc $type ]; |
139
|
|
|
|
|
|
|
} |
140
|
3
|
|
|
|
|
84
|
return \@ops; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head1 CONSTRUCTOR |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=head2 new |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
Takes a CIGAR string as the sole argument and returns a new Bio::Cigar object. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=head1 METHODS |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head2 rpos_to_qpos |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Takes a reference position (origin 1, base-numbered) and returns the |
154
|
|
|
|
|
|
|
corresponding position (origin 1, base-numbered) on the query sequence. Indels |
155
|
|
|
|
|
|
|
affect how the numbering maps from reference to query. |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
In list context returns a tuple of C<[query position, operation at position]>. |
158
|
|
|
|
|
|
|
Operation is a single-character string. See the |
159
|
|
|
|
|
|
|
L.
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
If the reference position does not map to the query sequence (as with a |
162
|
|
|
|
|
|
|
deletion, for example), returns C or C<[undef, operation]>. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head2 qpos_to_rpos |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Takes a query position (origin 1, base-numbered) and returns the corresponding |
167
|
|
|
|
|
|
|
position (origin 1, base-numbered) on the reference sequence. Indels affect |
168
|
|
|
|
|
|
|
how the numbering maps from query to reference. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
In list context returns a tuple of C<[references position, operation at position]>. |
171
|
|
|
|
|
|
|
Operation is a single-character string. See the |
172
|
|
|
|
|
|
|
L.
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
If the query position does not map to the reference sequence (as with an |
175
|
|
|
|
|
|
|
insertion, for example), returns C or C<[undef, operation]>. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head2 op_at_rpos |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Takes a reference position and returns the operation at that position. Simply |
180
|
|
|
|
|
|
|
a shortcut for calling L in list context and discarding the |
181
|
|
|
|
|
|
|
first return value. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head2 op_at_qpos |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Takes a query position and returns the operation at that position. Simply |
186
|
|
|
|
|
|
|
a shortcut for calling L in list context and discarding the |
187
|
|
|
|
|
|
|
first return value. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=cut |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# Consumption table based on |
192
|
|
|
|
|
|
|
# https://github.com/samtools/htslib/blob/develop/htslib/sam.h#L81-L100 |
193
|
|
|
|
|
|
|
my %op_consumes = ( |
194
|
|
|
|
|
|
|
# op => [query, reference] |
195
|
|
|
|
|
|
|
'M' => [1, 1], |
196
|
|
|
|
|
|
|
'I' => [1, 0], |
197
|
|
|
|
|
|
|
'D' => [0, 1], |
198
|
|
|
|
|
|
|
'N' => [0, 1], |
199
|
|
|
|
|
|
|
'S' => [1, 0], |
200
|
|
|
|
|
|
|
'H' => [0, 0], |
201
|
|
|
|
|
|
|
'P' => [0, 0], |
202
|
|
|
|
|
|
|
'=' => [1, 1], |
203
|
|
|
|
|
|
|
'X' => [1, 1], |
204
|
|
|
|
|
|
|
); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub rpos_to_qpos { |
207
|
38
|
|
|
38
|
1
|
8179
|
my $self = shift; |
208
|
38
|
|
|
|
|
78
|
return $self->_map_position( rpos => @_ ); |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
sub qpos_to_rpos { |
212
|
39
|
|
|
39
|
1
|
7325
|
my $self = shift; |
213
|
39
|
|
|
|
|
82
|
return $self->_map_position( qpos => @_ ); |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub _map_position { |
217
|
77
|
|
|
77
|
|
76
|
my $self = shift; |
218
|
77
|
|
|
|
|
73
|
my $from = shift; |
219
|
77
|
|
|
|
|
72
|
my $target = shift; |
220
|
77
|
100
|
|
|
|
161
|
my $target_len = $from eq "rpos" ? "reference_length" : "query_length"; |
221
|
|
|
|
|
|
|
|
222
|
77
|
|
|
|
|
68
|
my $rpos = 0; |
223
|
77
|
|
|
|
|
56
|
my $qpos = 0; |
224
|
|
|
|
|
|
|
|
225
|
77
|
100
|
100
|
|
|
2309
|
croak sprintf "$from = %d is < 1 or > $target_len (%d)", $target, $self->$target_len |
226
|
|
|
|
|
|
|
if $target < 1 or $target > $self->$target_len; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# Each cigar operation consumes the reference, query, or both. We keep |
229
|
|
|
|
|
|
|
# track of both positions until the query/reference position hits the |
230
|
|
|
|
|
|
|
# target. Then we return the equivalent other position. |
231
|
|
|
|
|
|
|
|
232
|
72
|
|
|
|
|
663
|
for my $op (@{ $self->ops }) { |
|
72
|
|
|
|
|
1419
|
|
233
|
222
|
|
|
|
|
654
|
my ($len, $type) = @$op; |
234
|
222
|
|
|
|
|
251
|
my $consumes = $op_consumes{$type}; |
235
|
222
|
50
|
66
|
|
|
455
|
next unless $consumes->[0] or $consumes->[1]; |
236
|
|
|
|
|
|
|
|
237
|
222
|
100
|
|
|
|
407
|
$qpos += $len if $consumes->[0]; |
238
|
222
|
100
|
|
|
|
339
|
$rpos += $len if $consumes->[1]; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# The logic could be written with more variables to reduce the |
241
|
|
|
|
|
|
|
# duplication in this if/else, but I think the clarity of the logic is |
242
|
|
|
|
|
|
|
# enhanced by the similarity of the if/else branches. Adding more |
243
|
|
|
|
|
|
|
# varibles to make the logic generic muddies the simplicity. |
244
|
222
|
100
|
|
|
|
272
|
if ($from eq "rpos") { |
245
|
114
|
100
|
|
|
|
156
|
if ($rpos > $target) { |
246
|
21
|
|
|
|
|
24
|
my $overshoot = $rpos - $target; |
247
|
21
|
|
|
|
|
17
|
$rpos -= $overshoot; |
248
|
21
|
|
|
|
|
25
|
$qpos -= $overshoot; |
249
|
|
|
|
|
|
|
} |
250
|
114
|
100
|
|
|
|
239
|
if ($rpos == $target) { |
251
|
36
|
100
|
66
|
|
|
114
|
if ($consumes->[1] and not $consumes->[0]) { |
252
|
|
|
|
|
|
|
# Reference positions which are missing in the query sequence |
253
|
|
|
|
|
|
|
# don't have a corresponding query position. |
254
|
3
|
100
|
|
|
|
18
|
return wantarray ? (undef, $type) : undef; |
255
|
|
|
|
|
|
|
} else { |
256
|
33
|
50
|
|
|
|
727
|
$qpos <= $self->query_length |
257
|
|
|
|
|
|
|
or croak sprintf "Bug! Assertion failed: qpos <= qlen" |
258
|
|
|
|
|
|
|
. " (target = %d, rpos = %d, qpos = %d, qlen = %d, cigar = %s)", |
259
|
|
|
|
|
|
|
$target, $rpos, $qpos, $self->query_length, $self->string; |
260
|
33
|
100
|
|
|
|
407
|
return wantarray ? ($qpos, $type) : $qpos; |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
} else { |
264
|
108
|
100
|
|
|
|
159
|
if ($qpos > $target) { |
265
|
18
|
|
|
|
|
25
|
my $overshoot = $qpos - $target; |
266
|
18
|
|
|
|
|
17
|
$rpos -= $overshoot; |
267
|
18
|
|
|
|
|
20
|
$qpos -= $overshoot; |
268
|
|
|
|
|
|
|
} |
269
|
108
|
100
|
|
|
|
201
|
if ($qpos == $target) { |
270
|
36
|
100
|
66
|
|
|
145
|
if ($consumes->[0] and not $consumes->[1]) { |
271
|
|
|
|
|
|
|
# Query positions which are insertions in the reference sequence |
272
|
|
|
|
|
|
|
# don't have a corresponding reference position. |
273
|
12
|
100
|
|
|
|
89
|
return wantarray ? (undef, $type) : undef; |
274
|
|
|
|
|
|
|
} else { |
275
|
24
|
50
|
|
|
|
657
|
$rpos <= $self->reference_length |
276
|
|
|
|
|
|
|
or croak sprintf "Bug! Assertion failed: rpos <= rlen" |
277
|
|
|
|
|
|
|
. " (target = %d, qpos = %d, rpos = %d, rlen = %d, cigar = %s)", |
278
|
|
|
|
|
|
|
$target, $qpos, $rpos, $self->reference_length, $self->string; |
279
|
24
|
100
|
|
|
|
355
|
return wantarray ? ($rpos, $type) : $rpos; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
0
|
|
|
|
|
0
|
croak sprintf "Bug! Ran out of ops but couldn't map %s %d" |
285
|
|
|
|
|
|
|
. " (rpos = %d, qpos = %d, cigar = %s)", |
286
|
|
|
|
|
|
|
$from, $target, $rpos, $qpos, $self->string; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
sub op_at_rpos { |
290
|
12
|
|
|
12
|
1
|
15
|
my $self = shift; |
291
|
12
|
|
|
|
|
26
|
my ($qpos, $type) = $self->rpos_to_qpos(@_); |
292
|
12
|
|
|
|
|
60
|
return $type; |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
sub op_at_qpos { |
296
|
12
|
|
|
12
|
1
|
19
|
my $self = shift; |
297
|
12
|
|
|
|
|
27
|
my ($rpos, $type) = $self->qpos_to_rpos(@_); |
298
|
12
|
|
|
|
|
91
|
return $type; |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=head1 AUTHOR |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Thomas Sibley Etrsibley@uw.eduE |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=head1 COPYRIGHT |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
Copyright 2014- Mullins Lab, Department of Microbiology, University of Washington. |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=head1 LICENSE |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it under |
312
|
|
|
|
|
|
|
the GNU General Public License, version 2. |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=head1 SEE ALSO |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
L |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
1; |
| |