File Coverage

blib/lib/Bio/Cigar.pm
Criterion Covered Total %
statement 78 79 98.7
branch 35 38 92.1
condition 13 17 76.4
subroutine 15 15 100.0
pod 4 5 80.0
total 145 154 94.1


line stmt bran cond sub pod time code
1             package Bio::Cigar;
2              
3 1     1   16329 use strict;
  1         2  
  1         47  
4 1     1   6 use warnings;
  1         2  
  1         48  
5 1     1   27 use 5.014;
  1         50  
  1         53  
6              
7             =encoding utf-8
8              
9             =head1 NAME
10              
11             Bio::Cigar - Parse CIGAR strings and translate coordinates to/from reference/query
12              
13             =head1 SYNOPSIS
14              
15             use 5.014;
16             use Bio::Cigar;
17             my $cigar = Bio::Cigar->new("2M1D1M1I4M");
18             say "Query length is ", $cigar->query_length;
19             say "Reference length is ", $cigar->reference_length;
20            
21             my ($qpos, $op) = $cigar->rpos_to_qpos(3);
22             say "Alignment operation at reference position 3 is $op";
23              
24             =head1 DESCRIPTION
25              
26             Bio::Cigar is a small library to parse CIGAR strings ("Compact Idiosyncratic
27             Gapped Alignment Report"), such as those used in the SAM file format. CIGAR
28             strings are a run-length encoding which minimally describes the alignment of a
29             query sequence to an (often longer) reference sequence.
30              
31             Parsing follows the L
32             for the C column.
33              
34             Parsed strings are represented by an object that provides a few utility
35             methods.
36              
37             =head1 ATTRIBUTES
38              
39             All attributes are read-only.
40              
41             =head2 string
42              
43             The CIGAR string for this object.
44              
45             =head2 reference_length
46              
47             The length of the reference sequence segment aligned with the query sequence
48             described by the CIGAR string.
49              
50             =head2 query_length
51              
52             The length of the query sequence described by the CIGAR string.
53              
54             =head2 ops
55              
56             An arrayref of C<[length, operation]> tuples describing the CIGAR string.
57             Lengths are integers, L.
58              
59             =cut
60              
61             our $VERSION = '1.01';
62              
63 1     1   657311 use Moo;
  1         1554761  
  1         6  
64 1     1   2018 use Types::Standard qw< ArrayRef Tuple Int Enum StrMatch >;
  1         77952  
  1         14  
65 1     1   1265 use List::Util qw< sum >;
  1         1  
  1         122  
66 1     1   7 use Carp qw< croak >;
  1         1  
  1         58  
67 1     1   657 use namespace::clean;
  1         12943  
  1         6  
68              
69             =head3 CIGAR operations
70              
71             The CIGAR operations are given in the following table, taken from the SAM v1
72             spec:
73            
74             Op Description
75             ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
76             M alignment match (can be a sequence match or mismatch)
77             I insertion to the reference
78             D deletion from the reference
79             N skipped region from the reference
80             S soft clipping (clipped sequences present in SEQ)
81             H hard clipping (clipped sequences NOT present in SEQ)
82             P padding (silent deletion from padded reference)
83             = sequence match
84             X sequence mismatch
85              
86             • H can only be present as the first and/or last operation.
87             • S may only have H operations between them and the ends of the string.
88             • For mRNA-to-genome alignment, an N operation represents an intron.
89             For other types of alignments, the interpretation of N is not defined.
90             • Sum of the lengths of the M/I/S/=/X operations shall equal the length of SEQ.
91              
92             =cut
93              
94             our $CIGAR_REGEX = qr/^
95             (\d*H)?(\d*S)?
96             (?\d*[MIDNP=X])*
97             (\d*S)?(\d*H)?
98             $/xa;
99              
100             has 'string',
101             is => 'ro',
102             isa => StrMatch[ $CIGAR_REGEX ],
103             required => 1;
104              
105             has 'query_length',
106             lazy => 1,
107             is => 'ro',
108             isa => Int,
109             default => sub { sum(map { $_ || 1 } $_[0]->string =~ /(\d*)[MIS=X]/ga) || 0 };
110              
111             has 'reference_length',
112             lazy => 1,
113             is => 'ro',
114             isa => Int,
115             default => sub { sum(map { $_ || 1 } $_[0]->string =~ /(\d*)[MDN=X]/ga) || 0 };
116              
117             has 'ops',
118             lazy => 1,
119             is => 'ro',
120             isa => ArrayRef[ Tuple[ Int, Enum[split '', 'MIDNSHP=X'] ] ],
121             builder => '_parse';
122              
123             sub BUILDARGS {
124 13     13 0 13734 my $class = shift;
125 13 100 66     88 if (@_ == 1 and not ref $_[0]) {
126 12         353 return { string => $_[0] };
127             } else {
128 1         272 croak sprintf "%s->new must be called with a string as the sole argument", __PACKAGE__;
129             }
130             }
131              
132             sub _parse {
133 3     3   2380 my $self = shift;
134 3         14 my $cigar = $self->string;
135 3         9 my @ops;
136 3         39 for my $op (grep defined, $cigar =~ /(\d*[MIDNSHP=X])/g) {
137 12         41 my ($len, $type) = $op =~ /(\d*)(\D*)/a;
138 12   100     64 push @ops, [ $len || 1, uc $type ];
139             }
140 3         84 return \@ops;
141             }
142              
143             =head1 CONSTRUCTOR
144              
145             =head2 new
146              
147             Takes a CIGAR string as the sole argument and returns a new Bio::Cigar object.
148              
149             =head1 METHODS
150              
151             =head2 rpos_to_qpos
152              
153             Takes a reference position (origin 1, base-numbered) and returns the
154             corresponding position (origin 1, base-numbered) on the query sequence. Indels
155             affect how the numbering maps from reference to query.
156              
157             In list context returns a tuple of C<[query position, operation at position]>.
158             Operation is a single-character string. See the
159             L.
160              
161             If the reference position does not map to the query sequence (as with a
162             deletion, for example), returns C or C<[undef, operation]>.
163              
164             =head2 qpos_to_rpos
165              
166             Takes a query position (origin 1, base-numbered) and returns the corresponding
167             position (origin 1, base-numbered) on the reference sequence. Indels affect
168             how the numbering maps from query to reference.
169              
170             In list context returns a tuple of C<[references position, operation at position]>.
171             Operation is a single-character string. See the
172             L.
173              
174             If the query position does not map to the reference sequence (as with an
175             insertion, for example), returns C or C<[undef, operation]>.
176              
177             =head2 op_at_rpos
178              
179             Takes a reference position and returns the operation at that position. Simply
180             a shortcut for calling L in list context and discarding the
181             first return value.
182              
183             =head2 op_at_qpos
184              
185             Takes a query position and returns the operation at that position. Simply
186             a shortcut for calling L in list context and discarding the
187             first return value.
188              
189             =cut
190              
191             # Consumption table based on
192             # https://github.com/samtools/htslib/blob/develop/htslib/sam.h#L81-L100
193             my %op_consumes = (
194             # op => [query, reference]
195             'M' => [1, 1],
196             'I' => [1, 0],
197             'D' => [0, 1],
198             'N' => [0, 1],
199             'S' => [1, 0],
200             'H' => [0, 0],
201             'P' => [0, 0],
202             '=' => [1, 1],
203             'X' => [1, 1],
204             );
205              
206             sub rpos_to_qpos {
207 38     38 1 8179 my $self = shift;
208 38         78 return $self->_map_position( rpos => @_ );
209             }
210              
211             sub qpos_to_rpos {
212 39     39 1 7325 my $self = shift;
213 39         82 return $self->_map_position( qpos => @_ );
214             }
215              
216             sub _map_position {
217 77     77   76 my $self = shift;
218 77         73 my $from = shift;
219 77         72 my $target = shift;
220 77 100       161 my $target_len = $from eq "rpos" ? "reference_length" : "query_length";
221              
222 77         68 my $rpos = 0;
223 77         56 my $qpos = 0;
224              
225 77 100 100     2309 croak sprintf "$from = %d is < 1 or > $target_len (%d)", $target, $self->$target_len
226             if $target < 1 or $target > $self->$target_len;
227              
228             # Each cigar operation consumes the reference, query, or both. We keep
229             # track of both positions until the query/reference position hits the
230             # target. Then we return the equivalent other position.
231              
232 72         663 for my $op (@{ $self->ops }) {
  72         1419  
233 222         654 my ($len, $type) = @$op;
234 222         251 my $consumes = $op_consumes{$type};
235 222 50 66     455 next unless $consumes->[0] or $consumes->[1];
236              
237 222 100       407 $qpos += $len if $consumes->[0];
238 222 100       339 $rpos += $len if $consumes->[1];
239              
240             # The logic could be written with more variables to reduce the
241             # duplication in this if/else, but I think the clarity of the logic is
242             # enhanced by the similarity of the if/else branches. Adding more
243             # varibles to make the logic generic muddies the simplicity.
244 222 100       272 if ($from eq "rpos") {
245 114 100       156 if ($rpos > $target) {
246 21         24 my $overshoot = $rpos - $target;
247 21         17 $rpos -= $overshoot;
248 21         25 $qpos -= $overshoot;
249             }
250 114 100       239 if ($rpos == $target) {
251 36 100 66     114 if ($consumes->[1] and not $consumes->[0]) {
252             # Reference positions which are missing in the query sequence
253             # don't have a corresponding query position.
254 3 100       18 return wantarray ? (undef, $type) : undef;
255             } else {
256 33 50       727 $qpos <= $self->query_length
257             or croak sprintf "Bug! Assertion failed: qpos <= qlen"
258             . " (target = %d, rpos = %d, qpos = %d, qlen = %d, cigar = %s)",
259             $target, $rpos, $qpos, $self->query_length, $self->string;
260 33 100       407 return wantarray ? ($qpos, $type) : $qpos;
261             }
262             }
263             } else {
264 108 100       159 if ($qpos > $target) {
265 18         25 my $overshoot = $qpos - $target;
266 18         17 $rpos -= $overshoot;
267 18         20 $qpos -= $overshoot;
268             }
269 108 100       201 if ($qpos == $target) {
270 36 100 66     145 if ($consumes->[0] and not $consumes->[1]) {
271             # Query positions which are insertions in the reference sequence
272             # don't have a corresponding reference position.
273 12 100       89 return wantarray ? (undef, $type) : undef;
274             } else {
275 24 50       657 $rpos <= $self->reference_length
276             or croak sprintf "Bug! Assertion failed: rpos <= rlen"
277             . " (target = %d, qpos = %d, rpos = %d, rlen = %d, cigar = %s)",
278             $target, $qpos, $rpos, $self->reference_length, $self->string;
279 24 100       355 return wantarray ? ($rpos, $type) : $rpos;
280             }
281             }
282             }
283             }
284 0         0 croak sprintf "Bug! Ran out of ops but couldn't map %s %d"
285             . " (rpos = %d, qpos = %d, cigar = %s)",
286             $from, $target, $rpos, $qpos, $self->string;
287             }
288              
289             sub op_at_rpos {
290 12     12 1 15 my $self = shift;
291 12         26 my ($qpos, $type) = $self->rpos_to_qpos(@_);
292 12         60 return $type;
293             }
294              
295             sub op_at_qpos {
296 12     12 1 19 my $self = shift;
297 12         27 my ($rpos, $type) = $self->qpos_to_rpos(@_);
298 12         91 return $type;
299             }
300              
301             =head1 AUTHOR
302              
303             Thomas Sibley Etrsibley@uw.eduE
304              
305             =head1 COPYRIGHT
306              
307             Copyright 2014- Mullins Lab, Department of Microbiology, University of Washington.
308              
309             =head1 LICENSE
310              
311             This library is free software; you can redistribute it and/or modify it under
312             the GNU General Public License, version 2.
313              
314             =head1 SEE ALSO
315              
316             L
317              
318             =cut
319              
320             1;