line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::Read; |
2
|
|
|
|
|
|
|
# ABSTRACT: Base class to simulate reads |
3
|
|
|
|
|
|
|
|
4
|
6
|
|
|
6
|
|
4011
|
use App::Sandy::Base 'class'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
46
|
|
5
|
6
|
|
|
6
|
|
58
|
use List::Util 'first'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
8851
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.22'; # VERSION |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
has 'sequencing_error' => ( |
10
|
|
|
|
|
|
|
is => 'ro', |
11
|
|
|
|
|
|
|
isa => 'My:NumHS', |
12
|
|
|
|
|
|
|
required => 1 |
13
|
|
|
|
|
|
|
); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
has '_count_base' => ( |
16
|
|
|
|
|
|
|
is => 'rw', |
17
|
|
|
|
|
|
|
isa => 'Int', |
18
|
|
|
|
|
|
|
default => 0 |
19
|
|
|
|
|
|
|
); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
has '_base' => ( |
22
|
|
|
|
|
|
|
is => 'rw', |
23
|
|
|
|
|
|
|
isa => 'Int', |
24
|
|
|
|
|
|
|
builder => '_build_base', |
25
|
|
|
|
|
|
|
lazy_build => 1 |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
has '_not_base' => ( |
29
|
|
|
|
|
|
|
is => 'ro', |
30
|
|
|
|
|
|
|
isa => 'HashRef', |
31
|
|
|
|
|
|
|
builder => '_build_not_base', |
32
|
|
|
|
|
|
|
lazy_build => 1 |
33
|
|
|
|
|
|
|
); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub _build_not_base { |
36
|
31
|
|
|
31
|
|
873
|
my %not_base = ( |
37
|
|
|
|
|
|
|
A => ['T', 'C', 'G'], |
38
|
|
|
|
|
|
|
a => ['t', 'c', 'g'], |
39
|
|
|
|
|
|
|
T => ['A', 'C', 'G'], |
40
|
|
|
|
|
|
|
t => ['a', 'c', 'g'], |
41
|
|
|
|
|
|
|
C => ['A', 'T', 'G'], |
42
|
|
|
|
|
|
|
c => ['a', 't', 'g'], |
43
|
|
|
|
|
|
|
G => ['A', 'T', 'C'], |
44
|
|
|
|
|
|
|
g => ['a', 't', 'c'] |
45
|
|
|
|
|
|
|
); |
46
|
31
|
|
|
|
|
859
|
return \%not_base; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub _build_base { |
50
|
31
|
|
|
31
|
|
75
|
my $self = shift; |
51
|
|
|
|
|
|
|
# If sequencing_error equal to zero, set _base to zero |
52
|
31
|
|
33
|
|
|
823
|
return $self->sequencing_error && int(1 / $self->sequencing_error); |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub subseq { |
56
|
1611
|
|
|
1611
|
0
|
120643
|
my ($self, $seq_ref, $seq_len, $slice_len, $pos) = @_; |
57
|
1611
|
|
|
|
|
3739
|
my $read = substr $$seq_ref, $pos, $slice_len; |
58
|
1611
|
|
|
|
|
3644
|
return \$read; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub subseq_rand { |
62
|
242
|
|
|
242
|
0
|
130060
|
my ($self, $seq_ref, $seq_len, $slice_len) = @_; |
63
|
242
|
|
|
|
|
480
|
my $usable_len = $seq_len - $slice_len; |
64
|
242
|
|
|
|
|
704
|
my $pos = int(rand($usable_len + 1)); |
65
|
242
|
|
|
|
|
596
|
my $read = substr $$seq_ref, $pos, $slice_len; |
66
|
242
|
|
|
|
|
649
|
return (\$read, $pos); |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub subseq_rand_ptable { |
70
|
2160
|
|
|
2160
|
0
|
4478
|
my ($self, $ptable, $ptable_size, $slice_len, $sub_slice_len) = @_; |
71
|
2160
|
|
|
|
|
4002
|
my $usable_len = $ptable_size - $slice_len; |
72
|
2160
|
|
|
|
|
5675
|
my $pos = int(rand($usable_len + 1)); |
73
|
2160
|
|
|
|
|
7920
|
my $pieces = $ptable->lookup($pos, $slice_len); |
74
|
2160
|
|
|
|
|
6396
|
return $self->_build_subseq($pieces, $pos, $slice_len, $sub_slice_len); |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub _build_subseq { |
78
|
2160
|
|
|
2160
|
|
4500
|
my ($self, $pieces, $pos, $len, $sub_len) = @_; |
79
|
|
|
|
|
|
|
|
80
|
2160
|
|
|
|
|
5029
|
my $offset = $pos - $pieces->[0]{offset}; |
81
|
2160
|
|
|
|
|
3752
|
my $usable_len = $pieces->[0]{len} - $offset; |
82
|
|
|
|
|
|
|
|
83
|
2160
|
|
|
|
|
3025
|
my @annot; |
84
|
|
|
|
|
|
|
|
85
|
2160
|
100
|
|
|
|
4952
|
if (not $pieces->[0]{is_orig}) { |
86
|
28
|
50
|
|
|
|
54
|
push @annot => @{ $pieces->[0]{annot2} } if @{ $pieces->[0]{annot2} }; |
|
0
|
|
|
|
|
0
|
|
|
28
|
|
|
|
|
73
|
|
87
|
28
|
50
|
|
|
|
95
|
push @annot => $pieces->[0]{annot1} if $pieces->[0]{annot1}; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
2160
|
100
|
|
|
|
4399
|
my $slice_len = $len < $usable_len |
91
|
|
|
|
|
|
|
? $len |
92
|
|
|
|
|
|
|
: $usable_len; |
93
|
|
|
|
|
|
|
|
94
|
2160
|
|
|
|
|
2995
|
my $read = substr ${ $pieces->[0]{ref} }, $pieces->[0]{start} + $offset, $slice_len; |
|
2160
|
|
|
|
|
7302
|
|
95
|
2160
|
|
|
|
|
4653
|
my $miss_len = $len - $slice_len; |
96
|
|
|
|
|
|
|
|
97
|
2160
|
|
|
|
|
5406
|
for (my $i = 1; $i < @$pieces; $i++) { |
98
|
|
|
|
|
|
|
|
99
|
248
|
100
|
|
|
|
335
|
push @annot => @{ $pieces->[$i]{annot2} } if @{ $pieces->[$i]{annot2} }; |
|
31
|
|
|
|
|
68
|
|
|
248
|
|
|
|
|
588
|
|
100
|
248
|
100
|
|
|
|
670
|
push @annot => $pieces->[$i]{annot1} if $pieces->[$i]{annot1}; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
$slice_len = $miss_len < $pieces->[$i]{len} |
103
|
|
|
|
|
|
|
? $miss_len |
104
|
248
|
100
|
|
|
|
550
|
: $pieces->[$i]{len}; |
105
|
|
|
|
|
|
|
|
106
|
248
|
|
|
|
|
329
|
$read .= substr ${ $pieces->[$i]{ref} }, $pieces->[$i]{start}, $slice_len; |
|
248
|
|
|
|
|
667
|
|
107
|
248
|
|
|
|
|
626
|
$miss_len -= $slice_len; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# I must to make this mess in order to annotate paired-end reads :( |
111
|
2160
|
|
|
|
|
4242
|
my $start_ref = $pieces->[0]{pos} + $offset; |
112
|
2160
|
|
|
|
|
3802
|
my $end_ref = $start_ref + $len - 1; |
113
|
2160
|
|
|
|
|
3269
|
my $read_end_ref = $start_ref + $sub_len - 1; |
114
|
2160
|
|
|
2227
|
|
11666
|
my $read_end_piece = first { $self->_is_pos_inside_piece($read_end_ref, $_) } reverse @$pieces; |
|
2227
|
|
|
|
|
5885
|
|
115
|
2160
|
|
|
|
|
7622
|
my $read_start_ref = $end_ref - $sub_len + 1; |
116
|
2160
|
|
|
2160
|
|
7738
|
my $read_start_piece = first { $self->_is_pos_inside_piece($read_start_ref, $_) } @$pieces; |
|
2160
|
|
|
|
|
4899
|
|
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
my $attr = { |
119
|
|
|
|
|
|
|
'start' => $pos + 1, |
120
|
|
|
|
|
|
|
'end' => $pos + $len, |
121
|
|
|
|
|
|
|
'start_ref' => $pieces->[0]{is_orig} ? $start_ref + 1 : 'NA', |
122
|
|
|
|
|
|
|
'end_ref' => $pieces->[-1]{is_orig} ? $end_ref + 1 : 'NA', |
123
|
|
|
|
|
|
|
'read_end_ref' => $read_end_piece->{is_orig} ? $read_end_ref + 1 : 'NA', |
124
|
2160
|
100
|
|
|
|
18283
|
'read_start_ref' => $read_start_piece->{is_orig} ? $read_start_ref + 1 : 'NA', |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
125
|
|
|
|
|
|
|
'annot' => \@annot |
126
|
|
|
|
|
|
|
}; |
127
|
|
|
|
|
|
|
|
128
|
2160
|
|
|
|
|
10489
|
return (\$read, $attr); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub _is_pos_inside_piece { |
132
|
4387
|
|
|
4387
|
|
7621
|
my ($self, $pos, $piece) = @_; |
133
|
4387
|
|
|
|
|
7631
|
my $end = $piece->{pos} + $piece->{len} - 1; |
134
|
4387
|
|
100
|
|
|
16989
|
return $pos >= $piece->{pos} && $pos <= $end; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub insert_sequencing_error { |
138
|
3070
|
|
|
3070
|
0
|
7075
|
my ($self, $seq_ref, $read_size) = @_; |
139
|
3070
|
|
|
|
|
4220
|
my @errors; |
140
|
|
|
|
|
|
|
|
141
|
3070
|
100
|
|
|
|
90610
|
if ($self->sequencing_error) { |
142
|
2850
|
|
|
|
|
71097
|
my $acm_base = $read_size + $self->_count_base; |
143
|
2850
|
|
|
|
|
66894
|
my $num_err = int($acm_base / $self->_base); |
144
|
2850
|
|
|
|
|
82311
|
my $left_count = $acm_base % $self->_base; |
145
|
|
|
|
|
|
|
|
146
|
2850
|
|
|
|
|
7411
|
for (my $i = 0; $i < $num_err; $i++) { |
147
|
2850
|
|
|
|
|
64949
|
my $pos = $i * $self->_base + $self->_base - $self->_count_base - 1; |
148
|
2850
|
|
|
|
|
6984
|
my $b = substr($$seq_ref, $pos, 1); |
149
|
2850
|
|
|
|
|
6213
|
my $not_b = $self->_randb($b); |
150
|
2850
|
|
|
|
|
6784
|
substr($$seq_ref, $pos, 1) = $not_b; |
151
|
2850
|
|
|
|
|
17176
|
push @errors => sprintf("%d:%s/%s", $pos + 1, $b, $not_b); |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
2850
|
|
|
|
|
73400
|
$self->_count_base($left_count); |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
3070
|
|
|
|
|
11267
|
return \@errors; |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
sub reverse_complement { |
161
|
1424
|
|
|
1424
|
0
|
25352
|
my ($self, $seq_ref) = @_; |
162
|
1424
|
|
|
|
|
3312
|
$$seq_ref = reverse $$seq_ref; |
163
|
1424
|
|
|
|
|
3740
|
$$seq_ref =~ tr/atcgATCG/tagcTAGC/; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub _randb { |
167
|
2850
|
|
|
2850
|
|
5801
|
my ($self, $base) = @_; |
168
|
2850
|
|
33
|
|
|
67523
|
return $self->_not_base->{$base}[int(rand(3))] || $base; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
__END__ |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=pod |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=encoding UTF-8 |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 NAME |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
App::Sandy::Read - Base class to simulate reads |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 VERSION |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
version 0.22 |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head1 AUTHORS |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=over 4 |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=item * |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=item * |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=item * |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=item * |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=item * |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=item * |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=item * |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=back |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Teaching and Research Institute from SÃrio-Libanês Hospital. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
This is free software, licensed under: |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=cut |