line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::Read; |
2
|
|
|
|
|
|
|
# ABSTRACT: Base class to simulate reads |
3
|
|
|
|
|
|
|
|
4
|
6
|
|
|
6
|
|
3680
|
use App::Sandy::Base 'class'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
46
|
|
5
|
6
|
|
|
6
|
|
119
|
use List::Util 'first'; |
|
6
|
|
|
|
|
23
|
|
|
6
|
|
|
|
|
417
|
|
6
|
|
|
|
|
|
|
|
7
|
6
|
|
|
6
|
|
45
|
use constant NUM_TRIES => 1000; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
10753
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
with 'App::Sandy::Role::BSearch'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.24'; # VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
has 'sequencing_error' => ( |
14
|
|
|
|
|
|
|
is => 'ro', |
15
|
|
|
|
|
|
|
isa => 'My:NumHS', |
16
|
|
|
|
|
|
|
required => 1 |
17
|
|
|
|
|
|
|
); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
has '_count_base' => ( |
20
|
|
|
|
|
|
|
is => 'rw', |
21
|
|
|
|
|
|
|
isa => 'Int', |
22
|
|
|
|
|
|
|
default => 0 |
23
|
|
|
|
|
|
|
); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
has '_base' => ( |
26
|
|
|
|
|
|
|
is => 'rw', |
27
|
|
|
|
|
|
|
isa => 'Int', |
28
|
|
|
|
|
|
|
builder => '_build_base', |
29
|
|
|
|
|
|
|
lazy_build => 1 |
30
|
|
|
|
|
|
|
); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
has '_not_base' => ( |
33
|
|
|
|
|
|
|
is => 'ro', |
34
|
|
|
|
|
|
|
isa => 'HashRef', |
35
|
|
|
|
|
|
|
builder => '_build_not_base', |
36
|
|
|
|
|
|
|
lazy_build => 1 |
37
|
|
|
|
|
|
|
); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub _build_not_base { |
40
|
35
|
|
|
35
|
|
838
|
my %not_base = ( |
41
|
|
|
|
|
|
|
A => ['T', 'C', 'G'], |
42
|
|
|
|
|
|
|
a => ['t', 'c', 'g'], |
43
|
|
|
|
|
|
|
T => ['A', 'C', 'G'], |
44
|
|
|
|
|
|
|
t => ['a', 'c', 'g'], |
45
|
|
|
|
|
|
|
C => ['A', 'T', 'G'], |
46
|
|
|
|
|
|
|
c => ['a', 't', 'g'], |
47
|
|
|
|
|
|
|
G => ['A', 'T', 'C'], |
48
|
|
|
|
|
|
|
g => ['a', 't', 'c'] |
49
|
|
|
|
|
|
|
); |
50
|
35
|
|
|
|
|
1036
|
return \%not_base; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub _build_base { |
54
|
35
|
|
|
35
|
|
68
|
my $self = shift; |
55
|
|
|
|
|
|
|
# If sequencing_error equal to zero, set _base to zero |
56
|
35
|
|
33
|
|
|
948
|
return $self->sequencing_error && int(1 / $self->sequencing_error); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub subseq { |
60
|
5911
|
|
|
5911
|
0
|
125827
|
my ($self, $seq_ref, $seq_len, $slice_len, $pos) = @_; |
61
|
5911
|
|
|
|
|
12786
|
my $read = substr $$seq_ref, $pos, $slice_len; |
62
|
5911
|
|
|
|
|
12666
|
return \$read; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub subseq_rand { |
66
|
242
|
|
|
242
|
0
|
125790
|
my ($self, $seq_ref, $seq_len, $slice_len, $rng) = @_; |
67
|
242
|
|
|
|
|
447
|
my $usable_len = $seq_len - $slice_len; |
68
|
|
|
|
|
|
|
# Use App::Sandy::Rand |
69
|
242
|
|
|
|
|
1033
|
my $pos = $rng->get_n($usable_len + 1); |
70
|
242
|
|
|
|
|
645
|
my $read = substr $$seq_ref, $pos, $slice_len; |
71
|
242
|
|
|
|
|
712
|
return (\$read, $pos); |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
sub subseq_rand_ptable { |
75
|
7730
|
|
|
7730
|
0
|
17067
|
my ($self, $ptable, $ptable_size, $slice_len, $sub_slice_len, $rng, $blacklist) = @_; |
76
|
7730
|
|
|
|
|
13253
|
my $usable_len = $ptable_size - $slice_len; |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
state $cmp_func = sub { |
79
|
0
|
|
|
0
|
|
0
|
my ($key1, $key2) = @_; |
80
|
0
|
0
|
0
|
|
|
0
|
if ($key1->[1] >= $key2->[0] && $key1->[0] <= $key2->[1]) { |
|
|
0
|
|
|
|
|
|
81
|
0
|
|
|
|
|
0
|
return 0; |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
elsif ($key1->[0] > $key2->[0]) { |
84
|
0
|
|
|
|
|
0
|
return 1; |
85
|
|
|
|
|
|
|
} else { |
86
|
0
|
|
|
|
|
0
|
return -1; |
87
|
|
|
|
|
|
|
} |
88
|
7730
|
|
|
|
|
10869
|
}; |
89
|
|
|
|
|
|
|
|
90
|
7730
|
|
|
|
|
9805
|
my $is_inside_blacklist; |
91
|
7730
|
|
|
|
|
10792
|
my $pos = 0; |
92
|
7730
|
|
|
|
|
9958
|
my $random_tries = 0; |
93
|
|
|
|
|
|
|
|
94
|
7730
|
|
|
|
|
11087
|
do { |
95
|
7730
|
50
|
|
|
|
16306
|
if (++$random_tries > NUM_TRIES) { |
96
|
0
|
|
|
|
|
0
|
croak sprintf |
97
|
|
|
|
|
|
|
"Too many tries to calculate a valid random position\n" . |
98
|
|
|
|
|
|
|
"Your FASTA file may be full of NNN regions\n" |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Use App::Sandy::Rand |
102
|
7730
|
|
|
|
|
28461
|
$pos = $rng->get_n($usable_len + 1); |
103
|
|
|
|
|
|
|
|
104
|
7730
|
|
|
|
|
29515
|
$is_inside_blacklist = $self->with_bsearch([$pos, $pos + $slice_len - 1], |
105
|
|
|
|
|
|
|
$blacklist, scalar @$blacklist, $cmp_func); |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
} while (defined $is_inside_blacklist); |
108
|
|
|
|
|
|
|
|
109
|
7730
|
|
|
|
|
26770
|
my $pieces = $ptable->lookup($pos, $slice_len); |
110
|
7730
|
|
|
|
|
21864
|
return $self->_build_subseq($pieces, $pos, $slice_len, $sub_slice_len); |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub _build_subseq { |
114
|
7730
|
|
|
7730
|
|
16117
|
my ($self, $pieces, $pos, $len, $sub_len) = @_; |
115
|
|
|
|
|
|
|
|
116
|
7730
|
|
|
|
|
15457
|
my $offset = $pos - $pieces->[0]{offset}; |
117
|
7730
|
|
|
|
|
12638
|
my $usable_len = $pieces->[0]{len} - $offset; |
118
|
|
|
|
|
|
|
|
119
|
7730
|
|
|
|
|
11440
|
my @annot; |
120
|
|
|
|
|
|
|
|
121
|
7730
|
100
|
|
|
|
15419
|
if (not $pieces->[0]{is_orig}) { |
122
|
44
|
50
|
|
|
|
54
|
push @annot => @{ $pieces->[0]{annot2} } if @{ $pieces->[0]{annot2} }; |
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
129
|
|
123
|
44
|
50
|
|
|
|
142
|
push @annot => $pieces->[0]{annot1} if $pieces->[0]{annot1}; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
7730
|
100
|
|
|
|
17382
|
my $slice_len = $len < $usable_len |
127
|
|
|
|
|
|
|
? $len |
128
|
|
|
|
|
|
|
: $usable_len; |
129
|
|
|
|
|
|
|
|
130
|
7730
|
|
|
|
|
9821
|
my $read = substr ${ $pieces->[0]{ref} }, $pieces->[0]{start} + $offset, $slice_len; |
|
7730
|
|
|
|
|
24866
|
|
131
|
7730
|
|
|
|
|
12769
|
my $miss_len = $len - $slice_len; |
132
|
|
|
|
|
|
|
|
133
|
7730
|
|
|
|
|
17605
|
for (my $i = 1; $i < @$pieces; $i++) { |
134
|
|
|
|
|
|
|
|
135
|
275
|
100
|
|
|
|
356
|
push @annot => @{ $pieces->[$i]{annot2} } if @{ $pieces->[$i]{annot2} }; |
|
44
|
|
|
|
|
100
|
|
|
275
|
|
|
|
|
630
|
|
136
|
275
|
100
|
|
|
|
680
|
push @annot => $pieces->[$i]{annot1} if $pieces->[$i]{annot1}; |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
$slice_len = $miss_len < $pieces->[$i]{len} |
139
|
|
|
|
|
|
|
? $miss_len |
140
|
275
|
100
|
|
|
|
556
|
: $pieces->[$i]{len}; |
141
|
|
|
|
|
|
|
|
142
|
275
|
|
|
|
|
326
|
$read .= substr ${ $pieces->[$i]{ref} }, $pieces->[$i]{start}, $slice_len; |
|
275
|
|
|
|
|
658
|
|
143
|
275
|
|
|
|
|
687
|
$miss_len -= $slice_len; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# I must to make this mess in order to annotate paired-end reads :( |
147
|
7730
|
|
|
|
|
12758
|
my $start_ref = $pieces->[0]{pos} + $offset; |
148
|
7730
|
|
|
|
|
10990
|
my $end_ref = $start_ref + $len - 1; |
149
|
7730
|
|
|
|
|
10449
|
my $read_end_ref = $start_ref + $sub_len - 1; |
150
|
7730
|
|
|
7851
|
|
38531
|
my $read_end_piece = first { $self->_is_pos_inside_piece($read_end_ref, $_) } reverse @$pieces; |
|
7851
|
|
|
|
|
18159
|
|
151
|
7730
|
|
|
|
|
26027
|
my $read_start_ref = $end_ref - $sub_len + 1; |
152
|
7730
|
|
|
7730
|
|
23362
|
my $read_start_piece = first { $self->_is_pos_inside_piece($read_start_ref, $_) } @$pieces; |
|
7730
|
|
|
|
|
15015
|
|
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
my $attr = { |
155
|
|
|
|
|
|
|
'start' => $pos + 1, |
156
|
|
|
|
|
|
|
'end' => $pos + $len, |
157
|
|
|
|
|
|
|
'start_ref' => $pieces->[0]{is_orig} ? $start_ref + 1 : 'NA', |
158
|
|
|
|
|
|
|
'end_ref' => $pieces->[-1]{is_orig} ? $end_ref + 1 : 'NA', |
159
|
|
|
|
|
|
|
'read_end_ref' => $read_end_piece->{is_orig} ? $read_end_ref + 1 : 'NA', |
160
|
7730
|
100
|
|
|
|
65560
|
'read_start_ref' => $read_start_piece->{is_orig} ? $read_start_ref + 1 : 'NA', |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
161
|
|
|
|
|
|
|
'annot' => \@annot |
162
|
|
|
|
|
|
|
}; |
163
|
|
|
|
|
|
|
|
164
|
7730
|
|
|
|
|
35005
|
return (\$read, $attr); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub _is_pos_inside_piece { |
168
|
15581
|
|
|
15581
|
|
24498
|
my ($self, $pos, $piece) = @_; |
169
|
15581
|
|
|
|
|
24220
|
my $end = $piece->{pos} + $piece->{len} - 1; |
170
|
15581
|
|
100
|
|
|
56852
|
return $pos >= $piece->{pos} && $pos <= $end; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub insert_sequencing_error { |
174
|
10790
|
|
|
10790
|
0
|
24372
|
my ($self, $seq_ref, $read_size, $rng) = @_; |
175
|
10790
|
|
|
|
|
13593
|
my @errors; |
176
|
|
|
|
|
|
|
|
177
|
10790
|
100
|
|
|
|
318006
|
if ($self->sequencing_error) { |
178
|
10570
|
|
|
|
|
265587
|
my $acm_base = $read_size + $self->_count_base; |
179
|
10570
|
|
|
|
|
248669
|
my $num_err = int($acm_base / $self->_base); |
180
|
10570
|
|
|
|
|
250617
|
my $left_count = $acm_base % $self->_base; |
181
|
|
|
|
|
|
|
|
182
|
10570
|
|
|
|
|
24032
|
for (my $i = 0; $i < $num_err; $i++) { |
183
|
10570
|
|
|
|
|
244722
|
my $pos = $i * $self->_base + $self->_base - $self->_count_base - 1; |
184
|
10570
|
|
|
|
|
23223
|
my $b = substr($$seq_ref, $pos, 1); |
185
|
10570
|
|
|
|
|
22146
|
my $not_b = $self->_randb($b, $rng); |
186
|
10570
|
|
|
|
|
25655
|
substr($$seq_ref, $pos, 1) = $not_b; |
187
|
10570
|
|
|
|
|
61830
|
push @errors => sprintf("%d:%s/%s", $pos + 1, $b, $not_b); |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
10570
|
|
|
|
|
281413
|
$self->_count_base($left_count); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
10790
|
|
|
|
|
41384
|
return \@errors; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub reverse_complement { |
197
|
5265
|
|
|
5265
|
0
|
80465
|
my ($self, $seq_ref) = @_; |
198
|
5265
|
|
|
|
|
10702
|
$$seq_ref = reverse $$seq_ref; |
199
|
5265
|
|
|
|
|
12926
|
$$seq_ref =~ tr/atcgATCG/tagcTAGC/; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
sub _randb { |
203
|
10570
|
|
|
10570
|
|
19814
|
my ($self, $base, $rng) = @_; |
204
|
10570
|
|
33
|
|
|
256011
|
return $self->_not_base->{$base}[$rng->get_n(3)] || $base; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
__END__ |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=pod |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=encoding UTF-8 |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head1 NAME |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
App::Sandy::Read - Base class to simulate reads |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head1 VERSION |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
version 0.24 |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 AUTHORS |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=over 4 |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item * |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=item * |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=item * |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=item * |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=item * |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
Rodrigo Barreiro <rbarreiro@mochsl.org.br> |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=item * |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=item * |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=item * |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Rafael Mercuri <rmercuri@mochsl.org.br> |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=item * |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Rodrigo Barreiro <rbarreiro@mochsl.org.br> |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=item * |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=back |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
This is free software, licensed under: |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=cut |