line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Sandy::DB::Handle::Variation; |
2
|
|
|
|
|
|
|
# ABSTRACT: Class to handle structural variation database schemas. |
3
|
|
|
|
|
|
|
|
4
|
6
|
|
|
6
|
|
48
|
use App::Sandy::Base 'class'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
53
|
|
5
|
6
|
|
|
6
|
|
73
|
use App::Sandy::DB; |
|
6
|
|
|
|
|
18
|
|
|
6
|
|
|
|
|
181
|
|
6
|
6
|
|
|
6
|
|
42
|
use IO::Compress::Gzip 'gzip'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
544
|
|
7
|
6
|
|
|
6
|
|
46
|
use IO::Uncompress::Gunzip 'gunzip'; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
402
|
|
8
|
6
|
|
|
6
|
|
46
|
use Storable qw/nfreeze thaw/; |
|
6
|
|
|
|
|
25
|
|
|
6
|
|
|
|
|
1024
|
|
9
|
6
|
|
|
6
|
|
47
|
use Scalar::Util qw/looks_like_number refaddr/; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
420
|
|
10
|
6
|
|
|
6
|
|
47
|
use List::Util 'max'; |
|
6
|
|
|
|
|
17
|
|
|
6
|
|
|
|
|
26106
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '0.22'; # VERSION |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub insertdb { |
17
|
0
|
|
|
0
|
0
|
|
my ($self, $file, $name, $source, $is_user_provided, $type, @args) = @_; |
18
|
0
|
|
|
|
|
|
my $schema = App::Sandy::DB->schema; |
19
|
|
|
|
|
|
|
|
20
|
0
|
|
|
|
|
|
log_msg ":: Checking if there is already a structural variation '$name' ..."; |
21
|
0
|
|
|
|
|
|
my $rs = $schema->resultset('StructuralVariation')->find({ name => $name }); |
22
|
0
|
0
|
|
|
|
|
if ($rs) { |
23
|
0
|
|
|
|
|
|
die "There is already a structural variation '$name'\n"; |
24
|
|
|
|
|
|
|
} else { |
25
|
0
|
|
|
|
|
|
log_msg ":: structural variation '$name' not found"; |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
|
28
|
0
|
|
|
|
|
|
log_msg ":: Indexing '$file'"; |
29
|
0
|
|
|
|
|
|
log_msg ":: It may take a while ..."; |
30
|
0
|
|
|
|
|
|
my $indexed_file; |
31
|
|
|
|
|
|
|
|
32
|
0
|
0
|
|
|
|
|
if ($type eq 'raw') { |
|
|
0
|
|
|
|
|
|
33
|
0
|
|
|
|
|
|
$indexed_file = $self->_index_snv($file); |
34
|
|
|
|
|
|
|
} elsif ($type eq 'vcf') { |
35
|
0
|
|
|
|
|
|
$indexed_file = $self->_index_vcf($file, @args); |
36
|
|
|
|
|
|
|
} else { |
37
|
0
|
|
|
|
|
|
croak "No type '$type'"; |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
0
|
|
|
|
|
|
log_msg ":: Removing overlapping entries in structural variation file '$file', if any ..."; |
41
|
0
|
|
|
|
|
|
$self->_validate_indexed_snv($indexed_file, $file); |
42
|
|
|
|
|
|
|
|
43
|
0
|
|
|
|
|
|
log_msg ":: Converting data to bytes ..."; |
44
|
0
|
|
|
|
|
|
my $bytes = nfreeze $indexed_file; |
45
|
0
|
|
|
|
|
|
log_msg ":: Compressing bytes ..."; |
46
|
0
|
|
|
|
|
|
gzip \$bytes => \my $compressed; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Begin transation |
49
|
0
|
|
|
|
|
|
my $guard = $schema->txn_scope_guard; |
50
|
|
|
|
|
|
|
|
51
|
0
|
|
|
|
|
|
log_msg ":: Storing structural variation '$name'..."; |
52
|
0
|
|
|
|
|
|
$rs = $schema->resultset('StructuralVariation')->create({ |
53
|
|
|
|
|
|
|
name => $name, |
54
|
|
|
|
|
|
|
source => $source, |
55
|
|
|
|
|
|
|
is_user_provided => $is_user_provided, |
56
|
|
|
|
|
|
|
matrix => $compressed |
57
|
|
|
|
|
|
|
}); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# End transation |
60
|
0
|
|
|
|
|
|
$guard->commit; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub _index_snv { |
64
|
0
|
|
|
0
|
|
|
my ($self, $variation_file) = @_; |
65
|
|
|
|
|
|
|
|
66
|
0
|
|
|
|
|
|
my $fh = $self->with_open_r($variation_file); |
67
|
|
|
|
|
|
|
|
68
|
0
|
|
|
|
|
|
my %indexed_snv; |
69
|
0
|
|
|
|
|
|
my $line = 0; |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# chr pos ref @obs he |
72
|
0
|
|
|
|
|
|
while (<$fh>) { |
73
|
0
|
|
|
|
|
|
$line++; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Skip coments and blank lines |
76
|
0
|
0
|
|
|
|
|
next if /^#/; |
77
|
0
|
0
|
|
|
|
|
next if /^\s*$/; |
78
|
|
|
|
|
|
|
|
79
|
0
|
|
|
|
|
|
chomp; |
80
|
0
|
|
|
|
|
|
my @fields = split; |
81
|
|
|
|
|
|
|
|
82
|
0
|
0
|
|
|
|
|
die "Not found all fields (SEQID, POSITION, ID, REFERENCE, OBSERVED, GENOTYPE) into file '$variation_file' at line $line\n" |
83
|
|
|
|
|
|
|
unless scalar @fields >= 6; |
84
|
|
|
|
|
|
|
|
85
|
0
|
0
|
|
|
|
|
die "Second column, position, does not seem to be a number into file '$variation_file' at line $line\n" |
86
|
|
|
|
|
|
|
unless looks_like_number($fields[1]); |
87
|
|
|
|
|
|
|
|
88
|
0
|
0
|
|
|
|
|
die "Second column, position, has a value lesser or equal to zero into file '$variation_file' at line $line\n" |
89
|
|
|
|
|
|
|
if $fields[1] <= 0; |
90
|
|
|
|
|
|
|
|
91
|
0
|
0
|
|
|
|
|
die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$variation_file' at line $line\n" |
92
|
|
|
|
|
|
|
unless $fields[3] =~ /^(\w+|-)$/; |
93
|
|
|
|
|
|
|
|
94
|
0
|
0
|
|
|
|
|
die "Fifth column, alteration, does not seem to be a valid entry: '$fields[4]' into file '$variation_file' at line $line\n" |
95
|
|
|
|
|
|
|
unless $fields[4] =~ /^(\w+|-)$/; |
96
|
|
|
|
|
|
|
|
97
|
0
|
0
|
|
|
|
|
die "Sixth column, genotype, has an invalid entry: '$fields[5]' into file '$variation_file' at line $line. Valid ones are 'HE' or 'HO'\n" |
98
|
|
|
|
|
|
|
unless $fields[5] =~ /^(HE|HO)$/; |
99
|
|
|
|
|
|
|
|
100
|
0
|
0
|
|
|
|
|
if ($fields[3] eq $fields[4]) { |
101
|
0
|
|
|
|
|
|
warn "There is an alteration equal to the reference at '$variation_file' line $line. I will ignore it\n"; |
102
|
0
|
|
|
|
|
|
next; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# Sequence inside perl begins at 0 |
106
|
0
|
|
|
|
|
|
my $position = int($fields[1] - 1); |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# Compare the alterations and reference to guess the max variation on sequence |
109
|
0
|
|
|
|
|
|
my $size_of_variation = max map { length } $fields[3], $fields[4]; |
|
0
|
|
|
|
|
|
|
110
|
0
|
|
|
|
|
|
my $high = $position + $size_of_variation - 1; |
111
|
|
|
|
|
|
|
|
112
|
0
|
|
|
|
|
|
my %variation = ( |
113
|
|
|
|
|
|
|
seq_id => $fields[0], |
114
|
|
|
|
|
|
|
id => $fields[2], |
115
|
|
|
|
|
|
|
ref => $fields[3], |
116
|
|
|
|
|
|
|
alt => $fields[4], |
117
|
|
|
|
|
|
|
plo => $fields[5], |
118
|
|
|
|
|
|
|
pos => $position, |
119
|
|
|
|
|
|
|
low => $position, |
120
|
|
|
|
|
|
|
high => $high, |
121
|
|
|
|
|
|
|
line => $line |
122
|
|
|
|
|
|
|
); |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
|
push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation; |
|
0
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
0
|
0
|
|
|
|
|
close $fh |
128
|
|
|
|
|
|
|
or die "Cannot close structural variation file '$variation_file': $!\n"; |
129
|
|
|
|
|
|
|
|
130
|
0
|
|
|
|
|
|
return \%indexed_snv; |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _index_vcf { |
134
|
0
|
|
|
0
|
|
|
my ($self, $vcf_file, $sample_name) = @_; |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
|
my $fh = $self->with_open_r($vcf_file); |
137
|
|
|
|
|
|
|
|
138
|
0
|
|
|
|
|
|
my $magic = <$fh>; |
139
|
0
|
|
|
|
|
|
chomp $magic; |
140
|
|
|
|
|
|
|
|
141
|
0
|
0
|
|
|
|
|
if ($magic !~ /^##fileformat=VCFv\d+(\.\d+)?$/) { |
142
|
0
|
|
|
|
|
|
log_msg ":: Putative VCF file '$vcf_file' does not have the required 'fileformat=VCFv{VERSION}' field\n"; |
143
|
0
|
|
|
|
|
|
log_msg ":: Continue anyway ...\n"; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
0
|
|
|
|
|
|
my $sample_i = 9; |
147
|
0
|
|
|
|
|
|
my $has_header = 0; |
148
|
0
|
|
|
|
|
|
my $line = 1; |
149
|
0
|
|
|
|
|
|
my %indexed_snv; |
150
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
while (<$fh>) { |
152
|
0
|
|
|
|
|
|
$line ++; |
153
|
|
|
|
|
|
|
|
154
|
0
|
0
|
|
|
|
|
next if /^##/; |
155
|
0
|
0
|
|
|
|
|
next if /^\s*$/; |
156
|
|
|
|
|
|
|
|
157
|
0
|
|
|
|
|
|
chomp; |
158
|
0
|
|
|
|
|
|
my @fields = split /\t/; |
159
|
|
|
|
|
|
|
|
160
|
0
|
0
|
|
|
|
|
if (/^#/) { |
161
|
0
|
|
|
|
|
|
my $number_of_fields = scalar @fields; |
162
|
|
|
|
|
|
|
|
163
|
0
|
0
|
|
|
|
|
if ($number_of_fields < 8) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
164
|
0
|
|
|
|
|
|
die "Invalid header: Less than 8 columns at line $line\n"; |
165
|
|
|
|
|
|
|
} elsif ($number_of_fields == 8) { |
166
|
0
|
|
|
|
|
|
die "No genotype data found in the header at line $line\n"; |
167
|
|
|
|
|
|
|
} elsif ($number_of_fields == 9) { |
168
|
0
|
|
|
|
|
|
die "Invalid header: Missing sample data at line $line\n"; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
0
|
|
|
|
|
|
my @samples = splice @fields, 9; |
172
|
|
|
|
|
|
|
|
173
|
0
|
0
|
|
|
|
|
if (defined $sample_name) { |
174
|
0
|
|
|
|
|
|
my %columns; |
175
|
|
|
|
|
|
|
|
176
|
0
|
|
|
|
|
|
for my $i (0..$#samples) { |
177
|
0
|
|
|
|
|
|
$columns{$samples[$i]} = $i; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
0
|
0
|
|
|
|
|
if (not exists $columns{$sample_name}) { |
181
|
0
|
|
|
|
|
|
die "Not found sample named '$sample_name' at file '$vcf_file'\n"; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
0
|
|
|
|
|
|
$sample_i += $columns{$sample_name}; |
185
|
|
|
|
|
|
|
} else { |
186
|
0
|
|
|
|
|
|
$sample_name = $samples[$sample_i]; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
|
189
|
0
|
|
|
|
|
|
$has_header = 1; |
190
|
0
|
|
|
|
|
|
next; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
0
|
0
|
|
|
|
|
unless ($has_header) { |
194
|
0
|
|
|
|
|
|
die "No header found in file '$vcf_file'\n"; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
0
|
0
|
|
|
|
|
die "Not found all columns into file '$vcf_file' at line $line\n" |
198
|
|
|
|
|
|
|
unless scalar @fields > 9; |
199
|
|
|
|
|
|
|
|
200
|
0
|
0
|
|
|
|
|
die "Second column, position, does not seem to be a number into file '$vcf_file' at line $line\n" |
201
|
|
|
|
|
|
|
unless looks_like_number($fields[1]); |
202
|
|
|
|
|
|
|
|
203
|
0
|
0
|
|
|
|
|
die "Second column, position, has a value lesser or equal to zero into file '$vcf_file' at line $line\n" |
204
|
|
|
|
|
|
|
if $fields[1] <= 0; |
205
|
|
|
|
|
|
|
|
206
|
0
|
0
|
|
|
|
|
die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$vcf_file' at line $line\n" |
207
|
|
|
|
|
|
|
unless $fields[3] =~ /^(<\w+(:\w+)*>|\w+|-)$/; |
208
|
|
|
|
|
|
|
|
209
|
0
|
0
|
|
|
|
|
die "Fifth column, alternate, does not seem to be a valid entry: '$fields[4]' into file '$vcf_file' at line $line\n" |
210
|
|
|
|
|
|
|
unless $fields[4] =~ /^(\w+|<\w+(:\w+)*>|-|\*)(,(\w+|<\w+(:\w+)*>|-|\*))*$/; |
211
|
|
|
|
|
|
|
|
212
|
0
|
0
|
|
|
|
|
die "No genotype field found in format column at line $line\n" |
213
|
|
|
|
|
|
|
unless $fields[8] =~ /^GT:?/; |
214
|
|
|
|
|
|
|
|
215
|
0
|
0
|
|
|
|
|
die "Sample '$sample_name', column '$sample_i', does not seem to be a valid entry: '$fields[$sample_i]' into file '$vcf_file' at line $line\n" |
216
|
|
|
|
|
|
|
unless $fields[$sample_i] =~ /^[0-9.]+([\/\|][0-9.])?:*/; |
217
|
|
|
|
|
|
|
|
218
|
0
|
0
|
|
|
|
|
if ($fields[3] eq $fields[4]) { |
219
|
0
|
|
|
|
|
|
log_msg ":: There is an alternate equal to the reference at '$vcf_file' line $line. I will ignore it\n"; |
220
|
0
|
|
|
|
|
|
next; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# No support for structural variation <VAR> |
224
|
|
|
|
|
|
|
# So ignore it |
225
|
0
|
0
|
0
|
|
|
|
if ($fields[3] =~ /<\w+>/ || $fields[4] =~ /<\w+>/) { |
226
|
0
|
|
|
|
|
|
next; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Ignore missing alleles |
230
|
0
|
0
|
|
|
|
|
if ($fields[4] =~ /\*/) { |
231
|
0
|
|
|
|
|
|
next; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
0
|
|
|
|
|
|
my @alternates = split /,/ => $fields[4]; |
235
|
0
|
|
|
|
|
|
my $genotype = (split /:/ => $fields[$sample_i])[0]; |
236
|
|
|
|
|
|
|
|
237
|
0
|
|
|
|
|
|
my ($plo, $alternate); |
238
|
0
|
|
|
|
|
|
my ($g1, $g2) = split /\|/ => $genotype; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
|
241
|
0
|
0
|
0
|
|
|
|
if ($g1 eq '.' || (defined $g2 && $g2 eq '.')) { |
|
|
|
0
|
|
|
|
|
242
|
0
|
|
|
|
|
|
next; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# Homozygosity or haploid chromossome |
246
|
0
|
0
|
0
|
|
|
|
if (! defined($g2) || ($g1 == $g2)) { |
247
|
|
|
|
|
|
|
# No variation |
248
|
0
|
0
|
|
|
|
|
next if $g1 == 0; |
249
|
0
|
|
|
|
|
|
$alternate = $alternates[$g1 - 1]; |
250
|
0
|
|
|
|
|
|
$plo = 'HO'; |
251
|
|
|
|
|
|
|
} else { |
252
|
|
|
|
|
|
|
# The alternate is the one with the max index, in the case |
253
|
|
|
|
|
|
|
# of a variation with the two alleles been alterations |
254
|
0
|
0
|
|
|
|
|
my $i = $g1 > $g2 |
255
|
|
|
|
|
|
|
? $g1 - 1 |
256
|
|
|
|
|
|
|
: $g2 - 1; |
257
|
0
|
|
|
|
|
|
$alternate = $alternates[$i]; |
258
|
0
|
|
|
|
|
|
$plo = 'HE'; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Sequence inside perl begins at 0 |
262
|
0
|
|
|
|
|
|
my $position = int($fields[1] - 1); |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# Compare the alterations and reference to guess the max variation on sequence |
265
|
0
|
|
|
|
|
|
my $size_of_variation = max map { length } $fields[3], $alternate; |
|
0
|
|
|
|
|
|
|
266
|
0
|
|
|
|
|
|
my $high = $position + $size_of_variation - 1; |
267
|
|
|
|
|
|
|
|
268
|
0
|
|
|
|
|
|
my %variation = ( |
269
|
|
|
|
|
|
|
seq_id => $fields[0], |
270
|
|
|
|
|
|
|
id => $fields[2], |
271
|
|
|
|
|
|
|
ref => $fields[3], |
272
|
|
|
|
|
|
|
alt => $alternate, |
273
|
|
|
|
|
|
|
plo => $plo, |
274
|
|
|
|
|
|
|
pos => $position, |
275
|
|
|
|
|
|
|
low => $position, |
276
|
|
|
|
|
|
|
high => $high, |
277
|
|
|
|
|
|
|
line => $line |
278
|
|
|
|
|
|
|
); |
279
|
|
|
|
|
|
|
|
280
|
0
|
|
|
|
|
|
push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation; |
|
0
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
0
|
0
|
|
|
|
|
close $fh |
284
|
|
|
|
|
|
|
or die "Cannot close vcf file '$vcf_file': $!\n"; |
285
|
|
|
|
|
|
|
|
286
|
0
|
|
|
|
|
|
return \%indexed_snv; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
sub _validate_indexed_snv { |
290
|
0
|
|
|
0
|
|
|
my ($self, $indexed_snv, $variation_file) = @_; |
291
|
|
|
|
|
|
|
|
292
|
0
|
|
|
|
|
|
for my $seq_id (keys %$indexed_snv) { |
293
|
0
|
|
|
|
|
|
my $snvs_a = delete $indexed_snv->{$seq_id}; |
294
|
0
|
|
|
|
|
|
my @sorted_snvs = sort { $a->{low} <=> $b->{low} } @$snvs_a; |
|
0
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
|
296
|
0
|
|
|
|
|
|
my $prev_snv = $sorted_snvs[0]; |
297
|
0
|
|
|
|
|
|
my $high = $prev_snv->{high}; |
298
|
0
|
|
|
|
|
|
push my @snv_cluster => $prev_snv; |
299
|
|
|
|
|
|
|
|
300
|
0
|
|
|
|
|
|
for (my $i = 1; $i < @sorted_snvs; $i++) { |
301
|
0
|
|
|
|
|
|
my $next_snv = $sorted_snvs[$i]; |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# If not overlapping |
304
|
0
|
0
|
|
|
|
|
if ($next_snv->{low} > $high) { |
305
|
0
|
|
|
|
|
|
my $valid_snvs = $self->_validate_indexed_snv_cluster(\@snv_cluster, $variation_file); |
306
|
0
|
|
|
|
|
|
push @{ $indexed_snv->{$seq_id} } => @$valid_snvs; |
|
0
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
|
@snv_cluster = (); |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
0
|
|
|
|
|
|
push @snv_cluster => $next_snv; |
311
|
0
|
|
|
|
|
|
$high = max $high, $next_snv->{high}; |
312
|
0
|
|
|
|
|
|
$prev_snv = $next_snv; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
0
|
|
|
|
|
|
my $valid_snvs = $self->_validate_indexed_snv_cluster(\@snv_cluster, $variation_file); |
316
|
0
|
|
|
|
|
|
push @{ $indexed_snv->{$seq_id} } => @$valid_snvs; |
|
0
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub _validate_indexed_snv_cluster { |
321
|
0
|
|
|
0
|
|
|
my ($self, $snvs, $variation_file) = @_; |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# My rules: |
324
|
|
|
|
|
|
|
# The biggest structural variation gains precedence. |
325
|
|
|
|
|
|
|
# If occurs an overlapping, I search to the biggest variation among |
326
|
|
|
|
|
|
|
# the saved alterations and compare it with the actual entry: |
327
|
|
|
|
|
|
|
# *** Remove all overlapping variations if actual entry is bigger; |
328
|
|
|
|
|
|
|
# *** Skip actual entry if it is lower than the biggest variation |
329
|
|
|
|
|
|
|
# *** Insertionw can be before any alterations |
330
|
|
|
|
|
|
|
|
331
|
0
|
|
|
|
|
|
my @saved_snvs; |
332
|
|
|
|
|
|
|
my %blacklist; |
333
|
|
|
|
|
|
|
|
334
|
0
|
|
|
|
|
|
OUTER: for (my $i = 0; $i < @$snvs; $i++) { |
335
|
0
|
|
|
|
|
|
my $prev_snv = $snvs->[$i]; |
336
|
|
|
|
|
|
|
|
337
|
0
|
0
|
|
|
|
|
if ($blacklist{refaddr($prev_snv)}) { |
338
|
0
|
|
|
|
|
|
next OUTER; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
0
|
|
|
|
|
|
INNER: for (my $j = 0; $j < @$snvs; $j++) { |
342
|
0
|
|
|
|
|
|
my $next_snv = $snvs->[$j]; |
343
|
|
|
|
|
|
|
|
344
|
0
|
0
|
0
|
|
|
|
if (($i == $j) || $blacklist{refaddr($next_snv)}) { |
345
|
0
|
|
|
|
|
|
next INNER; |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
# Insertion after insertion |
349
|
0
|
0
|
0
|
|
|
|
if ($next_snv->{ref} eq '-' && $prev_snv->{ref} eq '-' && $next_snv->{pos} != $prev_snv->{pos}) { |
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
350
|
0
|
|
|
|
|
|
next INNER; |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# Insertion before alteration |
353
|
|
|
|
|
|
|
} elsif ($next_snv->{ref} eq '-' && $prev_snv->{ref} ne '-' && $next_snv->{pos} < $prev_snv->{pos}) { |
354
|
0
|
|
|
|
|
|
next INNER; |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# In this case, it gains the biggest one |
357
|
|
|
|
|
|
|
} else { |
358
|
0
|
|
|
|
|
|
my $prev_size = $prev_snv->{high} - $prev_snv->{low} + 1; |
359
|
0
|
|
|
|
|
|
my $next_size = $next_snv->{high} - $next_snv->{low} + 1; |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
my ($prev_ref, $prev_alt, $next_ref, $next_alt) = |
362
|
|
|
|
|
|
|
map { |
363
|
0
|
0
|
|
|
|
|
length($_) > 25 |
364
|
|
|
|
|
|
|
? substr($_, 0, 25) . "..." |
365
|
|
|
|
|
|
|
: $_ |
366
|
0
|
|
|
|
|
|
} ($prev_snv->{ref}, $prev_snv->{alt}, $next_snv->{ref}, $next_snv->{alt}); |
367
|
|
|
|
|
|
|
|
368
|
0
|
0
|
|
|
|
|
if ($prev_size >= $next_size) { |
369
|
0
|
0
|
|
|
|
|
if ($variation_file) { |
370
|
|
|
|
|
|
|
log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s] at '%s' line %d" |
371
|
|
|
|
|
|
|
=> $prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo}, |
372
|
|
|
|
|
|
|
$next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo}, |
373
|
0
|
|
|
|
|
|
$variation_file, $next_snv->{line}; |
374
|
|
|
|
|
|
|
} else { |
375
|
|
|
|
|
|
|
log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s]" |
376
|
|
|
|
|
|
|
=> $prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo}, |
377
|
0
|
|
|
|
|
|
$next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo}; |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
0
|
|
|
|
|
|
$blacklist{refaddr($next_snv)} = 1; |
381
|
0
|
|
|
|
|
|
next INNER; |
382
|
|
|
|
|
|
|
} else { |
383
|
0
|
0
|
|
|
|
|
if ($variation_file) { |
384
|
|
|
|
|
|
|
log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s] at '%s' line %d" |
385
|
|
|
|
|
|
|
=> $next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo}, |
386
|
|
|
|
|
|
|
$prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo}, |
387
|
0
|
|
|
|
|
|
$variation_file, $prev_snv->{line}; |
388
|
|
|
|
|
|
|
} else { |
389
|
|
|
|
|
|
|
log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s]" |
390
|
|
|
|
|
|
|
=> $next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo}, |
391
|
0
|
|
|
|
|
|
$prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo}; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
|
$blacklist{refaddr($prev_snv)} = 1; |
395
|
0
|
|
|
|
|
|
next OUTER; |
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
|
400
|
0
|
|
|
|
|
|
push @saved_snvs => $prev_snv; |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
|
403
|
0
|
|
|
|
|
|
return \@saved_snvs; |
404
|
|
|
|
|
|
|
} |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
sub retrievedb { |
407
|
0
|
|
|
0
|
0
|
|
my ($self, $structural_variation) = @_; |
408
|
0
|
|
|
|
|
|
my $schema = App::Sandy::DB->schema; |
409
|
|
|
|
|
|
|
|
410
|
0
|
|
|
|
|
|
my %all_matrix; |
411
|
|
|
|
|
|
|
|
412
|
0
|
|
|
|
|
|
for my $sv (@$structural_variation) { |
413
|
0
|
|
|
|
|
|
my $rs = $schema->resultset('StructuralVariation')->find({ name => $sv }); |
414
|
0
|
0
|
|
|
|
|
die "'$sv' not found into database\n" unless defined $rs; |
415
|
|
|
|
|
|
|
|
416
|
0
|
|
|
|
|
|
my $compressed = $rs->matrix; |
417
|
0
|
0
|
|
|
|
|
die "structural variation entry '$sv' exists, but the related data is missing\n" |
418
|
|
|
|
|
|
|
unless defined $compressed; |
419
|
|
|
|
|
|
|
|
420
|
0
|
|
|
|
|
|
gunzip \$compressed => \my $bytes; |
421
|
0
|
|
|
|
|
|
my $matrix = thaw $bytes; |
422
|
|
|
|
|
|
|
|
423
|
0
|
|
|
|
|
|
while (my ($seq_id, $data) = each %$matrix) { |
424
|
0
|
|
|
|
|
|
push @{ $all_matrix{$seq_id} } => @$data; |
|
0
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
0
|
0
|
|
|
|
|
if (scalar @$structural_variation > 1) { |
429
|
0
|
|
|
|
|
|
log_msg sprintf ":: Removing overlapping entries from '[%s]'. If any ..." |
430
|
|
|
|
|
|
|
=> join(', ', @$structural_variation); |
431
|
0
|
|
|
|
|
|
$self->_validate_indexed_snv(\%all_matrix); |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
0
|
|
|
|
|
|
return \%all_matrix; |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
sub deletedb { |
438
|
0
|
|
|
0
|
0
|
|
my ($self, $structural_variation) = @_; |
439
|
0
|
|
|
|
|
|
my $schema = App::Sandy::DB->schema; |
440
|
|
|
|
|
|
|
|
441
|
0
|
|
|
|
|
|
log_msg ":: Checking if there is already a structural variation '$structural_variation' ..."; |
442
|
0
|
|
|
|
|
|
my $rs = $schema->resultset('StructuralVariation')->find({ name => $structural_variation }); |
443
|
0
|
0
|
|
|
|
|
die "'$structural_variation' not found into database\n" unless defined $rs; |
444
|
|
|
|
|
|
|
|
445
|
0
|
|
|
|
|
|
log_msg ":: Found '$structural_variation'"; |
446
|
0
|
0
|
|
|
|
|
die "'$structural_variation' is not a user provided entry. Cannot be deleted\n" |
447
|
|
|
|
|
|
|
unless $rs->is_user_provided; |
448
|
|
|
|
|
|
|
|
449
|
0
|
|
|
|
|
|
log_msg ":: Removing '$structural_variation' entry ..."; |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# Begin transation |
452
|
0
|
|
|
|
|
|
my $guard = $schema->txn_scope_guard; |
453
|
|
|
|
|
|
|
|
454
|
0
|
|
|
|
|
|
$rs->delete; |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
# End transation |
457
|
0
|
|
|
|
|
|
$guard->commit; |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
sub restoredb { |
461
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
462
|
0
|
|
|
|
|
|
my $schema = App::Sandy::DB->schema; |
463
|
|
|
|
|
|
|
|
464
|
0
|
|
|
|
|
|
log_msg ":: Searching for user-provided entries ..."; |
465
|
0
|
|
|
|
|
|
my $user_provided = $schema->resultset('StructuralVariation')->search( |
466
|
|
|
|
|
|
|
{ is_user_provided => 1 } |
467
|
|
|
|
|
|
|
); |
468
|
|
|
|
|
|
|
|
469
|
0
|
|
|
|
|
|
my $entry = $user_provided->next; |
470
|
0
|
0
|
|
|
|
|
if ($entry) { |
471
|
0
|
|
|
|
|
|
log_msg ":: Found:"; |
472
|
0
|
|
|
|
|
|
do { |
473
|
0
|
|
|
|
|
|
log_msg ' ==> ' . $entry->name; |
474
|
|
|
|
|
|
|
} while ($entry = $user_provided->next); |
475
|
|
|
|
|
|
|
} else { |
476
|
0
|
|
|
|
|
|
die "Not found user-provided entries. There is no need to restoring\n"; |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
|
479
|
0
|
|
|
|
|
|
log_msg ":: Removing all user-provided entries ..."; |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# Begin transation |
482
|
0
|
|
|
|
|
|
my $guard = $schema->txn_scope_guard; |
483
|
|
|
|
|
|
|
|
484
|
0
|
|
|
|
|
|
$user_provided->delete; |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
# End transation |
487
|
0
|
|
|
|
|
|
$guard->commit; |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub make_report { |
491
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
492
|
0
|
|
|
|
|
|
my $schema = App::Sandy::DB->schema; |
493
|
0
|
|
|
|
|
|
my %report; |
494
|
|
|
|
|
|
|
|
495
|
0
|
|
|
|
|
|
my $rs = $schema->resultset('StructuralVariation')->search(undef); |
496
|
|
|
|
|
|
|
|
497
|
0
|
|
|
|
|
|
while (my $structural_variation = $rs->next) { |
498
|
0
|
0
|
|
|
|
|
my %hash = ( |
499
|
|
|
|
|
|
|
source => $structural_variation->source, |
500
|
|
|
|
|
|
|
provider => $structural_variation->is_user_provided ? "user" : "vendor", |
501
|
|
|
|
|
|
|
date => $structural_variation->date |
502
|
|
|
|
|
|
|
); |
503
|
0
|
|
|
|
|
|
$report{$structural_variation->name} = \%hash; |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
|
506
|
0
|
|
|
|
|
|
return \%report; |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
__END__ |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=pod |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
=encoding UTF-8 |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=head1 NAME |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
App::Sandy::DB::Handle::Variation - Class to handle structural variation database schemas. |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=head1 VERSION |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
version 0.22 |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head1 AUTHORS |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=over 4 |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
=item * |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
Thiago L. A. Miller <tmiller@mochsl.org.br> |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=item * |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
J. Leonel Buzzo <lbuzzo@mochsl.org.br> |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=item * |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
Felipe R. C. dos Santos <fsantos@mochsl.org.br> |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=item * |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
Helena B. Conceição <hconceicao@mochsl.org.br> |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
=item * |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
Gabriela Guardia <gguardia@mochsl.org.br> |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=item * |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
Fernanda Orpinelli <forpinelli@mochsl.org.br> |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=item * |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Pedro A. F. Galante <pgalante@mochsl.org.br> |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=back |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Teaching and Research Institute from SÃrio-Libanês Hospital. |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
This is free software, licensed under: |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
=cut |