File Coverage

lib/App/Sandy/DB/Handle/Variation.pm
Criterion Covered Total %
statement 21 246 8.5
branch 0 108 0.0
condition 0 27 0.0
subroutine 7 16 43.7
pod 0 5 0.0
total 28 402 6.9


line stmt bran cond sub pod time code
1             package App::Sandy::DB::Handle::Variation;
2             # ABSTRACT: Class to handle structural variation database schemas.
3              
4 6     6   48 use App::Sandy::Base 'class';
  6         12  
  6         53  
5 6     6   73 use App::Sandy::DB;
  6         18  
  6         181  
6 6     6   42 use IO::Compress::Gzip 'gzip';
  6         12  
  6         544  
7 6     6   46 use IO::Uncompress::Gunzip 'gunzip';
  6         11  
  6         402  
8 6     6   46 use Storable qw/nfreeze thaw/;
  6         25  
  6         1024  
9 6     6   47 use Scalar::Util qw/looks_like_number refaddr/;
  6         12  
  6         420  
10 6     6   47 use List::Util 'max';
  6         17  
  6         26106  
11              
12             with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/;
13              
14             our $VERSION = '0.22'; # VERSION
15              
16             sub insertdb {
17 0     0 0   my ($self, $file, $name, $source, $is_user_provided, $type, @args) = @_;
18 0           my $schema = App::Sandy::DB->schema;
19              
20 0           log_msg ":: Checking if there is already a structural variation '$name' ...";
21 0           my $rs = $schema->resultset('StructuralVariation')->find({ name => $name });
22 0 0         if ($rs) {
23 0           die "There is already a structural variation '$name'\n";
24             } else {
25 0           log_msg ":: structural variation '$name' not found";
26             }
27              
28 0           log_msg ":: Indexing '$file'";
29 0           log_msg ":: It may take a while ...";
30 0           my $indexed_file;
31              
32 0 0         if ($type eq 'raw') {
    0          
33 0           $indexed_file = $self->_index_snv($file);
34             } elsif ($type eq 'vcf') {
35 0           $indexed_file = $self->_index_vcf($file, @args);
36             } else {
37 0           croak "No type '$type'";
38             }
39              
40 0           log_msg ":: Removing overlapping entries in structural variation file '$file', if any ...";
41 0           $self->_validate_indexed_snv($indexed_file, $file);
42              
43 0           log_msg ":: Converting data to bytes ...";
44 0           my $bytes = nfreeze $indexed_file;
45 0           log_msg ":: Compressing bytes ...";
46 0           gzip \$bytes => \my $compressed;
47              
48             # Begin transation
49 0           my $guard = $schema->txn_scope_guard;
50              
51 0           log_msg ":: Storing structural variation '$name'...";
52 0           $rs = $schema->resultset('StructuralVariation')->create({
53             name => $name,
54             source => $source,
55             is_user_provided => $is_user_provided,
56             matrix => $compressed
57             });
58              
59             # End transation
60 0           $guard->commit;
61             }
62              
63             sub _index_snv {
64 0     0     my ($self, $variation_file) = @_;
65              
66 0           my $fh = $self->with_open_r($variation_file);
67              
68 0           my %indexed_snv;
69 0           my $line = 0;
70              
71             # chr pos ref @obs he
72 0           while (<$fh>) {
73 0           $line++;
74              
75             # Skip coments and blank lines
76 0 0         next if /^#/;
77 0 0         next if /^\s*$/;
78              
79 0           chomp;
80 0           my @fields = split;
81              
82 0 0         die "Not found all fields (SEQID, POSITION, ID, REFERENCE, OBSERVED, GENOTYPE) into file '$variation_file' at line $line\n"
83             unless scalar @fields >= 6;
84              
85 0 0         die "Second column, position, does not seem to be a number into file '$variation_file' at line $line\n"
86             unless looks_like_number($fields[1]);
87              
88 0 0         die "Second column, position, has a value lesser or equal to zero into file '$variation_file' at line $line\n"
89             if $fields[1] <= 0;
90              
91 0 0         die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$variation_file' at line $line\n"
92             unless $fields[3] =~ /^(\w+|-)$/;
93              
94 0 0         die "Fifth column, alteration, does not seem to be a valid entry: '$fields[4]' into file '$variation_file' at line $line\n"
95             unless $fields[4] =~ /^(\w+|-)$/;
96              
97 0 0         die "Sixth column, genotype, has an invalid entry: '$fields[5]' into file '$variation_file' at line $line. Valid ones are 'HE' or 'HO'\n"
98             unless $fields[5] =~ /^(HE|HO)$/;
99              
100 0 0         if ($fields[3] eq $fields[4]) {
101 0           warn "There is an alteration equal to the reference at '$variation_file' line $line. I will ignore it\n";
102 0           next;
103             }
104              
105             # Sequence inside perl begins at 0
106 0           my $position = int($fields[1] - 1);
107              
108             # Compare the alterations and reference to guess the max variation on sequence
109 0           my $size_of_variation = max map { length } $fields[3], $fields[4];
  0            
110 0           my $high = $position + $size_of_variation - 1;
111              
112 0           my %variation = (
113             seq_id => $fields[0],
114             id => $fields[2],
115             ref => $fields[3],
116             alt => $fields[4],
117             plo => $fields[5],
118             pos => $position,
119             low => $position,
120             high => $high,
121             line => $line
122             );
123              
124 0           push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation;
  0            
125             }
126              
127 0 0         close $fh
128             or die "Cannot close structural variation file '$variation_file': $!\n";
129              
130 0           return \%indexed_snv;
131             }
132              
133             sub _index_vcf {
134 0     0     my ($self, $vcf_file, $sample_name) = @_;
135              
136 0           my $fh = $self->with_open_r($vcf_file);
137              
138 0           my $magic = <$fh>;
139 0           chomp $magic;
140              
141 0 0         if ($magic !~ /^##fileformat=VCFv\d+(\.\d+)?$/) {
142 0           log_msg ":: Putative VCF file '$vcf_file' does not have the required 'fileformat=VCFv{VERSION}' field\n";
143 0           log_msg ":: Continue anyway ...\n";
144             }
145              
146 0           my $sample_i = 9;
147 0           my $has_header = 0;
148 0           my $line = 1;
149 0           my %indexed_snv;
150              
151 0           while (<$fh>) {
152 0           $line ++;
153              
154 0 0         next if /^##/;
155 0 0         next if /^\s*$/;
156              
157 0           chomp;
158 0           my @fields = split /\t/;
159              
160 0 0         if (/^#/) {
161 0           my $number_of_fields = scalar @fields;
162              
163 0 0         if ($number_of_fields < 8) {
    0          
    0          
164 0           die "Invalid header: Less than 8 columns at line $line\n";
165             } elsif ($number_of_fields == 8) {
166 0           die "No genotype data found in the header at line $line\n";
167             } elsif ($number_of_fields == 9) {
168 0           die "Invalid header: Missing sample data at line $line\n";
169             }
170              
171 0           my @samples = splice @fields, 9;
172              
173 0 0         if (defined $sample_name) {
174 0           my %columns;
175              
176 0           for my $i (0..$#samples) {
177 0           $columns{$samples[$i]} = $i;
178             }
179              
180 0 0         if (not exists $columns{$sample_name}) {
181 0           die "Not found sample named '$sample_name' at file '$vcf_file'\n";
182             }
183              
184 0           $sample_i += $columns{$sample_name};
185             } else {
186 0           $sample_name = $samples[$sample_i];
187             }
188              
189 0           $has_header = 1;
190 0           next;
191             }
192              
193 0 0         unless ($has_header) {
194 0           die "No header found in file '$vcf_file'\n";
195             }
196              
197 0 0         die "Not found all columns into file '$vcf_file' at line $line\n"
198             unless scalar @fields > 9;
199              
200 0 0         die "Second column, position, does not seem to be a number into file '$vcf_file' at line $line\n"
201             unless looks_like_number($fields[1]);
202              
203 0 0         die "Second column, position, has a value lesser or equal to zero into file '$vcf_file' at line $line\n"
204             if $fields[1] <= 0;
205              
206 0 0         die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$vcf_file' at line $line\n"
207             unless $fields[3] =~ /^(<\w+(:\w+)*>|\w+|-)$/;
208              
209 0 0         die "Fifth column, alternate, does not seem to be a valid entry: '$fields[4]' into file '$vcf_file' at line $line\n"
210             unless $fields[4] =~ /^(\w+|<\w+(:\w+)*>|-|\*)(,(\w+|<\w+(:\w+)*>|-|\*))*$/;
211              
212 0 0         die "No genotype field found in format column at line $line\n"
213             unless $fields[8] =~ /^GT:?/;
214              
215 0 0         die "Sample '$sample_name', column '$sample_i', does not seem to be a valid entry: '$fields[$sample_i]' into file '$vcf_file' at line $line\n"
216             unless $fields[$sample_i] =~ /^[0-9.]+([\/\|][0-9.])?:*/;
217              
218 0 0         if ($fields[3] eq $fields[4]) {
219 0           log_msg ":: There is an alternate equal to the reference at '$vcf_file' line $line. I will ignore it\n";
220 0           next;
221             }
222              
223             # No support for structural variation <VAR>
224             # So ignore it
225 0 0 0       if ($fields[3] =~ /<\w+>/ || $fields[4] =~ /<\w+>/) {
226 0           next;
227             }
228              
229             # Ignore missing alleles
230 0 0         if ($fields[4] =~ /\*/) {
231 0           next;
232             }
233              
234 0           my @alternates = split /,/ => $fields[4];
235 0           my $genotype = (split /:/ => $fields[$sample_i])[0];
236              
237 0           my ($plo, $alternate);
238 0           my ($g1, $g2) = split /\|/ => $genotype;
239              
240              
241 0 0 0       if ($g1 eq '.' || (defined $g2 && $g2 eq '.')) {
      0        
242 0           next;
243             }
244              
245             # Homozygosity or haploid chromossome
246 0 0 0       if (! defined($g2) || ($g1 == $g2)) {
247             # No variation
248 0 0         next if $g1 == 0;
249 0           $alternate = $alternates[$g1 - 1];
250 0           $plo = 'HO';
251             } else {
252             # The alternate is the one with the max index, in the case
253             # of a variation with the two alleles been alterations
254 0 0         my $i = $g1 > $g2
255             ? $g1 - 1
256             : $g2 - 1;
257 0           $alternate = $alternates[$i];
258 0           $plo = 'HE';
259             }
260              
261             # Sequence inside perl begins at 0
262 0           my $position = int($fields[1] - 1);
263              
264             # Compare the alterations and reference to guess the max variation on sequence
265 0           my $size_of_variation = max map { length } $fields[3], $alternate;
  0            
266 0           my $high = $position + $size_of_variation - 1;
267              
268 0           my %variation = (
269             seq_id => $fields[0],
270             id => $fields[2],
271             ref => $fields[3],
272             alt => $alternate,
273             plo => $plo,
274             pos => $position,
275             low => $position,
276             high => $high,
277             line => $line
278             );
279              
280 0           push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation;
  0            
281             }
282              
283 0 0         close $fh
284             or die "Cannot close vcf file '$vcf_file': $!\n";
285              
286 0           return \%indexed_snv;
287             }
288              
289             sub _validate_indexed_snv {
290 0     0     my ($self, $indexed_snv, $variation_file) = @_;
291              
292 0           for my $seq_id (keys %$indexed_snv) {
293 0           my $snvs_a = delete $indexed_snv->{$seq_id};
294 0           my @sorted_snvs = sort { $a->{low} <=> $b->{low} } @$snvs_a;
  0            
295              
296 0           my $prev_snv = $sorted_snvs[0];
297 0           my $high = $prev_snv->{high};
298 0           push my @snv_cluster => $prev_snv;
299              
300 0           for (my $i = 1; $i < @sorted_snvs; $i++) {
301 0           my $next_snv = $sorted_snvs[$i];
302              
303             # If not overlapping
304 0 0         if ($next_snv->{low} > $high) {
305 0           my $valid_snvs = $self->_validate_indexed_snv_cluster(\@snv_cluster, $variation_file);
306 0           push @{ $indexed_snv->{$seq_id} } => @$valid_snvs;
  0            
307 0           @snv_cluster = ();
308             }
309              
310 0           push @snv_cluster => $next_snv;
311 0           $high = max $high, $next_snv->{high};
312 0           $prev_snv = $next_snv;
313             }
314              
315 0           my $valid_snvs = $self->_validate_indexed_snv_cluster(\@snv_cluster, $variation_file);
316 0           push @{ $indexed_snv->{$seq_id} } => @$valid_snvs;
  0            
317             }
318             }
319              
320             sub _validate_indexed_snv_cluster {
321 0     0     my ($self, $snvs, $variation_file) = @_;
322              
323             # My rules:
324             # The biggest structural variation gains precedence.
325             # If occurs an overlapping, I search to the biggest variation among
326             # the saved alterations and compare it with the actual entry:
327             # *** Remove all overlapping variations if actual entry is bigger;
328             # *** Skip actual entry if it is lower than the biggest variation
329             # *** Insertionw can be before any alterations
330              
331 0           my @saved_snvs;
332             my %blacklist;
333              
334 0           OUTER: for (my $i = 0; $i < @$snvs; $i++) {
335 0           my $prev_snv = $snvs->[$i];
336              
337 0 0         if ($blacklist{refaddr($prev_snv)}) {
338 0           next OUTER;
339             }
340              
341 0           INNER: for (my $j = 0; $j < @$snvs; $j++) {
342 0           my $next_snv = $snvs->[$j];
343              
344 0 0 0       if (($i == $j) || $blacklist{refaddr($next_snv)}) {
345 0           next INNER;
346             }
347              
348             # Insertion after insertion
349 0 0 0       if ($next_snv->{ref} eq '-' && $prev_snv->{ref} eq '-' && $next_snv->{pos} != $prev_snv->{pos}) {
    0 0        
      0        
      0        
350 0           next INNER;
351              
352             # Insertion before alteration
353             } elsif ($next_snv->{ref} eq '-' && $prev_snv->{ref} ne '-' && $next_snv->{pos} < $prev_snv->{pos}) {
354 0           next INNER;
355              
356             # In this case, it gains the biggest one
357             } else {
358 0           my $prev_size = $prev_snv->{high} - $prev_snv->{low} + 1;
359 0           my $next_size = $next_snv->{high} - $next_snv->{low} + 1;
360              
361             my ($prev_ref, $prev_alt, $next_ref, $next_alt) =
362             map {
363 0 0         length($_) > 25
364             ? substr($_, 0, 25) . "..."
365             : $_
366 0           } ($prev_snv->{ref}, $prev_snv->{alt}, $next_snv->{ref}, $next_snv->{alt});
367              
368 0 0         if ($prev_size >= $next_size) {
369 0 0         if ($variation_file) {
370             log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s] at '%s' line %d"
371             => $prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo},
372             $next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo},
373 0           $variation_file, $next_snv->{line};
374             } else {
375             log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s]"
376             => $prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo},
377 0           $next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo};
378             }
379              
380 0           $blacklist{refaddr($next_snv)} = 1;
381 0           next INNER;
382             } else {
383 0 0         if ($variation_file) {
384             log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s] at '%s' line %d"
385             => $next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo},
386             $prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo},
387 0           $variation_file, $prev_snv->{line};
388             } else {
389             log_msg sprintf ":: Alteration [%s %d %s %s %s %s] masks [%s %d %s %s %s %s]"
390             => $next_snv->{seq_id}, $next_snv->{pos}+1, $next_snv->{id}, $next_ref, $next_alt, $next_snv->{plo},
391 0           $prev_snv->{seq_id}, $prev_snv->{pos}+1, $prev_snv->{id}, $prev_ref, $prev_alt, $prev_snv->{plo};
392             }
393              
394 0           $blacklist{refaddr($prev_snv)} = 1;
395 0           next OUTER;
396             }
397             }
398             }
399              
400 0           push @saved_snvs => $prev_snv;
401             }
402              
403 0           return \@saved_snvs;
404             }
405              
406             sub retrievedb {
407 0     0 0   my ($self, $structural_variation) = @_;
408 0           my $schema = App::Sandy::DB->schema;
409              
410 0           my %all_matrix;
411              
412 0           for my $sv (@$structural_variation) {
413 0           my $rs = $schema->resultset('StructuralVariation')->find({ name => $sv });
414 0 0         die "'$sv' not found into database\n" unless defined $rs;
415              
416 0           my $compressed = $rs->matrix;
417 0 0         die "structural variation entry '$sv' exists, but the related data is missing\n"
418             unless defined $compressed;
419              
420 0           gunzip \$compressed => \my $bytes;
421 0           my $matrix = thaw $bytes;
422              
423 0           while (my ($seq_id, $data) = each %$matrix) {
424 0           push @{ $all_matrix{$seq_id} } => @$data;
  0            
425             }
426             }
427              
428 0 0         if (scalar @$structural_variation > 1) {
429 0           log_msg sprintf ":: Removing overlapping entries from '[%s]'. If any ..."
430             => join(', ', @$structural_variation);
431 0           $self->_validate_indexed_snv(\%all_matrix);
432             }
433              
434 0           return \%all_matrix;
435             }
436              
437             sub deletedb {
438 0     0 0   my ($self, $structural_variation) = @_;
439 0           my $schema = App::Sandy::DB->schema;
440              
441 0           log_msg ":: Checking if there is already a structural variation '$structural_variation' ...";
442 0           my $rs = $schema->resultset('StructuralVariation')->find({ name => $structural_variation });
443 0 0         die "'$structural_variation' not found into database\n" unless defined $rs;
444              
445 0           log_msg ":: Found '$structural_variation'";
446 0 0         die "'$structural_variation' is not a user provided entry. Cannot be deleted\n"
447             unless $rs->is_user_provided;
448              
449 0           log_msg ":: Removing '$structural_variation' entry ...";
450              
451             # Begin transation
452 0           my $guard = $schema->txn_scope_guard;
453              
454 0           $rs->delete;
455              
456             # End transation
457 0           $guard->commit;
458             }
459              
460             sub restoredb {
461 0     0 0   my $self = shift;
462 0           my $schema = App::Sandy::DB->schema;
463              
464 0           log_msg ":: Searching for user-provided entries ...";
465 0           my $user_provided = $schema->resultset('StructuralVariation')->search(
466             { is_user_provided => 1 }
467             );
468              
469 0           my $entry = $user_provided->next;
470 0 0         if ($entry) {
471 0           log_msg ":: Found:";
472 0           do {
473 0           log_msg ' ==> ' . $entry->name;
474             } while ($entry = $user_provided->next);
475             } else {
476 0           die "Not found user-provided entries. There is no need to restoring\n";
477             }
478              
479 0           log_msg ":: Removing all user-provided entries ...";
480              
481             # Begin transation
482 0           my $guard = $schema->txn_scope_guard;
483              
484 0           $user_provided->delete;
485              
486             # End transation
487 0           $guard->commit;
488             }
489              
490             sub make_report {
491 0     0 0   my $self = shift;
492 0           my $schema = App::Sandy::DB->schema;
493 0           my %report;
494              
495 0           my $rs = $schema->resultset('StructuralVariation')->search(undef);
496              
497 0           while (my $structural_variation = $rs->next) {
498 0 0         my %hash = (
499             source => $structural_variation->source,
500             provider => $structural_variation->is_user_provided ? "user" : "vendor",
501             date => $structural_variation->date
502             );
503 0           $report{$structural_variation->name} = \%hash;
504             }
505              
506 0           return \%report;
507             }
508              
509             __END__
510              
511             =pod
512              
513             =encoding UTF-8
514              
515             =head1 NAME
516              
517             App::Sandy::DB::Handle::Variation - Class to handle structural variation database schemas.
518              
519             =head1 VERSION
520              
521             version 0.22
522              
523             =head1 AUTHORS
524              
525             =over 4
526              
527             =item *
528              
529             Thiago L. A. Miller <tmiller@mochsl.org.br>
530              
531             =item *
532              
533             J. Leonel Buzzo <lbuzzo@mochsl.org.br>
534              
535             =item *
536              
537             Felipe R. C. dos Santos <fsantos@mochsl.org.br>
538              
539             =item *
540              
541             Helena B. Conceição <hconceicao@mochsl.org.br>
542              
543             =item *
544              
545             Gabriela Guardia <gguardia@mochsl.org.br>
546              
547             =item *
548              
549             Fernanda Orpinelli <forpinelli@mochsl.org.br>
550              
551             =item *
552              
553             Pedro A. F. Galante <pgalante@mochsl.org.br>
554              
555             =back
556              
557             =head1 COPYRIGHT AND LICENSE
558              
559             This software is Copyright (c) 2018 by Teaching and Research Institute from Sírio-Libanês Hospital.
560              
561             This is free software, licensed under:
562              
563             The GNU General Public License, Version 3, June 2007
564              
565             =cut