File Coverage

blib/lib/BioX/Wrapper/Annovar.pm
Criterion Covered Total %
statement 24 204 11.7
branch 0 42 0.0
condition 0 3 0.0
subroutine 8 24 33.3
pod n/a
total 32 273 11.7


line stmt bran cond sub pod time code
1             package BioX::Wrapper::Annovar;
2              
3             #use 5.006;
4              
5 1     1   28863 use Moose;
  1         453748  
  1         8  
6 1     1   7071 use File::Find::Rule;
  1         7196  
  1         8  
7 1     1   44 use File::Basename;
  1         7  
  1         65  
8 1     1   5 use File::Path qw(make_path remove_tree);
  1         0  
  1         45  
9 1     1   4 use File::Find::Rule;
  1         1  
  1         3  
10 1     1   25 use Cwd;
  1         1  
  1         43  
11 1     1   733 use IO::Uncompress::Gunzip;
  1         40044  
  1         48  
12 1     1   606 use Data::Dumper;
  1         5041  
  1         2574  
13              
14             require Vcf;
15              
16             extends 'BioX::Wrapper';
17             with 'MooseX::Getopt';
18              
19             =head1 NAME
20              
21             BioX::Wrapper::Annovar - A wrapper around the annovar annotation pipeline
22              
23             =head1 VERSION
24              
25             Version 0.06
26              
27             =cut
28              
29             our $VERSION = '0.39';
30              
31              
32             =head1 SYNOPSIS
33              
34             annovar-wrapper.pl --vcfs file1.vcf,file2.vcf --annovardb_path /path/to/annovar/dbs
35              
36             This module is a wrapper around the popular annotation tool, annovar. http://www.openbioinformatics.org/annovar/ . The commands generated are taken straight from the documentation. In addition, there is an option to reannotate using vcf-annotate from vcftools.
37              
38             It takes as its input a list or directory of vcf files, bgzipped and tabixed or not, and uses annovar to create annotation files. These multianno table files can be optionally reannotated into the vcf file. This script does not actually execute any commands, only writes them to STDOUT for the user to run as they wish.
39              
40             It comes with an executable script annovar-wrapper.pl. This should be sufficient for most of your needs, but if you wish to overwrite methods you can always do so in the usual Moose fashion.
41              
42             #!/usr/bin/env perl
43              
44             package Main;
45              
46             use Moose;
47             extends 'BioX::Wrapper::Annovar';
48              
49             BioX::Wrapper::Annovar->new_with_options->run;
50              
51             sub method_to_override {
52             my $self = shift;
53              
54             #dostuff
55             };
56              
57             before 'method' => sub {
58             my $self = shift;
59              
60             #dostuff
61             };
62              
63             has '+variable' => (
64             #things to add to variable declaration
65             );
66              
67             #or
68            
69             has 'variable' => (
70             #override variable declaration
71             );
72              
73             1;
74              
75             Please see the Moose::Manual::MethodModifiers for more information.
76              
77             =head1 Prerequisites
78              
79             This module requires the annovar download. The easiest thing to do is to put the annovar scripts in your ENV{PATH}, but if you choose not to do this you can also pass in the location with
80              
81             annovar-wrapper.pl --tableannovar_path /path/to/table_annovar.pl --convert2annovar_path /path/to/convert2annovar.pl
82              
83             It requires Vcf.pm, which comes with vcftools.
84              
85             Vcftools is publicly available for download. http://vcftools.sourceforge.net/.
86              
87             export PERL5LIB=$PERL5LIB:path_to_vcftools/perl
88              
89             If you wish to you reannotate the vcf file you need to have bgzip and tabix installed, and have the executables in vcftools in your path.
90              
91             export PATH=$PATH:path_to_vcftools
92              
93             =head1 Generate an Example
94              
95             To generate an example you can run the following commands
96              
97             tabix -h ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz 2:39967768-40000000 > test.vcf
98             bgzip test.vcf
99             tabix test.vcf.gz
100             vcf-subset -c HG00098,HG00100,HG00106,HG00112,HG00114 test.vcf.gz | bgzip -c > out.vcf.gz
101             tabix out.vcf.gz
102             rm test.vcf.gz
103             rm test.vcf.gz.tbi
104              
105             annovar-wrapper.pl --vcfs out.vcf.gz --annovar_dbs refGene --annovar_fun g --outdir annovar_out --annovardb_path /path/to/annovar/dbs > my_cmds.sh
106              
107             There is more detail on the example in the pod files.
108              
109             =cut
110              
111             =head1 Variables
112              
113             =cut
114              
115             =head2 Annovar Options
116              
117             =cut
118              
119             =head3 tableannovar_path
120              
121             You can put the location of the annovar scripts in your ENV{PATH}, and the default is fine. If annovar is not in your PATH, please supply the location.
122              
123             =cut
124              
125             has 'tableannovar_path' => (
126             is => 'rw',
127             isa => 'Str',
128             required => 1,
129             default => "table_annovar.pl"
130             );
131              
132             =head3 convert2annovar_path
133              
134             You can put the location of the annovar scripts in your ENV{PATH}, and the default is fine. If annovar is not in your PATH, please supply the location
135              
136             =cut
137              
138             has 'convert2annovar_path' => (
139             is => 'rw',
140             isa => 'Str',
141             required => 1,
142             default => "convert2annovar.pl"
143             );
144              
145             =head3 annovardb_path
146              
147             Path to your annovar databases
148              
149             =cut
150              
151             has 'annovardb_path' => (
152             is => 'rw',
153             isa => 'Str',
154             default => '/data/apps/software/annovar/hg19',
155             required => 1,
156             );
157              
158             =head3 buildver
159              
160             Its probably hg19 or hg18
161              
162             =cut
163              
164             has 'buildver' => (
165             is => 'rw',
166             isa => 'Str',
167             default => 'hg19',
168             required => 1,
169             );
170              
171             =head3 convert2annovar_opts
172              
173             Assumes vcf version 4 and that you want to convert all samples
174              
175             Not using --allsample on a multisample vcf is untested and will probably break the whole pipeline
176              
177             =cut
178              
179             has 'convert2annovar_opts' => (
180             is => 'rw',
181             isa => 'Str',
182             default => '-format vcf4 --allsample',
183             );
184              
185             =head3 annovar_dbs
186              
187             These are pretty much all the databases listed on
188              
189             http://www.openbioinformatics.org/annovar/annovar_download.html for hg19 that I tested as working
190              
191             #Download databases with
192              
193             cd path_to_annovar_dir
194             ./annotate_variation.pl --buildver hg19 -downdb -webfrom annovar esp6500si_aa hg19/
195              
196             #Option is an ArrayRef, and can be given as either
197              
198             --annovar_dbs cg46,cg69,nci60
199              
200             #or
201              
202             --annovar_dbs cg46 --annovar_dbs cg69 --annovar_dbs nci60
203              
204             =cut
205             #TODO
206             #Add in a hashref so I don't have to remember the list
207             #The following are redundant within other databases
208             #esp are in popfreq_all
209             #esp6500si_aa
210             #esp6500si_ea
211             #ljb23 are in ljb23_all
212             #ljb23_fathmm
213             #ljb23_gerp++
214             #ljb23_lrt
215             #ljb23_ma
216             #ljb23_metalr
217             #ljb23_metasvm
218             #ljb23_mt
219             #ljb23_phylop
220             #ljb23_pp2hdiv
221             #ljb23_pp2hvar
222             #ljb23_sift
223             #ljb23_siphy
224             #ljb2_pp2hvar
225             #Leaving out cg46
226             #
227             ## The following have been tested
228             # snp138NonFlagged,snp138,popfreq_all,cg69,cosmic68wgs,clinvar_20140211,gwasCatalog,caddgt20,phastConsElements46way,gerp++elem,wgEncodeBroadHmmGm12878HMM,wgEncodeUwDnaseGm12878HotspotsRep2,ljb23_all,refGene
229              
230             has 'annovar_dbs' => (
231             is => 'rw',
232             isa => 'ArrayRef',
233             required => 0,
234             default => sub {
235             return [qw(snp138NonFlagged
236             snp138
237             popfreq_all
238             cg69
239             cosmic68wgs
240             clinvar_20140211
241             gwasCatalog
242             caddgt20
243             phastConsElements46way
244             gerp++elem
245             wgEncodeBroadHmmGm12878HMM
246             wgEncodeUwDnaseGm12878HotspotsRep2
247             ljb23_all
248             refGene
249             )]
250             }
251             );
252              
253             =head3 annovar_fun
254              
255             Functions of the individual databases can be found at
256              
257             What function your DB may already be listed otherwise it is probably listed in the URLS under Annotation: Gene-Based, Region-Based, or Filter-Based
258              
259             Functions must be given in the corresponding order of your annovar_dbs
260              
261             #Option is an ArrayRef, and can be given as either
262              
263             --annovar_fun f,f,g
264              
265             #or
266              
267             --annovar_fun f --annovar_fun f --annovar_fun g
268              
269             =cut
270              
271             has 'annovar_fun' => (
272             is => 'rw',
273             isa => 'ArrayRef',
274             required => 0,
275             default => sub {
276             return [qw(f
277             f
278             f
279             f
280             f
281             f
282             r
283             f
284             r
285             f
286             r
287             r
288             f
289             g)]
290             }
291             );
292              
293             =head3 annovar_cols
294              
295             Some database annotations generate multiple columns. For reannotating the vcf we need to know what these columns are. Below are the columns generated for the databases given in annovar_dbs
296              
297             To add give a hashref of array
298              
299             =cut
300              
301             has 'annovar_cols' => (
302             is => 'rw',
303             isa => 'HashRef',
304             required => 0,
305             default => sub {
306             my $href = {};
307             #Old table_annovar.pl script
308             # $href->{popfreq_max} = ["PopFreqMax"];
309             # $href->{popfreq_all} = ["PopFreqMax","1000G2012APR_ALL","1000G2012APR_AFR","1000G2012APR_AMR","1000G2012APR_ASN","1000G2012APR_EUR","ESP6500si_AA","ESP6500si_EA","CG46","NCI60","SNP137","COSMIC65","DISEASE"];
310             # $href->{refGene} = ["Func.refGene","Gene.refGene","ExonicFunc.refGene","AAChange.refGene"];
311             # $href->{ljb_all} = [ qw/LJB_PhyloP LJB_PhyloP_Pred LJB_SIFT LJB_SIFT_Pred LJB_PolyPhen2 LJB_PolyPhen2_Pred LJB_LRT LJB_LRT_Pred LJB_MutationTaster LJB_MutationTaster_Pred LJB_GERP++/ ];
312             # $href->{ljb2_all} = [ qw/LJB2_SIFT LJB2_PolyPhen2_HDIV LJB2_PP2_HDIV_Pred LJB2_PolyPhen2_HVAR LJB2_PolyPhen2_HVAR_Pred LJB2_LRT LJB2_LRT_Pred LJB2_MutationTaster LJB2_MutationTaster_Pred LJB_MutationAssessor LJB_MutationAssessor_Pred LJB2_FATHMM LJB2_GERP++ LJB2_PhyloP LJB2_SiPhy/ ];
313             # $href->{ljb23_all} = [ qw/LJB23_SIFT_score LJB23_SIFT_score_converted LJB23_SIFT_pred LJB23_Polyphen2_HDIV_score LJB23_Polyphen2_HDIV_pred LJB23_Polyphen2_HVAR_score LJB23_Polyphen2_HVAR_pred LJB23_LRT_score LJB23_LRT_score_converted LJB23_LRT_pred LJB23_MutationTaster_score LJB23_MutationTaster_score_converted LJB23_MutationTaster_pred LJB23_MutationAssessor_score LJB23_MutationAssessor_score_converted LJB23_MutationAssessor_pred LJB23_FATHMM_score LJB23_FATHMM_score_converted LJB23_FATHMM_pred LJB23_RadialSVM_score LJB23_RadialSVM_score_converted LJB23_RadialSVM_pred LJB23_LR_score LJB23_LR_pred LJB23_GERP++ LJB23_PhyloP LJB23_SiPhy/ ];
314              
315             # #Which one?
316             # #$href->{refGene} = ["Func.refGene","Gene.refGene", "ExonicFunc.refGene","AAChange.refGene"];
317             $href->{refGene} = ["Func.refGene","Gene.refGene","GeneDetail.refGene", "ExonicFunc.refGene","AAChange.refGene"];
318             $href->{ljb_all} = [ qw/LJB_PhyloP LJB_PhyloP_Pred LJB_SIFT LJB_SIFT_Pred LJB_PolyPhen2 LJB_PolyPhen2_Pred LJB_LRT LJB_LRT_Pred LJB_MutationTaster LJB_MutationTaster_Pred LJB_GERPPP/ ];
319             $href->{ljb2_all} = [ qw/LJB2_SIFT LJB2_PolyPhen2_HDIV LJB2_PP2_HDIV_Pred LJB2_PolyPhen2_HVAR LJB2_PolyPhen2_HVAR_Pred LJB2_LRT LJB2_LRT_Pred LJB2_MutationTaster LJB2_MutationTaster_Pred LJB_MutationAssessor LJB_MutationAssessor_Pred LJB2_FATHMM LJB2_GERPPP LJB2_PhyloP LJB2_SiPhy/ ];
320             $href->{ljb23_all} = [ qw/LJB23_SIFT_score LJB23_SIFT_score_converted LJB23_SIFT_pred LJB23_Polyphen2_HDIV_score LJB23_Polyphen2_HDIV_pred LJB23_Polyphen2_HVAR_score LJB23_Polyphen2_HVAR_pred LJB23_LRT_score LJB23_LRT_score_converted LJB23_LRT_pred LJB23_MutationTaster_score LJB23_MutationTaster_score_converted LJB23_MutationTaster_pred LJB23_MutationAssessor_score LJB23_MutationAssessor_score_converted LJB23_MutationAssessor_pred LJB23_FATHMM_score LJB23_FATHMM_score_converted LJB23_FATHMM_pred LJB23_RadialSVM_score LJB23_RadialSVM_score_converted LJB23_RadialSVM_pred LJB23_LR_score LJB23_LR_pred LJB23_GERPPP LJB23_PhyloP LJB23_SiPhy/ ];
321             $href->{popfreq_all} = [ qw/PopFreqMax 1000G2012APR_ALL 1000G2012APR_AFR 1000G2012APR_AMR 1000G2012APR_ASN 1000G2012APR_EUR ESP6500si_ALL ESP6500si_AA ESP6500si_EA CG46/ ];
322             return $href;
323             }
324             );
325              
326             =head3 vcfs
327              
328             VCF files can be given individually as well.
329              
330             #Option is an ArrayRef and can be given as either
331              
332             --vcfs 1.vcf,2.vcf,3.vcfs
333              
334             #or
335              
336             --vcfs 1.vcf --vcfs 2.vcf --vcfs 3.vcf
337              
338             Don't mix the methods
339              
340             =cut
341              
342             has 'vcfs' => (
343             is => 'rw',
344             isa => 'ArrayRef',
345             required => 0,
346             );
347              
348              
349              
350             =head3 annotate_vcf
351              
352             Use vcf-annotate from VCF tools to annotate the VCF file
353              
354             This does not overwrite the original VCF file, but instead creates a new one
355              
356             To turn this off
357              
358             annovar-wrapper.pl --annotate_vcf 0
359              
360             =cut
361              
362             has 'annotate_vcf' => (
363             is => 'rw',
364             isa => 'Bool',
365             default => 1,
366             required => 1,
367             );
368              
369             =head2 Internal Variables
370             =cut
371              
372             has 'samples' => (
373             is => 'rw',
374             isa => 'HashRef',
375             required => 0,
376             default => sub { return {} },
377             );
378              
379             has 'orig_samples' => (
380             is => 'rw',
381             isa => 'HashRef',
382             required => 0,
383             default => sub { return {} },
384             );
385              
386             has 'file' => (
387             traits => ['String'],
388             is => 'rw',
389             isa => 'Str',
390             required => 0,
391             default => sub { return "" },
392             handles => {
393             match_file => 'match',
394             },
395             );
396              
397             has 'fname' => (
398             is => 'rw',
399             isa => 'Str',
400             required => 0,
401             default => sub { return "" },
402             );
403              
404             =head1 SUBROUTINES/METHODS
405              
406             =cut
407              
408             =head2 run
409              
410             Subroutine that starts everything off
411              
412             =cut
413              
414             sub run {
415 0     0     my($self) = @_;
416              
417 0           $self->print_opts;
418              
419 0           $self->check_files;
420 0           $self->parse_commands;
421 0           $self->find_vcfs;
422 0           $self->write_annovar;
423             }
424              
425             =head2 print_opts
426              
427             Print out the command line options
428              
429             =cut
430              
431             sub print_opts {
432 0     0     my($self) = @_;
433              
434 0           print "## This file was generated with the options\n";
435 0           for(my $x=0; $x<=$#ARGV; $x++){
436 0 0         next unless $ARGV[$x];
437 0           print "#\t$ARGV[$x]\t".$ARGV[$x+1]."\n";
438 0           $x++;
439             }
440 0           print "\n";
441             }
442              
443             =head2 check_files
444              
445             Check to make sure either an indir or vcfs are supplied
446              
447             =cut
448              
449             sub check_files{
450 0     0     my($self) = @_;
451 0           my($t);
452              
453 0 0 0       die print "Must specificy an indirectory or vcfs!\n" if (!$self->indir && !$self->vcfs);
454            
455 0 0         if($self->indir){
456 0           $t = $self->indir;
457 0           $t =~ s/\/$//g;
458 0           $self->indir($t);
459             }
460              
461 0           $t = $self->outdir;
462 0           $t =~ s/\/$//g;
463 0           $t = $t."/annovar-wrapper";
464 0           $self->outdir($t);
465              
466             #make the outdir
467 0 0         make_path($self->outdir) if ! -d $self->outdir;
468 0 0         make_path($self->outdir."/annovar_interim") if ! -d $self->outdir."/annovar_interim";
469 0 0         make_path($self->outdir."/annovar_final") if ! -d $self->outdir."/annovar_final";
470             }
471              
472             =head2 find_vcfs
473              
474             Use File::Find::Rule to find the vcfs
475              
476             =cut
477              
478             sub find_vcfs{
479 0     0     my($self) = @_;
480              
481 0 0         return if $self->vcfs;
482 0           $self->vcfs([]);
483              
484 0           my $rule = File::Find::Rule->file->name(qr/(vcf|vcf\.gz)$/)->start( $self->indir);
485 0           while ( defined ( my $file = $rule->match ) ) {
486 0           push(@{$self->vcfs}, $file);
  0            
487             }
488              
489 0 0         die print "No vcfs were found!\n" unless $self->vcfs;
490             }
491              
492             =head2 parse_commands
493              
494             Allow for giving ArrayRef either in the usual fashion or with commas
495              
496             =cut
497              
498             sub parse_commands{
499 0     0     my($self) = @_;
500              
501 0 0         if($#{$self->annovar_dbs} == 0){
  0            
502 0           my @tmp = split(",", $self->annovar_dbs->[0]);
503 0           $self->annovar_dbs(\@tmp);
504             }
505 0 0         if($#{$self->annovar_fun} == 0){
  0            
506 0           my @tmp = split(",", $self->annovar_fun->[0]);
507 0           $self->annovar_fun(\@tmp);
508             }
509              
510 0 0         return unless $self->vcfs;
511 0 0         if($#{$self->vcfs} == 0){
  0            
512 0           my @tmp = split(",", $self->vcfs->[0]);
513 0           $self->vcfs(\@tmp);
514             }
515             }
516              
517             =head2 write_annovar
518              
519             Write the commands that
520              
521             Convert the vcf file to annovar input
522             Do the annotations
523             Reannotate the vcf - if you want
524              
525             =cut
526              
527             sub write_annovar{
528 0     0     my($self) = @_;
529              
530 0 0         die print "Dbs are ".scalar @{$self->annovar_dbs}."\nand Funcs are ".scalar @{$self->annovar_fun}." \n" unless scalar @{$self->annovar_dbs} == scalar @{$self->annovar_fun};
  0            
  0            
  0            
  0            
531              
532             # #Convert vcf to annovar
533 0           print "## Converting to annovar input\n\n";
534 0           foreach my $file (@{$self->vcfs}){
  0            
535 0           print "## Processing file $file\n\n";
536            
537 0           $self->file($file);
538 0           my $tname = basename($self->file);
539 0           $tname =~ s/\.vcf$|\.vcf\.gz$//;
540 0           $self->fname($tname);
541             # $self->fname(basename($self->file));
542 0           $self->get_samples;
543 0           $self->convert_annovar;
544              
545             }
546 0           print "## Wait for all convert commands to complete\n";
547 0           print "wait\n\n";
548              
549             # #Annotate annovar input
550              
551 0           $self->iter_vcfs('table_annovar');
552              
553 0           print "## Wait for all table commands to complete\n";
554 0           print "wait\n\n";
555              
556             # #Annotate the VCF
557             # #This is only relevant for putting it back into vcfs
558              
559 0 0         return unless $self->annotate_vcf;
560              
561 0           $self->iter_vcfs('gen_descr');
562            
563 0           print "## Wait for file copying bgzip and tabix to finish...\n";
564 0           print "wait\n\n";
565              
566 0           $self->iter_vcfs('gen_annot');
567              
568 0           print "## Wait for all vcf-annotate commands to complete\n";
569 0           print "wait\n\n";
570              
571 0           $self->iter_vcfs('merge_vcfs');
572              
573             }
574              
575             =head2 iter_vcfs
576             Iterate over the vcfs with some changes for lookups
577             =cut
578             sub iter_vcfs{
579             # my $self = shift;
580 0     0     my($self, $fun) = @_;
581              
582 0           foreach my $file (@{$self->vcfs}){
  0            
583 0           $self->file($file);
584 0           my $tname = basename($self->file);
585 0           $tname =~ s/\.vcf$|\.vcf\.gz$//;
586 0           $self->fname($tname);
587             # $self->fname(basename($self->file));
588             #Make this a parameter of the script
589 0           $self->$fun;
590             }
591              
592             }
593              
594             =head2 get_samples
595              
596             Using VCF tools get the samples listed per vcf file
597              
598             Supports files that are bgzipped or not
599              
600             Sample names are stripped of all non alphanumeric characters.
601              
602             =cut
603              
604             sub get_samples{
605 0     0     my($self) = @_;
606              
607 0           my(@samples, $vcf, $out, $fh);
608              
609             #This doesn't work in large vcf files, end up with funny blocks
610             # if($self->match_file(qr/gz$/)){
611             # $fh = new IO::Uncompress::Gunzip $self->file or warn "## File handle didn't work for gzipped file ".$self->file."\n\n";
612             # }
613             # else{
614             # $fh = new IO::File $self->file, "r" or warn "## File handle didn't work for ".$self->file."\n\n";
615             # }
616              
617             # next unless $fh;
618              
619 0           $vcf = Vcf->new(file => $self->file);
620 0           $vcf->parse_header();
621 0           (@samples) = $vcf->get_samples();
622              
623             # #TODO Have this in a proper debug msg
624             # print "##Before transform samples are :\n##".join("\n##", @samples)."\n";
625              
626             #Keep the original samples names for subsetting the vcf
627 0           my(@tmp) = @samples;
628 0           $self->orig_samples->{$self->file} = \@tmp;
629              
630             #Must keep this the same as annovar!
631             # #I think annovar got rid of these in the most recent implementation
632             # 2014-12-20 Downloaded new annovar
633             # @samples = map { s/[^A-Za-z0-9\-\.]//g; $_ } @samples;
634             # @samples = map { s/^\.//g; $_ } @samples;
635              
636             # #TODO Have this in a proper debug msg
637             # print "##After transform samples are :\n##".join("\n##", @samples)."\n";
638 0           $vcf->close();
639              
640             # #TODO Put a warning msg here
641 0 0         die print "There are no samples!\n" unless @samples;
642              
643 0           $self->samples->{$self->file} = \@samples;
644              
645 0           print "##Original samples names are :\n##".join("\n##", @{$self->orig_samples->{$self->file}})."\n";
  0            
646 0           print "##Annovar samples names are :\n##".join("\n##", @{$self->samples->{$self->file}})."\n";
  0            
647             }
648              
649             =head2 convert_annovar
650              
651             Print out the command to print the convert2annovar commands
652              
653             =cut
654              
655             sub convert_annovar{
656 0     0     my($self) = @_;
657              
658 0           print $self->convert2annovar_path." ".$self->convert2annovar_opts." ".$self->file." \\\n--outfile ".$self->outdir."/".$self->fname.".annovar\n\n";
659             }
660              
661             =head2 table_annovar
662              
663             Print out the commands to generate the annotation using table_annovar.pl command.
664             =cut
665              
666             sub table_annovar{
667 0     0     my($self) = @_;
668              
669 0           print "## Generating annotations\n\n";
670 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
671 0           print "## Processing sample $sample\n";
672 0           print $self->tableannovar_path." ".$self->outdir."/".$self->fname.".annovar.$sample.avinput \\\n ".$self->annovardb_path." --buildver ".$self->buildver." \\\n -protocol ".join(",", @{$self->annovar_dbs})." \\\n -operation ".join(",", @{$self->annovar_fun})." \\\n -nastring NA --outfile ".$self->outdir."/".$self->fname.".annovar.$sample \\\n";
  0            
  0            
673 0           print "&& find ".$self->outdir."/ |grep ".$self->outdir."/".$self->fname.".annovar.$sample | grep -v \"multianno\" | xargs -i -t mv {} ".$self->outdir."/annovar_interim \\\n";
674 0           print "&& find ".$self->outdir."/ |grep ".$self->outdir."/".$self->fname.".annovar.$sample | grep \"avinput\$\" | xargs -i -t mv {} ".$self->outdir."/annovar_interim \\\n";
675 0           print "&& find ".$self->outdir."/ |grep ".$self->outdir."/".$self->fname.".annovar.$sample | grep \"multianno\" | xargs -i -t mv {} ".$self->outdir."/annovar_final\n\n";
676             }
677              
678             }
679              
680             =head2 vcf_annotate
681              
682             Generate the commands to annotate the vcf file using vcf-annotate
683              
684             =cut
685              
686             sub vcf_annotate{
687 0     0     my($self) = @_;
688              
689 0           $self->gen_descr;
690 0           $self->gen_annot;
691              
692              
693 0           $self->merge_vcfs;
694             }
695              
696             =head2 gen_descr
697              
698             Bgzip, tabix, all of vcftools, and sort must be in your PATH for these to work.
699              
700              
701             There are two parts to this command.
702              
703             The first prepares the annotation file.
704              
705             1. The annotation file is backed up just in case
706             2. The annotation file is sorted, because I had some problems with sorting
707             3. The annotation file is bgzipped, as required by vcf-annotate
708             4. The annotation file is tabix indexed using the special commands -s 1 -b 2 -e 3
709              
710             The second writes out the vcf-annotate commands
711              
712             Example with RefGene
713             zcat ../../variants.vcf.gz | vcf-annotate -a sorted.annotation.gz \
714             -d key=INFO,ID=SAMPLEID_Func_refGene,Number=0,Type=String,Description='SAMPLEID Annovar Func_refGene' \
715             -d key=INFO,ID=SAMPLEID_Gene_refGene,Number=0,Type=String,Description='SAMPLEID Annovar Gene_refGene' \
716             -d key=INFO,ID=SAMPLEID_ExonicFun_refGene,Number=0,Type=String,Description='SAMPLEID Annovar ExonicFun_refGene' \
717             -d key=INFO,ID=SAMPLEID_AAChange_refGene,Number=0,Type=String,Description='SAMPLEID Annovar AAChange_refGene' \
718             -c CHROM,FROM,TO,-,-,INFO/SAMPLEID_Func_refGene,INFO/SAMPLEID_Gene_refGene,INFO/SAMPLEID_ExonicFun_refGene,INFO/SAMPLEID_AAChange_refGene > SAMPLEID.annotated.vcf
719              
720             =cut
721              
722             sub gen_descr{
723 0     0     my($self) = @_;
724              
725 0           print "##Prepare to reannotate VCF files\n\n";
726            
727 0 0         make_path($self->outdir."/vcf-annotate_interim") if ! -d $self->outdir."/vcf-annotate_interim";
728 0 0         make_path($self->outdir."/vcf-annotate_final") if ! -d $self->outdir."/vcf-annotate_final";
729              
730 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
731 0           print "cp ".$self->outdir."/annovar_final/".$self->fname.".annovar.$sample.hg19_multianno.txt \\\n";
732 0           print $self->outdir."/vcf-annotate_interim/".$self->fname.".annovar.$sample.hg19_multianno.txt \\\n";
733 0           print "&& sed -i 's/;/,/;s/=/->/;s/GERP++/GERPPP/;s/+/P/'g ".$self->outdir."/vcf-annotate_interim/".$self->fname.".annovar.$sample.hg19_multianno.txt \\\n";
734 0           print "&& sort -k1,1 -k2,2n ".$self->outdir."/vcf-annotate_interim/".$self->fname.".annovar.$sample.hg19_multianno.txt > ";
735 0           print $self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt \\\n";
736 0           print "&& bgzip -f ".$self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt \\\n";
737 0           print "&& tabix -s 1 -b 2 -e 3 ".$self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt.gz \n\n";
738             }
739              
740             }
741              
742             =head2 gen_annot
743             =cut
744             sub gen_annot {
745 0     0     my $self = shift;
746              
747 0           print "##Reannotate VCF files\n\n";
748 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
749 0 0         if($self->match_file(qr/gz$/)){
750 0           print "zcat ".$self->file." | ";
751             }
752             else{
753 0           print "cat ".$self->file." | ";
754             }
755 0           print "vcf-annotate -a ".$self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt.gz \\\n";
756            
757 0           foreach my $db (@{$self->annovar_dbs}){
  0            
758 0 0         if(exists $self->annovar_cols->{$db}){
759 0           my $tmp = $self->annovar_cols->{$db};
760              
761             #Test this!!!
762 0           $db =~ s/\+/P/g;
763 0           $db =~ s/\W/_/g;
764              
765 0           foreach my $t (@$tmp){
766 0           $t =~ s/\+/P/g;
767 0           $t =~ s/\W/_/g;
768 0           print <<EOF;
769             -d key=INFO,ID=$sample.annovar.$db.$t,Number=0,Type=String,Description='Annovar $sample $db $t' \\
770             EOF
771             }
772             }
773             else{
774 0           print <<EOF;
775             -d key=INFO,ID=$sample.annovar.$db,Number=0,Type=String,Description='Annovar gen $sample $db' \\
776             EOF
777             }
778              
779             }
780              
781 0           $self->gen_cols($sample);
782             }
783            
784             }
785             =head2 gen_cols
786              
787             Generate the -c portion of the vcf-annotate command
788              
789             =cut
790              
791             sub gen_cols{
792 0     0     my($self, $sample) = @_;
793              
794 0           my $cols = "-c CHROM,FROM,TO,-,-";
795              
796 0           foreach my $db (@{$self->annovar_dbs}){
  0            
797 0 0         if(exists $self->annovar_cols->{$db}){
798 0           my $tmp = $self->annovar_cols->{$db};
799              
800 0           foreach my $t (@$tmp){
801 0           $cols .= ",INFO/$sample.annovar.$db.$t";
802             }
803             }
804             else{
805 0           $cols .= ",INFO/$sample.annovar.$db";
806             }
807              
808             }
809              
810 0           print $cols." | bgzip -f -c > ".$self->outdir."/vcf-annotate_final/".$self->fname.".$sample.annovar.vcf.gz && \\\n";
811 0           print "tabix -p vcf ".$self->outdir."/vcf-annotate_final/".$self->fname.".$sample.annovar.vcf.gz\n\n";
812              
813             }
814              
815             =head2 merge_vcfs
816              
817             There is one vcf-annotated file per sample, so merge those at the the end to get a multisample file using vcf-merge
818              
819             =cut
820              
821             sub merge_vcfs {
822 0     0     my($self) = @_;
823              
824 0 0         return if scalar @{$self->samples->{$self->file}} == 1;
  0            
825              
826 0           print "##Merge single sample VCF files\n\n";
827              
828 0           print "vcf-merge \\\n";
829 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
830 0           print $self->outdir."/vcf-annotate_final/".$self->fname.".$sample.annovar.vcf.gz \\\n";
831             }
832 0           print " | bgzip -f -c > ".$self->outdir."/vcf-annotate_interim/".$self->fname.".allsamples.annovar.vcf.gz \\\n";
833 0           print "&& tabix -p vcf ".$self->outdir."/vcf-annotate_interim/".$self->fname.".allsamples.annovar.vcf.gz\n";
834              
835 0           print "\nwait\n\n";
836              
837 0           $self->subset_vcfs();
838              
839             }
840              
841             =head2 subset_vcfs
842              
843             vcf-merge used in this fashion will create a lot of redundant columns, because it wants to assume all sample names are unique
844              
845             Straight from the vcftools documentation
846              
847             vcf-subset -c NA0001,NA0002 file.vcf.gz | bgzip -c > out.vcf.gz
848              
849             =cut
850              
851             sub subset_vcfs {
852 0     0     my($self) = @_;
853              
854 0           print "##Subsetting the files to get rid of redundant info\n\n";
855              
856 0           my $str = join(",", @{$self->orig_samples->{$self->file}});
  0            
857              
858 0           print "vcf-subset -c $str ".$self->outdir."/vcf-annotate_interim/".$self->fname.".allsamples.annovar.vcf.gz | bgzip -f -c > ".$self->outdir."/vcf-annotate_final/".$self->fname.".allsamples.nonredundant.annovar.vcf.gz \\\n";
859 0           print "&& tabix -p vcf ".$self->outdir."/vcf-annotate_final/".$self->fname.".allsamples.nonredundant.annovar.vcf.gz\n";
860              
861 0           print "## Finished processing file ".$self->file."\n\n";
862             }
863              
864              
865              
866             =head1 AUTHOR
867              
868             Jillian Rowe, C<< <jillian.e.rowe at gmail.com> >>
869              
870             =head1 BUGS
871              
872             Please report any bugs or feature requests to C<bug-annovar-wrapper at rt.cpan.org>, or through
873             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Annovar-Wrapper>. I will be notified, and then you'll
874             automatically be notified of progress on your bug as I make changes.
875              
876             =head1 SUPPORT
877              
878             You can find documentation for this module with the perldoc command.
879              
880             perldoc Annovar::Wrapper
881              
882              
883             You can also look for information at:
884              
885             =over 4
886              
887             =item * RT: CPAN's request tracker (report bugs here)
888              
889             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Annovar-Wrapper>
890              
891             =item * AnnoCPAN: Annotated CPAN documentation
892              
893             L<http://annocpan.org/dist/Annovar-Wrapper>
894              
895             =item * CPAN Ratings
896              
897             L<http://cpanratings.perl.org/d/Annovar-Wrapper>
898              
899             =item * Search CPAN
900              
901             L<http://search.cpan.org/dist/Annovar-Wrapper/>
902              
903             =back
904              
905              
906             =head1 ACKNOWLEDGEMENTS
907              
908             This module is a wrapper around the well developed annovar pipeline. The commands come straight from the documentation.
909              
910             This module was originally developed at and for Weill Cornell Medical College in Qatar within ITS Advanced Computing Team and LOTS of input and scientist debugging by Khalid Fahkro. With approval from WCMC-Q, this information was generalized and put on github, for which the authors would like to express their gratitude.
911              
912             =head1 LICENSE AND COPYRIGHT
913              
914             Copyright 2014 Jillian Rowe.
915              
916             This program is free software; you can redistribute it and/or modify it
917             under the terms of the the Artistic License (2.0). You may obtain a
918             copy of the full license at:
919              
920             L<http://www.perlfoundation.org/artistic_license_2_0>
921              
922             Any use, modification, and distribution of the Standard or Modified
923             Versions is governed by this Artistic License. By using, modifying or
924             distributing the Package, you accept this license. Do not use, modify,
925             or distribute the Package, if you do not accept this license.
926              
927             If your Modified Version has been derived from a Modified Version made
928             by someone other than you, you are nevertheless required to ensure that
929             your Modified Version complies with the requirements of this license.
930              
931             This license does not grant you the right to use any trademark, service
932             mark, tradename, or logo of the Copyright Holder.
933              
934             This license includes the non-exclusive, worldwide, free-of-charge
935             patent license to make, have made, use, offer to sell, sell, import and
936             otherwise transfer the Package with respect to any patent claims
937             licensable by the Copyright Holder that are necessarily infringed by the
938             Package. If you institute patent litigation (including a cross-claim or
939             counterclaim) against any party alleging that the Package constitutes
940             direct or contributory patent infringement, then this Artistic License
941             to you shall terminate on the date that such litigation is filed.
942              
943             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
944             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
945             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
946             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
947             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
948             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
949             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
950             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
951              
952              
953             =cut
954              
955             1; # End of Annovar::Wrapper