File Coverage

blib/lib/Annovar/Wrapper.pm
Criterion Covered Total %
statement 24 204 11.7
branch 0 42 0.0
condition 0 3 0.0
subroutine 8 24 33.3
pod n/a
total 32 273 11.7


line stmt bran cond sub pod time code
1             package Annovar::Wrapper;
2              
3             #use 5.006;
4              
5 1     1   20448 use Moose;
  1         461116  
  1         9  
6 1     1   6974 use File::Find::Rule;
  1         7133  
  1         11  
7 1     1   54 use File::Basename;
  1         6  
  1         77  
8 1     1   5 use File::Path qw(make_path remove_tree);
  1         2  
  1         47  
9 1     1   668 use Data::Dumper;
  1         5821  
  1         81  
10 1     1   8 use File::Find::Rule;
  1         1  
  1         10  
11 1     1   2514 use IO::Uncompress::Gunzip;
  1         60827  
  1         60  
12 1     1   9 use Cwd;
  1         2  
  1         2570  
13              
14             require Vcf;
15              
16             with 'MooseX::Getopt';
17              
18             =head1 NAME
19              
20             Annovar::Wrapper - A wrapper around the annovar annotation pipeline
21              
22             =head1 VERSION
23              
24             Version 0.06
25              
26             =cut
27              
28             our $VERSION = '0.32';
29              
30              
31             =head1 SYNOPSIS
32              
33             annovar-wrapper.pl --vcfs file1.vcf,file2.vcf --annovardb_path /path/to/annovar/dbs
34              
35             This module is a wrapper around the popular annotation tool, annovar. http://www.openbioinformatics.org/annovar/ . The commands generated are taken straight from the documentation. In addition, there is an option to reannotate using vcf-annotate from vcftools.
36              
37             It takes as its input a list or directory of vcf files, bgzipped and tabixed or not, and uses annovar to create annotation files. These multianno table files can be optionally reannotated into the vcf file. This script does not actually execute any commands, only writes them to STDOUT for the user to run as they wish.
38              
39             It comes with an executable script annovar-wrapper.pl. This should be sufficient for most of your needs, but if you wish to overwrite methods you can always do so in the usual Moose fashion.
40              
41             #!/usr/bin/env perl
42              
43             package Main;
44              
45             use Moose;
46             extends 'Annovar::Wrapper';
47              
48             Annovar::Wrapper->new_with_options->run;
49              
50             sub method_to_override {
51             my $self = shift;
52              
53             #dostuff
54             };
55              
56             before 'method' => sub {
57             my $self = shift;
58              
59             #dostuff
60             };
61              
62             has '+variable' => (
63             #things to add to variable declaration
64             );
65              
66             #or
67            
68             has 'variable' => (
69             #override variable declaration
70             );
71              
72             1;
73              
74             Please see the Moose::Manual::MethodModifiers for more information.
75              
76             =head1 Prerequisites
77              
78             This module requires the annovar download. The easiest thing to do is to put the annovar scripts in your ENV{PATH}, but if you choose not to do this you can also pass in the location with
79              
80             annovar-wrapper.pl --tableannovar_path /path/to/table_annovar.pl --convert2annovar_path /path/to/convert2annovar.pl
81              
82             It requires Vcf.pm, which comes with vcftools.
83              
84             Vcftools is publicly available for download. http://vcftools.sourceforge.net/.
85              
86             export PERL5LIB=$PERL5LIB:path_to_vcftools/perl
87              
88             If you wish to you reannotate the vcf file you need to have bgzip and tabix installed, and have the executables in vcftools in your path.
89              
90             export PATH=$PATH:path_to_vcftools
91              
92             =head1 Generate an Example
93              
94             To generate an example you can run the following commands
95              
96             tabix -h ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz 2:39967768-40000000 > test.vcf
97             bgzip test.vcf
98             tabix test.vcf.gz
99             vcf-subset -c HG00098,HG00100,HG00106,HG00112,HG00114 test.vcf.gz | bgzip -c > out.vcf.gz
100             tabix out.vcf.gz
101             rm test.vcf.gz
102             rm test.vcf.gz.tbi
103              
104             annovar-wrapper.pl --vcfs out.vcf.gz --annovar_dbs refGene --annovar_fun g --outdir annovar_out --annovardb_path /path/to/annovar/dbs > my_cmds.sh
105              
106             There is more detail on the example in the pod files.
107              
108             =cut
109              
110             =head1 Variables
111              
112             =cut
113              
114             =head2 Annovar Options
115              
116             =cut
117              
118             =head3 tableannovar_path
119              
120             You can put the location of the annovar scripts in your ENV{PATH}, and the default is fine. If annovar is not in your PATH, please supply the location.
121              
122             =cut
123              
124             has 'tableannovar_path' => (
125             is => 'rw',
126             isa => 'Str',
127             required => 1,
128             default => "table_annovar.pl"
129             );
130              
131             =head3 convert2annovar_path
132              
133             You can put the location of the annovar scripts in your ENV{PATH}, and the default is fine. If annovar is not in your PATH, please supply the location
134              
135             =cut
136              
137             has 'convert2annovar_path' => (
138             is => 'rw',
139             isa => 'Str',
140             required => 1,
141             default => "convert2annovar.pl"
142             );
143              
144             =head3 annovardb_path
145              
146             Path to your annovar databases
147              
148             =cut
149              
150             has 'annovardb_path' => (
151             is => 'rw',
152             isa => 'Str',
153             default => '/data/apps/software/annovar/hg19',
154             required => 1,
155             );
156              
157             =head3 buildver
158              
159             Its probably hg19 or hg18
160              
161             =cut
162              
163             has 'buildver' => (
164             is => 'rw',
165             isa => 'Str',
166             default => 'hg19',
167             required => 1,
168             );
169              
170             =head3 convert2annovar_opts
171              
172             Assumes vcf version 4 and that you want to convert all samples
173              
174             Not using --allsample on a multisample vcf is untested and will probably break the whole pipeline
175              
176             =cut
177              
178             has 'convert2annovar_opts' => (
179             is => 'rw',
180             isa => 'Str',
181             default => '-format vcf4 --allsample',
182             );
183              
184             =head3 annovar_dbs
185              
186             These are pretty much all the databases listed on
187              
188             http://www.openbioinformatics.org/annovar/annovar_download.html for hg19 that I tested as working
189              
190             #Download databases with
191              
192             cd path_to_annovar_dir
193             ./annotate_variation.pl --buildver hg19 -downdb -webfrom annovar esp6500si_aa hg19/
194              
195             #Option is an ArrayRef, and can be given as either
196              
197             --annovar_dbs cg46,cg69,nci60
198              
199             #or
200              
201             --annovar_dbs cg46 --annovar_dbs cg69 --annovar_dbs nci60
202              
203             =cut
204             #TODO
205             #Add in a hashref so I don't have to remember the list
206             #The following are redundant within other databases
207             #esp are in popfreq_all
208             #esp6500si_aa
209             #esp6500si_ea
210             #ljb23 are in ljb23_all
211             #ljb23_fathmm
212             #ljb23_gerp++
213             #ljb23_lrt
214             #ljb23_ma
215             #ljb23_metalr
216             #ljb23_metasvm
217             #ljb23_mt
218             #ljb23_phylop
219             #ljb23_pp2hdiv
220             #ljb23_pp2hvar
221             #ljb23_sift
222             #ljb23_siphy
223             #ljb2_pp2hvar
224             #Leaving out cg46
225             #
226             ## The following have been tested
227             # snp138NonFlagged,snp138,popfreq_all,cg69,cosmic68wgs,clinvar_20140211,gwasCatalog,caddgt20,phastConsElements46way,gerp++elem,wgEncodeBroadHmmGm12878HMM,wgEncodeUwDnaseGm12878HotspotsRep2,ljb23_all,refGene
228              
229             has 'annovar_dbs' => (
230             is => 'rw',
231             isa => 'ArrayRef',
232             required => 0,
233             default => sub {
234             return [qw(snp138NonFlagged
235             snp138
236             popfreq_all
237             cg69
238             cosmic68wgs
239             clinvar_20140211
240             gwasCatalog
241             caddgt20
242             phastConsElements46way
243             gerp++elem
244             wgEncodeBroadHmmGm12878HMM
245             wgEncodeUwDnaseGm12878HotspotsRep2
246             ljb23_all
247             refGene
248             )]
249             }
250             );
251              
252             =head3 annovar_fun
253              
254             Functions of the individual databases can be found at
255              
256             What function your DB may already be listed otherwise it is probably listed in the URLS under Annotation: Gene-Based, Region-Based, or Filter-Based
257              
258             Functions must be given in the corresponding order of your annovar_dbs
259              
260             #Option is an ArrayRef, and can be given as either
261              
262             --annovar_fun f,f,g
263              
264             #or
265              
266             --annovar_fun f --annovar_fun f --annovar_fun g
267              
268             =cut
269              
270             has 'annovar_fun' => (
271             is => 'rw',
272             isa => 'ArrayRef',
273             required => 0,
274             default => sub {
275             return [qw(f
276             f
277             f
278             f
279             f
280             f
281             r
282             f
283             r
284             f
285             r
286             r
287             f
288             g)]
289             }
290             );
291              
292             =head3 annovar_cols
293              
294             Some database annotations generate multiple columns. For reannotating the vcf we need to know what these columns are. Below are the columns generated for the databases given in annovar_dbs
295              
296             To add give a hashref of array
297              
298             =cut
299              
300             has 'annovar_cols' => (
301             is => 'rw',
302             isa => 'HashRef',
303             required => 0,
304             default => sub {
305             my $href = {};
306             #Old table_annovar.pl script
307             # $href->{popfreq_max} = ["PopFreqMax"];
308             # $href->{popfreq_all} = ["PopFreqMax","1000G2012APR_ALL","1000G2012APR_AFR","1000G2012APR_AMR","1000G2012APR_ASN","1000G2012APR_EUR","ESP6500si_AA","ESP6500si_EA","CG46","NCI60","SNP137","COSMIC65","DISEASE"];
309             # $href->{refGene} = ["Func.refGene","Gene.refGene","ExonicFunc.refGene","AAChange.refGene"];
310             # $href->{ljb_all} = [ qw/LJB_PhyloP LJB_PhyloP_Pred LJB_SIFT LJB_SIFT_Pred LJB_PolyPhen2 LJB_PolyPhen2_Pred LJB_LRT LJB_LRT_Pred LJB_MutationTaster LJB_MutationTaster_Pred LJB_GERP++/ ];
311             # $href->{ljb2_all} = [ qw/LJB2_SIFT LJB2_PolyPhen2_HDIV LJB2_PP2_HDIV_Pred LJB2_PolyPhen2_HVAR LJB2_PolyPhen2_HVAR_Pred LJB2_LRT LJB2_LRT_Pred LJB2_MutationTaster LJB2_MutationTaster_Pred LJB_MutationAssessor LJB_MutationAssessor_Pred LJB2_FATHMM LJB2_GERP++ LJB2_PhyloP LJB2_SiPhy/ ];
312             # $href->{ljb23_all} = [ qw/LJB23_SIFT_score LJB23_SIFT_score_converted LJB23_SIFT_pred LJB23_Polyphen2_HDIV_score LJB23_Polyphen2_HDIV_pred LJB23_Polyphen2_HVAR_score LJB23_Polyphen2_HVAR_pred LJB23_LRT_score LJB23_LRT_score_converted LJB23_LRT_pred LJB23_MutationTaster_score LJB23_MutationTaster_score_converted LJB23_MutationTaster_pred LJB23_MutationAssessor_score LJB23_MutationAssessor_score_converted LJB23_MutationAssessor_pred LJB23_FATHMM_score LJB23_FATHMM_score_converted LJB23_FATHMM_pred LJB23_RadialSVM_score LJB23_RadialSVM_score_converted LJB23_RadialSVM_pred LJB23_LR_score LJB23_LR_pred LJB23_GERP++ LJB23_PhyloP LJB23_SiPhy/ ];
313              
314             # #Which one?
315             # #$href->{refGene} = ["Func.refGene","Gene.refGene", "ExonicFunc.refGene","AAChange.refGene"];
316             $href->{refGene} = ["Func.refGene","Gene.refGene","GeneDetail.refGene", "ExonicFunc.refGene","AAChange.refGene"];
317             $href->{ljb_all} = [ qw/LJB_PhyloP LJB_PhyloP_Pred LJB_SIFT LJB_SIFT_Pred LJB_PolyPhen2 LJB_PolyPhen2_Pred LJB_LRT LJB_LRT_Pred LJB_MutationTaster LJB_MutationTaster_Pred LJB_GERPPP/ ];
318             $href->{ljb2_all} = [ qw/LJB2_SIFT LJB2_PolyPhen2_HDIV LJB2_PP2_HDIV_Pred LJB2_PolyPhen2_HVAR LJB2_PolyPhen2_HVAR_Pred LJB2_LRT LJB2_LRT_Pred LJB2_MutationTaster LJB2_MutationTaster_Pred LJB_MutationAssessor LJB_MutationAssessor_Pred LJB2_FATHMM LJB2_GERPPP LJB2_PhyloP LJB2_SiPhy/ ];
319             $href->{ljb23_all} = [ qw/LJB23_SIFT_score LJB23_SIFT_score_converted LJB23_SIFT_pred LJB23_Polyphen2_HDIV_score LJB23_Polyphen2_HDIV_pred LJB23_Polyphen2_HVAR_score LJB23_Polyphen2_HVAR_pred LJB23_LRT_score LJB23_LRT_score_converted LJB23_LRT_pred LJB23_MutationTaster_score LJB23_MutationTaster_score_converted LJB23_MutationTaster_pred LJB23_MutationAssessor_score LJB23_MutationAssessor_score_converted LJB23_MutationAssessor_pred LJB23_FATHMM_score LJB23_FATHMM_score_converted LJB23_FATHMM_pred LJB23_RadialSVM_score LJB23_RadialSVM_score_converted LJB23_RadialSVM_pred LJB23_LR_score LJB23_LR_pred LJB23_GERPPP LJB23_PhyloP LJB23_SiPhy/ ];
320             $href->{popfreq_all} = [ qw/PopFreqMax 1000G2012APR_ALL 1000G2012APR_AFR 1000G2012APR_AMR 1000G2012APR_ASN 1000G2012APR_EUR ESP6500si_ALL ESP6500si_AA ESP6500si_EA CG46/ ];
321             return $href;
322             }
323             );
324             =head2 Wrapper Options
325              
326             =cut
327              
328             =head3 indir
329              
330             A path to your vcf files can be given, and using File::Find::Rule it will recursively search for vcf or vcf.gz
331              
332             =cut
333              
334             has 'indir' => (
335             is => 'rw',
336             isa => 'Str|Undef',
337             required => 0,
338             );
339              
340             =head3 vcfs
341              
342             VCF files can be given individually as well.
343              
344             #Option is an ArrayRef and can be given as either
345              
346             --vcfs 1.vcf,2.vcf,3.vcfs
347              
348             #or
349              
350             --vcfs 1.vcf --vcfs 2.vcf --vcfs 3.vcf
351              
352             Don't mix the methods
353              
354             =cut
355              
356             has 'vcfs' => (
357             is => 'rw',
358             isa => 'ArrayRef',
359             required => 0,
360             );
361              
362             =head3 outdir
363              
364             Path to write out annotation files. It creates the structure
365              
366             outdir
367             --annovar_interim
368             --annovar_final
369             --vcf-annotate_interim #If you choose to reannotate VCF file
370             --vcf-annotate_final #If you choose to reannotate VCF file
371              
372             A lot of interim files are created by annovar, and the only one that really matters unless you debugging a new database is the multianno file found in annovar_final
373              
374             If not given the outdirectory is assumed to be the current working directory.
375              
376             =cut
377              
378             has 'outdir' => (
379             is => 'rw',
380             isa => 'Str',
381             required => 1,
382             default => sub { return getcwd() },
383             );
384              
385              
386             =head3 annotate_vcf
387              
388             Use vcf-annotate from VCF tools to annotate the VCF file
389              
390             This does not overwrite the original VCF file, but instead creates a new one
391              
392             To turn this off
393              
394             annovar-wrapper.pl --annotate_vcf 0
395              
396             =cut
397              
398             has 'annotate_vcf' => (
399             is => 'rw',
400             isa => 'Bool',
401             default => 1,
402             required => 1,
403             );
404              
405             #Internal Variables
406              
407             has 'samples' => (
408             is => 'rw',
409             isa => 'HashRef',
410             required => 0,
411             default => sub { return {} },
412             );
413              
414             has 'orig_samples' => (
415             is => 'rw',
416             isa => 'HashRef',
417             required => 0,
418             default => sub { return {} },
419             );
420              
421             has 'file' => (
422             traits => ['String'],
423             is => 'rw',
424             isa => 'Str',
425             required => 0,
426             default => sub { return "" },
427             handles => {
428             match_file => 'match',
429             },
430             );
431              
432             has 'fname' => (
433             is => 'rw',
434             isa => 'Str',
435             required => 0,
436             default => sub { return "" },
437             );
438              
439             =head1 SUBROUTINES/METHODS
440              
441             =cut
442              
443             =head2 run
444              
445             Subroutine that starts everything off
446              
447             =cut
448              
449             sub run {
450 0     0     my($self) = @_;
451              
452 0           $self->print_opts;
453              
454 0           $self->check_files;
455 0           $self->parse_commands;
456 0           $self->find_vcfs;
457 0           $self->write_annovar;
458             }
459              
460             =head2 print_opts
461              
462             Print out the command line options
463              
464             =cut
465              
466             sub print_opts {
467 0     0     my($self) = @_;
468              
469 0           print "## This file was generated with the options\n";
470 0           for(my $x=0; $x<=$#ARGV; $x++){
471 0 0         next unless $ARGV[$x];
472 0           print "#\t$ARGV[$x]\t".$ARGV[$x+1]."\n";
473 0           $x++;
474             }
475 0           print "\n";
476             }
477              
478             =head2 check_files
479              
480             Check to make sure either an indir or vcfs are supplied
481              
482             =cut
483              
484             sub check_files{
485 0     0     my($self) = @_;
486 0           my($t);
487              
488 0 0 0       die print "Must specificy an indirectory or vcfs!\n" if (!$self->indir && !$self->vcfs);
489            
490 0 0         if($self->indir){
491 0           $t = $self->indir;
492 0           $t =~ s/\/$//g;
493 0           $self->indir($t);
494             }
495              
496 0           $t = $self->outdir;
497 0           $t =~ s/\/$//g;
498 0           $t = $t."/annovar-wrapper";
499 0           $self->outdir($t);
500              
501             #make the outdir
502 0 0         make_path($self->outdir) if ! -d $self->outdir;
503 0 0         make_path($self->outdir."/annovar_interim") if ! -d $self->outdir."/annovar_interim";
504 0 0         make_path($self->outdir."/annovar_final") if ! -d $self->outdir."/annovar_final";
505             }
506              
507             =head2 find_vcfs
508              
509             Use File::Find::Rule to find the vcfs
510              
511             =cut
512              
513             sub find_vcfs{
514 0     0     my($self) = @_;
515              
516 0 0         return if $self->vcfs;
517 0           $self->vcfs([]);
518              
519 0           my $rule = File::Find::Rule->file->name(qr/(vcf|vcf\.gz)$/)->start( $self->indir);
520 0           while ( defined ( my $file = $rule->match ) ) {
521 0           push(@{$self->vcfs}, $file);
  0            
522             }
523              
524 0 0         die print "No vcfs were found!\n" unless $self->vcfs;
525             }
526              
527             =head2 parse_commands
528              
529             Allow for giving ArrayRef either in the usual fashion or with commas
530              
531             =cut
532              
533             sub parse_commands{
534 0     0     my($self) = @_;
535              
536 0 0         if($#{$self->annovar_dbs} == 0){
  0            
537 0           my @tmp = split(",", $self->annovar_dbs->[0]);
538 0           $self->annovar_dbs(\@tmp);
539             }
540 0 0         if($#{$self->annovar_fun} == 0){
  0            
541 0           my @tmp = split(",", $self->annovar_fun->[0]);
542 0           $self->annovar_fun(\@tmp);
543             }
544              
545 0 0         return unless $self->vcfs;
546 0 0         if($#{$self->vcfs} == 0){
  0            
547 0           my @tmp = split(",", $self->vcfs->[0]);
548 0           $self->vcfs(\@tmp);
549             }
550             }
551              
552             =head2 write_annovar
553              
554             Write the commands that
555              
556             Convert the vcf file to annovar input
557             Do the annotations
558             Reannotate the vcf - if you want
559              
560             =cut
561              
562             sub write_annovar{
563 0     0     my($self) = @_;
564              
565 0 0         die print "Dbs are ".scalar @{$self->annovar_dbs}."\nand Funcs are ".scalar @{$self->annovar_fun}." \n" unless scalar @{$self->annovar_dbs} == scalar @{$self->annovar_fun};
  0            
  0            
  0            
  0            
566              
567             # #Convert vcf to annovar
568 0           print "## Converting to annovar input\n\n";
569 0           foreach my $file (@{$self->vcfs}){
  0            
570 0           print "## Processing file $file\n\n";
571            
572 0           $self->file($file);
573 0           my $tname = basename($self->file);
574 0           $tname =~ s/\.vcf$|\.vcf\.gz$//;
575 0           $self->fname($tname);
576             # $self->fname(basename($self->file));
577 0           $self->get_samples;
578 0           $self->convert_annovar;
579              
580             }
581 0           print "## Wait for all convert commands to complete\n";
582 0           print "wait\n\n";
583              
584             # #Annotate annovar input
585              
586 0           $self->iter_vcfs('table_annovar');
587              
588 0           print "## Wait for all table commands to complete\n";
589 0           print "wait\n\n";
590              
591             # #Annotate the VCF
592             # #This is only relevant for putting it back into vcfs
593              
594 0 0         return unless $self->annotate_vcf;
595              
596 0           $self->iter_vcfs('gen_descr');
597            
598 0           print "## Wait for file copying bgzip and tabix to finish...\n";
599 0           print "wait\n\n";
600              
601 0           $self->iter_vcfs('gen_annot');
602              
603 0           print "## Wait for all vcf-annotate commands to complete\n";
604 0           print "wait\n\n";
605              
606 0           $self->iter_vcfs('merge_vcfs');
607              
608             }
609              
610             =head2 iter_vcfs
611             Iterate over the vcfs with some changes for lookups
612             =cut
613             sub iter_vcfs{
614             # my $self = shift;
615 0     0     my($self, $fun) = @_;
616              
617 0           foreach my $file (@{$self->vcfs}){
  0            
618 0           $self->file($file);
619 0           my $tname = basename($self->file);
620 0           $tname =~ s/\.vcf$|\.vcf\.gz$//;
621 0           $self->fname($tname);
622             # $self->fname(basename($self->file));
623             #Make this a parameter of the script
624 0           $self->$fun;
625             }
626              
627             }
628              
629             =head2 get_samples
630              
631             Using VCF tools get the samples listed per vcf file
632              
633             Supports files that are bgzipped or not
634              
635             Sample names are stripped of all non alphanumeric characters.
636              
637             =cut
638              
639             sub get_samples{
640 0     0     my($self) = @_;
641              
642 0           my(@samples, $vcf, $out, $fh);
643              
644             #This doesn't work in large vcf files, end up with funny blocks
645             # if($self->match_file(qr/gz$/)){
646             # $fh = new IO::Uncompress::Gunzip $self->file or warn "## File handle didn't work for gzipped file ".$self->file."\n\n";
647             # }
648             # else{
649             # $fh = new IO::File $self->file, "r" or warn "## File handle didn't work for ".$self->file."\n\n";
650             # }
651              
652             # next unless $fh;
653              
654 0           $vcf = Vcf->new(file => $self->file);
655 0           $vcf->parse_header();
656 0           (@samples) = $vcf->get_samples();
657              
658             # #TODO Have this in a proper debug msg
659             # print "##Before transform samples are :\n##".join("\n##", @samples)."\n";
660              
661             #Keep the original samples names for subsetting the vcf
662 0           my(@tmp) = @samples;
663 0           $self->orig_samples->{$self->file} = \@tmp;
664              
665             #Must keep this the same as annovar!
666             # #I think annovar got rid of these in the most recent implementation
667             # 2014-12-20 Downloaded new annovar
668             # @samples = map { s/[^A-Za-z0-9\-\.]//g; $_ } @samples;
669             # @samples = map { s/^\.//g; $_ } @samples;
670              
671             # #TODO Have this in a proper debug msg
672             # print "##After transform samples are :\n##".join("\n##", @samples)."\n";
673 0           $vcf->close();
674              
675             # #TODO Put a warning msg here
676 0 0         die print "There are no samples!\n" unless @samples;
677              
678 0           $self->samples->{$self->file} = \@samples;
679              
680 0           print "##Original samples names are :\n##".join("\n##", @{$self->orig_samples->{$self->file}})."\n";
  0            
681 0           print "##Annovar samples names are :\n##".join("\n##", @{$self->samples->{$self->file}})."\n";
  0            
682             }
683              
684             =head2 convert_annovar
685              
686             Print out the command to print the convert2annovar commands
687              
688             =cut
689              
690             sub convert_annovar{
691 0     0     my($self) = @_;
692              
693 0           print $self->convert2annovar_path." ".$self->convert2annovar_opts." ".$self->file." \\\n--outfile ".$self->outdir."/".$self->fname.".annovar\n\n";
694             }
695              
696             =head2 table_annovar
697              
698             Print out the commands to generate the annotation using table_annovar.pl command.
699             =cut
700              
701             sub table_annovar{
702 0     0     my($self) = @_;
703              
704 0           print "## Generating annotations\n\n";
705 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
706 0           print "## Processing sample $sample\n";
707 0           print $self->tableannovar_path." ".$self->outdir."/".$self->fname.".annovar.$sample.avinput \\\n ".$self->annovardb_path." --buildver ".$self->buildver." \\\n -protocol ".join(",", @{$self->annovar_dbs})." \\\n -operation ".join(",", @{$self->annovar_fun})." \\\n -nastring NA --outfile ".$self->outdir."/".$self->fname.".annovar.$sample \\\n";
  0            
  0            
708 0           print "&& find ".$self->outdir."/ |grep ".$self->outdir."/".$self->fname.".annovar.$sample | grep -v \"multianno\" | xargs -i -t mv {} ".$self->outdir."/annovar_interim \\\n";
709 0           print "&& find ".$self->outdir."/ |grep ".$self->outdir."/".$self->fname.".annovar.$sample | grep \"avinput\$\" | xargs -i -t mv {} ".$self->outdir."/annovar_interim \\\n";
710 0           print "&& find ".$self->outdir."/ |grep ".$self->outdir."/".$self->fname.".annovar.$sample | grep \"multianno\" | xargs -i -t mv {} ".$self->outdir."/annovar_final\n\n";
711             }
712              
713             }
714              
715             =head2 vcf_annotate
716              
717             Generate the commands to annotate the vcf file using vcf-annotate
718              
719             =cut
720              
721             sub vcf_annotate{
722 0     0     my($self) = @_;
723              
724 0           $self->gen_descr;
725 0           $self->gen_annot;
726              
727              
728 0           $self->merge_vcfs;
729             }
730              
731             =head2 gen_descr
732              
733             Bgzip, tabix, all of vcftools, and sort must be in your PATH for these to work.
734              
735              
736             There are two parts to this command.
737              
738             The first prepares the annotation file.
739              
740             1. The annotation file is backed up just in case
741             2. The annotation file is sorted, because I had some problems with sorting
742             3. The annotation file is bgzipped, as required by vcf-annotate
743             4. The annotation file is tabix indexed using the special commands -s 1 -b 2 -e 3
744              
745             The second writes out the vcf-annotate commands
746              
747             Example with RefGene
748             zcat ../../variants.vcf.gz | vcf-annotate -a sorted.annotation.gz \
749             -d key=INFO,ID=SAMPLEID_Func_refGene,Number=0,Type=String,Description='SAMPLEID Annovar Func_refGene' \
750             -d key=INFO,ID=SAMPLEID_Gene_refGene,Number=0,Type=String,Description='SAMPLEID Annovar Gene_refGene' \
751             -d key=INFO,ID=SAMPLEID_ExonicFun_refGene,Number=0,Type=String,Description='SAMPLEID Annovar ExonicFun_refGene' \
752             -d key=INFO,ID=SAMPLEID_AAChange_refGene,Number=0,Type=String,Description='SAMPLEID Annovar AAChange_refGene' \
753             -c CHROM,FROM,TO,-,-,INFO/SAMPLEID_Func_refGene,INFO/SAMPLEID_Gene_refGene,INFO/SAMPLEID_ExonicFun_refGene,INFO/SAMPLEID_AAChange_refGene > SAMPLEID.annotated.vcf
754              
755             =cut
756              
757             sub gen_descr{
758 0     0     my($self) = @_;
759              
760 0           print "##Prepare to reannotate VCF files\n\n";
761            
762 0 0         make_path($self->outdir."/vcf-annotate_interim") if ! -d $self->outdir."/vcf-annotate_interim";
763 0 0         make_path($self->outdir."/vcf-annotate_final") if ! -d $self->outdir."/vcf-annotate_final";
764              
765 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
766 0           print "cp ".$self->outdir."/annovar_final/".$self->fname.".annovar.$sample.hg19_multianno.txt \\\n";
767 0           print $self->outdir."/vcf-annotate_interim/".$self->fname.".annovar.$sample.hg19_multianno.txt \\\n";
768 0           print "&& sed -i 's/;/,/;s/=/->/;s/GERP++/GERPPP/;s/+/P/'g ".$self->outdir."/vcf-annotate_interim/".$self->fname.".annovar.$sample.hg19_multianno.txt \\\n";
769 0           print "&& sort -k1,1 -k2,2n ".$self->outdir."/vcf-annotate_interim/".$self->fname.".annovar.$sample.hg19_multianno.txt > ";
770 0           print $self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt \\\n";
771 0           print "&& bgzip -f ".$self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt \\\n";
772 0           print "&& tabix -s 1 -b 2 -e 3 ".$self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt.gz \n\n";
773             }
774              
775             }
776              
777             =head2 gen_annot
778             =cut
779             sub gen_annot {
780 0     0     my $self = shift;
781              
782 0           print "##Reannotate VCF files\n\n";
783 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
784 0 0         if($self->match_file(qr/gz$/)){
785 0           print "zcat ".$self->file." | ";
786             }
787             else{
788 0           print "cat ".$self->file." | ";
789             }
790 0           print "vcf-annotate -a ".$self->outdir."/vcf-annotate_interim/".$self->fname.".sorted.annovar.$sample.hg19_multianno.txt.gz \\\n";
791            
792 0           foreach my $db (@{$self->annovar_dbs}){
  0            
793 0 0         if(exists $self->annovar_cols->{$db}){
794 0           my $tmp = $self->annovar_cols->{$db};
795              
796             #Test this!!!
797 0           $db =~ s/\+/P/g;
798 0           $db =~ s/\W/_/g;
799              
800 0           foreach my $t (@$tmp){
801 0           $t =~ s/\+/P/g;
802 0           $t =~ s/\W/_/g;
803 0           print <<EOF;
804             -d key=INFO,ID=$sample.annovar.$db.$t,Number=0,Type=String,Description='Annovar $sample $db $t' \\
805             EOF
806             }
807             }
808             else{
809 0           print <<EOF;
810             -d key=INFO,ID=$sample.annovar.$db,Number=0,Type=String,Description='Annovar gen $sample $db' \\
811             EOF
812             }
813              
814             }
815              
816 0           $self->gen_cols($sample);
817             }
818            
819             }
820             =head2 gen_cols
821              
822             Generate the -c portion of the vcf-annotate command
823              
824             =cut
825              
826             sub gen_cols{
827 0     0     my($self, $sample) = @_;
828              
829 0           my $cols = "-c CHROM,FROM,TO,-,-";
830              
831 0           foreach my $db (@{$self->annovar_dbs}){
  0            
832 0 0         if(exists $self->annovar_cols->{$db}){
833 0           my $tmp = $self->annovar_cols->{$db};
834              
835 0           foreach my $t (@$tmp){
836 0           $cols .= ",INFO/$sample.annovar.$db.$t";
837             }
838             }
839             else{
840 0           $cols .= ",INFO/$sample.annovar.$db";
841             }
842              
843             }
844              
845 0           print $cols." | bgzip -f -c > ".$self->outdir."/vcf-annotate_final/".$self->fname.".$sample.annovar.vcf.gz && \\\n";
846 0           print "tabix -p vcf ".$self->outdir."/vcf-annotate_final/".$self->fname.".$sample.annovar.vcf.gz\n\n";
847              
848             }
849              
850             =head2 merge_vcfs
851              
852             There is one vcf-annotated file per sample, so merge those at the the end to get a multisample file using vcf-merge
853              
854             =cut
855              
856             sub merge_vcfs {
857 0     0     my($self) = @_;
858              
859 0 0         return if scalar @{$self->samples->{$self->file}} == 1;
  0            
860              
861 0           print "##Merge single sample VCF files\n\n";
862              
863 0           print "vcf-merge \\\n";
864 0           foreach my $sample (@{$self->samples->{$self->file}}){
  0            
865 0           print $self->outdir."/vcf-annotate_final/".$self->fname.".$sample.annovar.vcf.gz \\\n";
866             }
867 0           print " | bgzip -f -c > ".$self->outdir."/vcf-annotate_interim/".$self->fname.".allsamples.annovar.vcf.gz \\\n";
868 0           print "&& tabix -p vcf ".$self->outdir."/vcf-annotate_interim/".$self->fname.".allsamples.annovar.vcf.gz\n";
869              
870 0           print "\nwait\n\n";
871              
872 0           $self->subset_vcfs();
873              
874             }
875              
876             =head2 subset_vcfs
877              
878             vcf-merge used in this fashion will create a lot of redundant columns, because it wants to assume all sample names are unique
879              
880             Straight from the vcftools documentation
881              
882             vcf-subset -c NA0001,NA0002 file.vcf.gz | bgzip -c > out.vcf.gz
883              
884             =cut
885              
886             sub subset_vcfs {
887 0     0     my($self) = @_;
888              
889 0           print "##Subsetting the files to get rid of redundant info\n\n";
890              
891 0           my $str = join(",", @{$self->orig_samples->{$self->file}});
  0            
892              
893 0           print "vcf-subset -c $str ".$self->outdir."/vcf-annotate_interim/".$self->fname.".allsamples.annovar.vcf.gz | bgzip -f -c > ".$self->outdir."/vcf-annotate_final/".$self->fname.".allsamples.nonredundant.annovar.vcf.gz \\\n";
894 0           print "&& tabix -p vcf ".$self->outdir."/vcf-annotate_final/".$self->fname.".allsamples.nonredundant.annovar.vcf.gz\n";
895              
896 0           print "## Finished processing file ".$self->file."\n\n";
897             }
898              
899              
900              
901             =head1 AUTHOR
902              
903             Jillian Rowe, C<< <jillian.e.rowe at gmail.com> >>
904              
905             =head1 BUGS
906              
907             Please report any bugs or feature requests to C<bug-annovar-wrapper at rt.cpan.org>, or through
908             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Annovar-Wrapper>. I will be notified, and then you'll
909             automatically be notified of progress on your bug as I make changes.
910              
911             =head1 SUPPORT
912              
913             You can find documentation for this module with the perldoc command.
914              
915             perldoc Annovar::Wrapper
916              
917              
918             You can also look for information at:
919              
920             =over 4
921              
922             =item * RT: CPAN's request tracker (report bugs here)
923              
924             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Annovar-Wrapper>
925              
926             =item * AnnoCPAN: Annotated CPAN documentation
927              
928             L<http://annocpan.org/dist/Annovar-Wrapper>
929              
930             =item * CPAN Ratings
931              
932             L<http://cpanratings.perl.org/d/Annovar-Wrapper>
933              
934             =item * Search CPAN
935              
936             L<http://search.cpan.org/dist/Annovar-Wrapper/>
937              
938             =back
939              
940              
941             =head1 ACKNOWLEDGEMENTS
942              
943             This module is a wrapper around the well developed annovar pipeline. The commands come straight from the documentation.
944              
945             This module was originally developed at and for Weill Cornell Medical College in Qatar within ITS Advanced Computing Team and LOTS of input and scientist debugging by Khalid Fahkro. With approval from WCMC-Q, this information was generalized and put on github, for which the authors would like to express their gratitude.
946              
947             =head1 LICENSE AND COPYRIGHT
948              
949             Copyright 2014 Jillian Rowe.
950              
951             This program is free software; you can redistribute it and/or modify it
952             under the terms of the the Artistic License (2.0). You may obtain a
953             copy of the full license at:
954              
955             L<http://www.perlfoundation.org/artistic_license_2_0>
956              
957             Any use, modification, and distribution of the Standard or Modified
958             Versions is governed by this Artistic License. By using, modifying or
959             distributing the Package, you accept this license. Do not use, modify,
960             or distribute the Package, if you do not accept this license.
961              
962             If your Modified Version has been derived from a Modified Version made
963             by someone other than you, you are nevertheless required to ensure that
964             your Modified Version complies with the requirements of this license.
965              
966             This license does not grant you the right to use any trademark, service
967             mark, tradename, or logo of the Copyright Holder.
968              
969             This license includes the non-exclusive, worldwide, free-of-charge
970             patent license to make, have made, use, offer to sell, sell, import and
971             otherwise transfer the Package with respect to any patent claims
972             licensable by the Copyright Holder that are necessarily infringed by the
973             Package. If you institute patent litigation (including a cross-claim or
974             counterclaim) against any party alleging that the Package constitutes
975             direct or contributory patent infringement, then this Artistic License
976             to you shall terminate on the date that such litigation is filed.
977              
978             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
979             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
980             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
981             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
982             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
983             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
984             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
985             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
986              
987              
988             =cut
989              
990             1; # End of Annovar::Wrapper