File Coverage

blib/lib/Bio/ViennaNGS/Tutorial.pm
Criterion Covered Total %
statement 15 15 100.0
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 20 20 100.0


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2014-12-20 00:41:39 mtw>
3              
4             package Bio::ViennaNGS::Tutorial;
5              
6 1     1   1806 use 5.12.0;
  1         2  
  1         51  
7 1     1   7 use Exporter;
  1         1  
  1         46  
8 1     1   6 use version; our $VERSION = qv('0.12_07');
  1         2  
  1         25  
9 1     1   90 use strict;
  1         1  
  1         31  
10 1     1   6 use warnings;
  1         1  
  1         96  
11              
12             our @ISA = qw(Exporter);
13             our @EXPORT = ();
14              
15             our @EXPORT_OK = qw ();
16              
17             1;
18             __END__
19              
20             =head1 NAME
21              
22             Tutorial_pipeline01.pl - An example pipeline for the ViennaNGS toolbox
23              
24             =head2 SYNOPSIS
25            
26             ./Pipeline.pl path/to/R/libraries
27             Where path/to/R/libraries should point to the directory containing ggplot2
28              
29             =head2 Pipeline
30              
31             This script is a showcase for the usage of ViennaNGS in a real NGS example.
32             We start from a file containing ENSEMBL annotation information for human protein-coding genes.
33             We are insterested in finding sequence motifs in close proximity to the gene start (50nt upstream, 10nt into the gene).
34              
35             The first step is to initialize some variables and generate a chromosome_sizes hash.
36            
37             C<my $bed = 'hg19_highlyexpressed.bed';>
38             C<my $name = (split(/\./,$bed))[0];>
39             C<my $upstream = 50;>
40             C<my $into = 10;>
41             C<my $outfile = "$name.ext$upstream\_fromStart_$into\_downstream.bed";>
42             C<my $outfile2 = "$name.ext$upstream\_upstream.bed";>
43             C<my %sizes = %{fetch_chrom_sizes('hg19')};
44            
45             =cut
46              
47             =head2 Generate a Bio::ViennaNGS::FeatureChain object
48              
49             The bed file of interest is parsed, a feature array is generated and passed on to Bio::ViennaNGS::FeatureChain, which creates a new Moose Object of type FeatureChain, containing the original bed entries
50             C<my @featurelist = @{parse_bed6($bed)};
51             ### Now we create a Bio::ViennaNGS::FeatureChain from the featurelist above
52             my $chain = Bio::ViennaNGS::FeatureChain->new('type'=>'original','chain'=>\@featurelist);>
53            
54             =cut
55              
56             =head2 Extend the existing chain for motif analysis
57              
58             The newly created FeatureChain object will now be extended 50nt upstream of the gene start and 10nt into the gene, to retrieve a bed file which contains the putative sequence motifs.
59            
60             C<my $extended_chain = extend_chain(\%sizes,$chain,0,$into,$upstream,0);>
61            
62             For later purposes we also extend the whole U6 gene span 50nt upstream.
63             C<my $extended_chain2 = extend_chain(\%sizes,$chain,$upstream,0,0,0);>
64              
65             =cut
66              
67             =head2 Print extended Bio::ViennaNGS::FeatureChain objects to files
68              
69             Extended chains are now print out to make them available for external tools like bedtools.
70             C<my $out = $extended_chain->print_chain();
71             print $Out $out;
72             $out = $extended_chain2->print_chain();
73             print $Out2 $out;>
74              
75             =cut
76              
77             =head3 Summary of so far used methods
78              
79             =over 4
80              
81             =item fetch_chrom_sizes<as_string>
82              
83             Returns a chromosome-sizes hash reference for the specified species, e.g. hg19, mm9, mm10, etc.
84              
85             =item parse_bed6<as_string>
86              
87             Reads a bed6 file and returns a feature array.
88              
89             =item Bio::ViennaNGS::FeatureChain->new()<as_method>
90              
91             Generates a new Bio::ViennaNGS::FeatureChain object from a feature array
92              
93             =item Bio::ViennaNGS(extend_chain)<as_method>
94              
95             Extends a Bio::ViennaNGS::FeatureChain object by given constraints
96              
97             =back
98              
99             =cut
100              
101             =head2 Sequence analysis
102              
103             We now generate FASTA files from the extended bed files using bedtools getfasta method.
104             C<my $bedtools = `bedtools getfasta -s -fi hg19_chromchecked.fa -bed $outfile -fo $name.ext$upstream\_fromStart_$into\_downstream.fa`;>
105             C<print STDERR "$bedtools\n" if $?;>
106             C<$bedtools = `bedtools getfasta -s -fi hg19_chromchecked.fa -bed $outfile2 -fo $name.ext$upstream\_upstream.fa`;>
107             C<print STDERR "$bedtools\n" if $?;>
108              
109             To analyze putative sequence motifs in the newly generated Fasta files, we use two approaches.
110             First we analyze the k-mer content with the Bio::ViennaNGS(kmer_enrichment) method for k-mers of length 6 to 8 nt.
111              
112             C<open(IN,"<","$name.ext$upstream\_fromStart_$into\_downstream.fa") || die ("Could not open $name.ext$upstream\_fromStart_$into\_downstream.fa!\n@!\n");>
113              
114             C<my @fastaseqs;>
115             C<while(<IN>){
116             chomp (my $raw = $_);
117             next if ($_ =~ /^>/);
118             push @fastaseqs, $raw;
119             }>
120             C<close(IN);>
121              
122             C<for (6..8){
123             my %kmer = %{kmer_enrichment(\@fastaseqs, $_)};
124             my $total = sum values %kmer;
125             ### Print Output
126             open(KMER,">","$_\_mers") or die "Could not open file $_\_mers$!\n";
127             print KMER "$_\-mer\tCount\tRatio\n";
128             print KMER "TOTAL\t$total\t1\n";
129             foreach my $key (sort {$kmer{$b} <=> $kmer{$a} } keys %kmer) {
130             my $ratio = nearest(.0001,$kmer{$key}/$total);
131             print KMER "$key\t$kmer{$key}\t$ratio\n";
132             }
133             close(KMER);
134             }>
135             =cut
136              
137             =head1
138              
139             In a second approach we run MEME to retrieve the 20 most over-represented motifs of length 8.
140             C<meme hg19_highexpressed.ext50_fromStart_10_downstream.fa -oc MEME_hg19_highexpressed.ext50_fromStart_10_downstream.fa -w 8 -dna -maxsize 1000000000 -nmotifs 20>
141              
142             Once the meme run is done, we want to have a nice figure which shows the e-value and site coverage of the top 10 motifs
143              
144             C<`./scripts/MEME_xml_motif_extractor.pl -f Example_Pipeline_meme.xml -r $RLIBPATH -t Example_Pipeline`>
145              
146             =cut
147              
148             =head1 AUTHOR
149              
150             Joerg Fallmann E<lt>joerg.fallmann@univie.ac.atE<gt>
151              
152             =cut
153              
154             ##################################END################################