File Coverage

blib/lib/Bio/ViennaNGS/Tutorial.pm

Criterion	Covered	Total	%
statement	15	15	100.0
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	20	20	100.0

line	stmt	sub	time	code
1				# --CPerl--
2				# Last changed Time-stamp: <2014-12-20 00:41:39 mtw>
3
4				package Bio::ViennaNGS::Tutorial;
5
6	1	1	1806	use 5.12.0;
	1		2
	1		51
7	1	1	7	use Exporter;
	1		1
	1		46
8	1	1	6	use version; our $VERSION = qv('0.12_07');
	1		2
	1		25
9	1	1	90	use strict;
	1		1
	1		31
10	1	1	6	use warnings;
	1		1
	1		96
11
12				our @ISA = qw(Exporter);
13				our @EXPORT = ();
14
15				our @EXPORT_OK = qw ();
16
17				1;
18				__END__
19
20				=head1 NAME
21
22				Tutorial_pipeline01.pl - An example pipeline for the ViennaNGS toolbox
23
24				=head2 SYNOPSIS
25
26				./Pipeline.pl path/to/R/libraries
27				Where path/to/R/libraries should point to the directory containing ggplot2
28
29				=head2 Pipeline
30
31				This script is a showcase for the usage of ViennaNGS in a real NGS example.
32				We start from a file containing ENSEMBL annotation information for human protein-coding genes.
33				We are insterested in finding sequence motifs in close proximity to the gene start (50nt upstream, 10nt into the gene).
34
35				The first step is to initialize some variables and generate a chromosome_sizes hash.
36
37				C<my $bed = 'hg19_highlyexpressed.bed';>
38				C<my $name = (split(/\./,$bed))[0];>
39				C<my $upstream = 50;>
40				C<my $into = 10;>
41				C<my $outfile = "$name.ext$upstream\_fromStart_$into\_downstream.bed";>
42				C<my $outfile2 = "$name.ext$upstream\_upstream.bed";>
43				C<my %sizes = %{fetch_chrom_sizes('hg19')};
44
45				=cut
46
47				=head2 Generate a Bio::ViennaNGS::FeatureChain object
48
49				The bed file of interest is parsed, a feature array is generated and passed on to Bio::ViennaNGS::FeatureChain, which creates a new Moose Object of type FeatureChain, containing the original bed entries
50				C<my @featurelist = @{parse_bed6($bed)};
51				### Now we create a Bio::ViennaNGS::FeatureChain from the featurelist above
52				my $chain = Bio::ViennaNGS::FeatureChain->new('type'=>'original','chain'=>\@featurelist);>
53
54				=cut
55
56				=head2 Extend the existing chain for motif analysis
57
58				The newly created FeatureChain object will now be extended 50nt upstream of the gene start and 10nt into the gene, to retrieve a bed file which contains the putative sequence motifs.
59
60				C<my $extended_chain = extend_chain(\%sizes,$chain,0,$into,$upstream,0);>
61
62				For later purposes we also extend the whole U6 gene span 50nt upstream.
63				C<my $extended_chain2 = extend_chain(\%sizes,$chain,$upstream,0,0,0);>
64
65				=cut
66
67				=head2 Print extended Bio::ViennaNGS::FeatureChain objects to files
68
69				Extended chains are now print out to make them available for external tools like bedtools.
70				C<my $out = $extended_chain->print_chain();
71				print $Out $out;
72				$out = $extended_chain2->print_chain();
73				print $Out2 $out;>
74
75				=cut
76
77				=head3 Summary of so far used methods
78
79				=over 4
80
81				=item fetch_chrom_sizes<as_string>
82
83				Returns a chromosome-sizes hash reference for the specified species, e.g. hg19, mm9, mm10, etc.
84
85				=item parse_bed6<as_string>
86
87				Reads a bed6 file and returns a feature array.
88
89				=item Bio::ViennaNGS::FeatureChain->new()<as_method>
90
91				Generates a new Bio::ViennaNGS::FeatureChain object from a feature array
92
93				=item Bio::ViennaNGS(extend_chain)<as_method>
94
95				Extends a Bio::ViennaNGS::FeatureChain object by given constraints
96
97				=back
98
99				=cut
100
101				=head2 Sequence analysis
102
103				We now generate FASTA files from the extended bed files using bedtools getfasta method.
104				C<my $bedtools = `bedtools getfasta -s -fi hg19_chromchecked.fa -bed $outfile -fo $name.ext$upstream\_fromStart_$into\_downstream.fa`;>
105				C<print STDERR "$bedtools\n" if $?;>
106				C<$bedtools = `bedtools getfasta -s -fi hg19_chromchecked.fa -bed $outfile2 -fo $name.ext$upstream\_upstream.fa`;>
107				C<print STDERR "$bedtools\n" if $?;>
108
109				To analyze putative sequence motifs in the newly generated Fasta files, we use two approaches.
110				First we analyze the k-mer content with the Bio::ViennaNGS(kmer_enrichment) method for k-mers of length 6 to 8 nt.
111
112				C<open(IN,"<","$name.ext$upstream\_fromStart_$into\_downstream.fa") \|\| die ("Could not open $name.ext$upstream\_fromStart_$into\_downstream.fa!\n@!\n");>
113
114				C<my @fastaseqs;>
115				C<while(<IN>){
116				chomp (my $raw = $_);
117				next if ($_ =~ /^>/);
118				push @fastaseqs, $raw;
119				}>
120				C<close(IN);>
121
122				C<for (6..8){
123				my %kmer = %{kmer_enrichment(\@fastaseqs, $_)};
124				my $total = sum values %kmer;
125				### Print Output
126				open(KMER,">","$_\_mers") or die "Could not open file $_\_mers$!\n";
127				print KMER "$_\-mer\tCount\tRatio\n";
128				print KMER "TOTAL\t$total\t1\n";
129				foreach my $key (sort {$kmer{$b} <=> $kmer{$a} } keys %kmer) {
130				my $ratio = nearest(.0001,$kmer{$key}/$total);
131				print KMER "$key\t$kmer{$key}\t$ratio\n";
132				}
133				close(KMER);
134				}>
135				=cut
136
137				=head1
138
139				In a second approach we run MEME to retrieve the 20 most over-represented motifs of length 8.
140				C<meme hg19_highexpressed.ext50_fromStart_10_downstream.fa -oc MEME_hg19_highexpressed.ext50_fromStart_10_downstream.fa -w 8 -dna -maxsize 1000000000 -nmotifs 20>
141
142				Once the meme run is done, we want to have a nice figure which shows the e-value and site coverage of the top 10 motifs
143
144				C<`./scripts/MEME_xml_motif_extractor.pl -f Example_Pipeline_meme.xml -r $RLIBPATH -t Example_Pipeline`>
145
146				=cut
147
148				=head1 AUTHOR
149
150				Joerg Fallmann E<lt>joerg.fallmann@univie.ac.atE<gt>
151
152				=cut
153
154				##################################END################################