line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2014-12-20 00:41:39 mtw> |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::Tutorial; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1806
|
use 5.12.0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
51
|
|
7
|
1
|
|
|
1
|
|
7
|
use Exporter; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
46
|
|
8
|
1
|
|
|
1
|
|
6
|
use version; our $VERSION = qv('0.12_07'); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
9
|
1
|
|
|
1
|
|
90
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
31
|
|
10
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
96
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
13
|
|
|
|
|
|
|
our @EXPORT = (); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw (); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
1; |
18
|
|
|
|
|
|
|
__END__ |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 NAME |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Tutorial_pipeline01.pl - An example pipeline for the ViennaNGS toolbox |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head2 SYNOPSIS |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
./Pipeline.pl path/to/R/libraries |
27
|
|
|
|
|
|
|
Where path/to/R/libraries should point to the directory containing ggplot2 |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head2 Pipeline |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
This script is a showcase for the usage of ViennaNGS in a real NGS example. |
32
|
|
|
|
|
|
|
We start from a file containing ENSEMBL annotation information for human protein-coding genes. |
33
|
|
|
|
|
|
|
We are insterested in finding sequence motifs in close proximity to the gene start (50nt upstream, 10nt into the gene). |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
The first step is to initialize some variables and generate a chromosome_sizes hash. |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
C<my $bed = 'hg19_highlyexpressed.bed';> |
38
|
|
|
|
|
|
|
C<my $name = (split(/\./,$bed))[0];> |
39
|
|
|
|
|
|
|
C<my $upstream = 50;> |
40
|
|
|
|
|
|
|
C<my $into = 10;> |
41
|
|
|
|
|
|
|
C<my $outfile = "$name.ext$upstream\_fromStart_$into\_downstream.bed";> |
42
|
|
|
|
|
|
|
C<my $outfile2 = "$name.ext$upstream\_upstream.bed";> |
43
|
|
|
|
|
|
|
C<my %sizes = %{fetch_chrom_sizes('hg19')}; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=cut |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head2 Generate a Bio::ViennaNGS::FeatureChain object |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The bed file of interest is parsed, a feature array is generated and passed on to Bio::ViennaNGS::FeatureChain, which creates a new Moose Object of type FeatureChain, containing the original bed entries |
50
|
|
|
|
|
|
|
C<my @featurelist = @{parse_bed6($bed)}; |
51
|
|
|
|
|
|
|
### Now we create a Bio::ViennaNGS::FeatureChain from the featurelist above |
52
|
|
|
|
|
|
|
my $chain = Bio::ViennaNGS::FeatureChain->new('type'=>'original','chain'=>\@featurelist);> |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=cut |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head2 Extend the existing chain for motif analysis |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
The newly created FeatureChain object will now be extended 50nt upstream of the gene start and 10nt into the gene, to retrieve a bed file which contains the putative sequence motifs. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
C<my $extended_chain = extend_chain(\%sizes,$chain,0,$into,$upstream,0);> |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
For later purposes we also extend the whole U6 gene span 50nt upstream. |
63
|
|
|
|
|
|
|
C<my $extended_chain2 = extend_chain(\%sizes,$chain,$upstream,0,0,0);> |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=cut |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head2 Print extended Bio::ViennaNGS::FeatureChain objects to files |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Extended chains are now print out to make them available for external tools like bedtools. |
70
|
|
|
|
|
|
|
C<my $out = $extended_chain->print_chain(); |
71
|
|
|
|
|
|
|
print $Out $out; |
72
|
|
|
|
|
|
|
$out = $extended_chain2->print_chain(); |
73
|
|
|
|
|
|
|
print $Out2 $out;> |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=cut |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head3 Summary of so far used methods |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=over 4 |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item fetch_chrom_sizes<as_string> |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Returns a chromosome-sizes hash reference for the specified species, e.g. hg19, mm9, mm10, etc. |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=item parse_bed6<as_string> |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
Reads a bed6 file and returns a feature array. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item Bio::ViennaNGS::FeatureChain->new()<as_method> |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Generates a new Bio::ViennaNGS::FeatureChain object from a feature array |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item Bio::ViennaNGS(extend_chain)<as_method> |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Extends a Bio::ViennaNGS::FeatureChain object by given constraints |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=back |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=cut |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=head2 Sequence analysis |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
We now generate FASTA files from the extended bed files using bedtools getfasta method. |
104
|
|
|
|
|
|
|
C<my $bedtools = `bedtools getfasta -s -fi hg19_chromchecked.fa -bed $outfile -fo $name.ext$upstream\_fromStart_$into\_downstream.fa`;> |
105
|
|
|
|
|
|
|
C<print STDERR "$bedtools\n" if $?;> |
106
|
|
|
|
|
|
|
C<$bedtools = `bedtools getfasta -s -fi hg19_chromchecked.fa -bed $outfile2 -fo $name.ext$upstream\_upstream.fa`;> |
107
|
|
|
|
|
|
|
C<print STDERR "$bedtools\n" if $?;> |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
To analyze putative sequence motifs in the newly generated Fasta files, we use two approaches. |
110
|
|
|
|
|
|
|
First we analyze the k-mer content with the Bio::ViennaNGS(kmer_enrichment) method for k-mers of length 6 to 8 nt. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
C<open(IN,"<","$name.ext$upstream\_fromStart_$into\_downstream.fa") || die ("Could not open $name.ext$upstream\_fromStart_$into\_downstream.fa!\n@!\n");> |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
C<my @fastaseqs;> |
115
|
|
|
|
|
|
|
C<while(<IN>){ |
116
|
|
|
|
|
|
|
chomp (my $raw = $_); |
117
|
|
|
|
|
|
|
next if ($_ =~ /^>/); |
118
|
|
|
|
|
|
|
push @fastaseqs, $raw; |
119
|
|
|
|
|
|
|
}> |
120
|
|
|
|
|
|
|
C<close(IN);> |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
C<for (6..8){ |
123
|
|
|
|
|
|
|
my %kmer = %{kmer_enrichment(\@fastaseqs, $_)}; |
124
|
|
|
|
|
|
|
my $total = sum values %kmer; |
125
|
|
|
|
|
|
|
### Print Output |
126
|
|
|
|
|
|
|
open(KMER,">","$_\_mers") or die "Could not open file $_\_mers$!\n"; |
127
|
|
|
|
|
|
|
print KMER "$_\-mer\tCount\tRatio\n"; |
128
|
|
|
|
|
|
|
print KMER "TOTAL\t$total\t1\n"; |
129
|
|
|
|
|
|
|
foreach my $key (sort {$kmer{$b} <=> $kmer{$a} } keys %kmer) { |
130
|
|
|
|
|
|
|
my $ratio = nearest(.0001,$kmer{$key}/$total); |
131
|
|
|
|
|
|
|
print KMER "$key\t$kmer{$key}\t$ratio\n"; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
close(KMER); |
134
|
|
|
|
|
|
|
}> |
135
|
|
|
|
|
|
|
=cut |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head1 |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
In a second approach we run MEME to retrieve the 20 most over-represented motifs of length 8. |
140
|
|
|
|
|
|
|
C<meme hg19_highexpressed.ext50_fromStart_10_downstream.fa -oc MEME_hg19_highexpressed.ext50_fromStart_10_downstream.fa -w 8 -dna -maxsize 1000000000 -nmotifs 20> |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Once the meme run is done, we want to have a nice figure which shows the e-value and site coverage of the top 10 motifs |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
C<`./scripts/MEME_xml_motif_extractor.pl -f Example_Pipeline_meme.xml -r $RLIBPATH -t Example_Pipeline`> |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=cut |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=head1 AUTHOR |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
Joerg Fallmann E<lt>joerg.fallmann@univie.ac.atE<gt> |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=cut |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
##################################END################################ |