File Coverage

blib/lib/Bio/RNA/SpliceSites/Scoring/MaxEntScan.pm
Criterion Covered Total %
statement 21 93 22.5
branch 0 32 0.0
condition n/a
subroutine 7 19 36.8
pod 12 12 100.0
total 40 156 25.6


line stmt bran cond sub pod time code
1             package Bio::RNA::SpliceSites::Scoring::MaxEntScan;
2              
3 1     1   13479 use 5.008;
  1         2  
  1         31  
4 1     1   4 use strict;
  1         1  
  1         24  
5 1     1   3 use warnings;
  1         4  
  1         26  
6 1     1   3 use Carp;
  1         1  
  1         61  
7              
8             #Data submodules; these names are from the original maxEntScan distribution.
9 1     1   114076 use Bio::RNA::SpliceSites::Scoring::me2x3acc;
  1         29  
  1         2820  
10 1     1   9380 use Bio::RNA::SpliceSites::Scoring::me2x5;
  1         5  
  1         53  
11 1     1   12427 use Bio::RNA::SpliceSites::Scoring::splice5sequences;
  1         21  
  1         1390  
12              
13             require Exporter;
14              
15             our @ISA = qw/ Exporter /;
16             our $VERSION = '0.05';
17              
18             my $functions = [ qw/ score5 score3 / ];
19             our %EXPORT_TAGS = ( 'all' => $functions , );
20             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
21             our $VERSION = '0.04';
22              
23             sub log2 {
24 0     0 1   my $number = shift;
25 0           log($number)/log(2);
26             }
27              
28             sub is_scalar_reference {
29 0     0 1   my $reference_to_validate = shift;
30 0 0         ref( $reference_to_validate ) eq "SCALAR" ? 1 : 0;
31             }
32              
33             sub is_kmer {
34 0     0 1   my ( $reference_to_validate , $valid_length ) = @_;
35 0 0         length $$reference_to_validate == $valid_length ? 1 : 0;
36             }
37              
38             sub is_genetic_alphabet {
39 0     0 1   my $reference_to_validate = shift;
40 0 0         $$reference_to_validate =~ /^[ACTGactg]+$/ ? 1 : 0;
41             }
42              
43             sub split_sequence {
44             #Split the provided splice site sequence into the splice donor/acceptor dinucleotide and the concatenated remainder.
45 0     0 1   my ( $reference_to_sequence , $splice_site_type ) = @_;
46 0           my @sequence_array = split // , uc( $$reference_to_sequence );
47 0           my ( $dinucleotide , $outer_portion );
48 0 0         if ( $splice_site_type == 5 ) {
    0          
49 0           $dinucleotide = join "" , @sequence_array[3..4]; #Positions 4 and 5 are the splice donor dinucleotide.
50 0           $outer_portion = ( join "" , @sequence_array[0..2] ) . ( join "" , @sequence_array[5..8] );
51             }
52             elsif ( $splice_site_type == 3 ) {
53 0           $dinucleotide = join "" , @sequence_array[18..19]; #Positions 19 and 20 are the splice acceptor dinucleotide.
54 0           $outer_portion = ( join "" , @sequence_array[0..17] ) . ( join "" , @sequence_array[20..22] ); #Join nucleotides 1-18 and 21-23.
55             }
56             else {
57 0           croak "Invalid type of splice site to split: must be either 5 or 3.\n";
58             }
59 0           return ( $dinucleotide , $outer_portion );
60             }
61              
62             sub get_splice_5_sequence_matrix_value {
63 0     0 1   my $outer_portion = shift;
64 0 0         exists $Bio::RNA::SpliceSites::Scoring::SpliceModels::splice5sequences::table->{ $outer_portion } ?
65             return $Bio::RNA::SpliceSites::Scoring::SpliceModels::splice5sequences::table->{ $outer_portion } : carp "Unable to find sequence matrix for key $outer_portion.\n";
66             }
67              
68             sub get_splice_5_score_matrix_value {
69 0     0 1   my $sequence_matrix_value = shift; #The term "matrix" was used in the original maxEntScan programming and is retained here for clarity.
70 0 0         exists $Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x5::table->[ $sequence_matrix_value ] ?
71             return $Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x5::table->[ $sequence_matrix_value ] : carp "Index out of score matrix range: $sequence_matrix_value\n";
72             }
73              
74             sub score_consensus {
75 0     0 1   my ( $dinucleotide , $type ) = @_;
76 0           my %bgd = ( 'A' => 0.27 , 'C' => 0.23 , 'G' => 0.23 , 'T' => 0.27 ); #Shared between each type of splice site.
77 0           my ( %cons1 , %cons2 ); #Populate conditional to splice site type: donor or acceptor.
78              
79 0 0         if ( $type == 5 ) {
    0          
80 0 0         return 15.7507349436393 if $dinucleotide eq 'GT'; #Short circuit for the perfect GT splice donor dinucleotide.
81 0           %cons1 = ( 'A' => 0.004 , 'C' => 0.0032 , 'G' => 0.9896 , 'T' => 0.0032 );
82 0           %cons2 = ( 'A' => 0.0034 , 'C' => 0.0039 , 'G' => 0.0042 , 'T' => 0.9884 );
83             }
84             elsif ( $type == 3 ) {
85             #return ????? if $dinucleotide eq 'AG'; #Short circuit for perfect AG splice acceptor dinucleotide.
86 0           %cons1 = ( 'A' => 0.9903 , 'C' => 0.0032 , 'G' => 0.0034 , 'T' => 0.003 );
87 0           %cons2 = ( 'A' => 0.0027 , 'C' => 0.0037 , 'G' => 0.9905 , 'T' => 0.003 );
88             }
89             else {
90 0           croak "Invalid type of splice site to score consensus for: must be either 5 or 3.\n";
91             }
92              
93 0           my ( $first_nucleotide , $second_nucleotide ) = split // , $dinucleotide;
94 0           return ( $cons1{$first_nucleotide} * $cons2{$second_nucleotide} ) / ( $bgd{$first_nucleotide} * $bgd{$second_nucleotide} );
95             }
96              
97             sub score5 {
98 0     0 1   my $sequence_reference = shift;
99             #Validate argument:
100 0 0         unless ( is_scalar_reference( $sequence_reference ) ) {
101 0           carp "Not a scalar reference.\n";
102 0           return 'invalid_invocation';
103             }
104 0 0         unless ( is_kmer( $sequence_reference , 9 ) ) {
105 0           carp "Invalid 5'ss length: must be 9 nucleotides long.\n";
106 0           return 'invalid_length';
107             }
108 0 0         unless ( is_genetic_alphabet( $sequence_reference ) ) {
109 0           carp "Invalid alphabet: must be only [ACTGactg] with no 'n' nucleotides.\n";
110 0           return 'invalid_alphabet';
111             }
112              
113 0           my ( $dinucleotide , $outer_portion ) = split_sequence( $sequence_reference , 5 );
114              
115             #Compute the log2 of the product of the score_5_consensus() subroutine return for the entire sequence (left_side_of_product)
116             # and the me2x5 (score matrix) value for the sequence matrix value of the "outer portion" of the splice site, which is the heptamer with the donor dinucleotide removed
117             # from the splice site nonamer, which is assigned to the scalar variable $outer_portion
118              
119 0           my $score = log2( score_consensus( $dinucleotide , 5 ) * get_splice_5_score_matrix_value( get_splice_5_sequence_matrix_value( $outer_portion ) ) );
120 0           return sprintf( "%.2f" , $score ); #2 decimals.
121             }
122              
123             sub hash_seq {
124             #Returns a hash key for a sequence as a 4-radix integer.
125             #E.g. given sequence 'CAGAAGT', returns 4619 as a scalar.
126 0     0 1   my $sequence = shift;
127 0           $sequence=~ y/ACGT/0123/;
128 0           my @sequence_array = split // , $sequence;
129 0           my $sum = 0;
130 0           my $end = length( $sequence ) - 1;
131              
132 0           my @four_radix = qw/ 1 4 16 64 256 1024 4096 16384 /;
133 0           $sum += $sequence_array[$_] * $four_radix[ $end - $_ ] for 0 .. $end;
134 0           return $sum;
135             }
136              
137             sub get_max_ent_score {
138 0     0 1   my ( $sequence , $table_ref ) = @_; #Table ref is a reference to an array of references to hashes prepared in the me2x3acc submodule.
139 0           my @partial_score = ( $table_ref->[0]{ hash_seq( substr $sequence , 0 , 7 ) } ,
140             $table_ref->[1]{ hash_seq( substr $sequence , 7 , 7 ) } ,
141             $table_ref->[2]{ hash_seq( substr $sequence , 14 , 7 ) } ,
142             $table_ref->[3]{ hash_seq( substr $sequence , 4 , 7 ) } ,
143             $table_ref->[4]{ hash_seq( substr $sequence , 11 , 7 ) } ,
144             $table_ref->[5]{ hash_seq( substr $sequence , 4 , 3 ) } ,
145             $table_ref->[6]{ hash_seq( substr $sequence , 7 , 4 ) } ,
146             $table_ref->[7]{ hash_seq( substr $sequence , 11 , 3 ) } ,
147             $table_ref->[8]{ hash_seq( substr $sequence , 14 , 4 ) } );
148 0           my $final_score = $partial_score[0] * $partial_score[1] * $partial_score[2] * $partial_score[3] * $partial_score[4] /
149             ( $partial_score[5] * $partial_score[6] * $partial_score[7] * $partial_score[8] );
150 0           return $final_score;
151             }
152              
153             sub score3 {
154 0     0 1   my $sequence_reference = shift; #Should be 23nt long.
155             #Validate argument:
156 0 0         unless ( is_scalar_reference( $sequence_reference ) ) {
157 0           carp "Not a scalar reference.\n";
158 0           return 'invalid_invocation';
159             }
160 0 0         unless ( is_kmer( $sequence_reference , 23 ) ) {
161 0           carp "Invalid 3'ss length: must be 23nt long.\n";
162 0           return 'invalid_length';
163             }
164 0 0         unless ( is_genetic_alphabet( $sequence_reference ) ) {
165 0           carp "Invalid alphabet: must be only [ACTGactg] with no 'n' nucleotides.\n";
166 0           return 'invalid_alphabet';
167             }
168              
169 0           my ( $dinucleotide , $outer_portion ) = split_sequence( $sequence_reference , 3 );
170 0           my $score = log2( score_consensus( $dinucleotide , 3 ) * get_max_ent_score( $outer_portion , $Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x3acc::table ) );
171 0           return sprintf( "%.2f" , $score ); #2 decimals.
172             }
173              
174             1;
175              
176             __END__