|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Pheno::Ranker::Stats;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
20
 | 
 use strict;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
90
 | 
    | 
| 
4
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
16
 | 
 use warnings;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
    | 
| 
5
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
14
 | 
 use autodie;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
    | 
| 
6
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
16675
 | 
 use feature qw(say);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
298
 | 
    | 
| 
7
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
20
 | 
 use Data::Dumper;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
228
 | 
    | 
| 
8
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
1488
 | 
 use Math::CDF qw(pnorm pbinom);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9160
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
200
 | 
    | 
| 
9
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
1658
 | 
 use Statistics::Descriptive;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
42600
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
108
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
21
 | 
 use Exporter 'import';  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
160
 | 
    | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT =  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   qw(hd_fast jaccard_similarity estimate_hamming_stats z_score p_value_from_z_score _p_value add_stats);  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
15
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
19
 | 
 use constant DEVEL_MODE => 0;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1578
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub hd_fast {  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Hamming Distance  | 
| 
20
 | 
2892
 | 
 
 | 
 
 | 
  
2892
  
 | 
  
0
  
 | 
4868
 | 
     return ( $_[0] ^ $_[1] ) =~ tr/\001-\255//;  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub jaccard_similarity {  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
25
 | 
36
 | 
 
 | 
 
 | 
  
36
  
 | 
  
0
  
 | 
51
 | 
     my ( $str1, $str2 ) = @_;  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # NB: It does not take into account 0 --- 0  | 
| 
28
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
49
 | 
     my ( $intersection, $union ) = ( 0, 0 );  | 
| 
29
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
     for my $i ( 0 .. length($str1) - 1 ) {  | 
| 
30
 | 
6012
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6654
 | 
         my $char1 = substr( $str1, $i, 1 );  | 
| 
31
 | 
6012
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5945
 | 
         my $char2 = substr( $str2, $i, 1 );  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
33
 | 
6012
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
11407
 | 
         if ( $char1 eq '1' || $char2 eq '1' ) {  | 
| 
34
 | 
3290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3141
 | 
             $union++;  | 
| 
35
 | 
3290
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
7075
 | 
             $intersection++ if $char1 eq '1' && $char2 eq '1';  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
38
 | 
36
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
114
 | 
     return $union == 0 ? 0 : $intersection / $union;  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub estimate_hamming_stats {  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
43
 | 
36
 | 
 
 | 
 
 | 
  
36
  
 | 
  
0
  
 | 
52
 | 
     my $length               = shift;  | 
| 
44
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
     my $probability_mismatch = 0.5;  | 
| 
45
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
67
 | 
     my $estimated_average    = $length * $probability_mismatch;  | 
| 
46
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
     my $estimated_std_dev =  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       sqrt( $length * $probability_mismatch * ( 1 - $probability_mismatch ) );  | 
| 
48
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
99
 | 
     return $estimated_average, $estimated_std_dev;  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub z_score {  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
53
 | 
108
 | 
 
 | 
 
 | 
  
108
  
 | 
  
0
  
 | 
145
 | 
     my ( $observed_value, $expected_value, $std_dev ) = @_;  | 
| 
54
 | 
108
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
171
 | 
     return 0 if $std_dev == 0;  | 
| 
55
 | 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
174
 | 
     return ( $observed_value - $expected_value ) / $std_dev;  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub p_value_from_z_score {  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
72
 | 
 
 | 
 
 | 
  
72
  
 | 
  
0
  
 | 
300
 | 
     return pnorm(shift)    # One-tailed test  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #sub _p_value {  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #    my ( $hamming_distance, $string_length ) = @_;  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #    my $probability_mismatch = 0.5;  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #    return 2 * (1 - pbinom($hamming_distance - 1, $string_length, $probability_mismatch))  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #}  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub add_stats {  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
72
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
5
 | 
     my $array = shift;  | 
| 
73
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my $hash_out;  | 
| 
74
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     my $stat = Statistics::Descriptive::Full->new();  | 
| 
75
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
217
 | 
     $stat->add_data($array);  | 
| 
76
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
378
 | 
     $hash_out->{mean}   = $stat->mean();  | 
| 
77
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
     $hash_out->{sd}     = $stat->standard_deviation();  | 
| 
78
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
105
 | 
     $hash_out->{count}  = $stat->count();  | 
| 
79
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
17
 | 
     $hash_out->{per25}  = $stat->percentile(25);  | 
| 
80
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
427
 | 
     $hash_out->{per75}  = $stat->percentile(75);  | 
| 
81
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
     $hash_out->{min}    = $stat->min();  | 
| 
82
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     $hash_out->{max}    = $stat->max();  | 
| 
83
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     $hash_out->{median} = $stat->median();  | 
| 
84
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
125
 | 
     $hash_out->{sum}    = $stat->sum();  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
     return $hash_out;  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |