File Coverage

lib/Pheno/Ranker/Stats.pm
Criterion Covered Total %
statement 65 65 100.0
branch 7 10 70.0
condition 6 6 100.0
subroutine 16 16 100.0
pod 0 7 0.0
total 94 104 90.3


line stmt bran cond sub pod time code
1             package Pheno::Ranker::Stats;
2              
3 5     5   25 use strict;
  5         8  
  5         139  
4 5     5   16 use warnings;
  5         6  
  5         216  
5 5     5   19 use autodie;
  5         6  
  5         45  
6 5     5   17861 use feature qw(say);
  5         9  
  5         571  
7 5     5   23 use Data::Dumper;
  5         6  
  5         298  
8 5     5   2223 use Math::CDF qw(pnorm pbinom);
  5         13589  
  5         353  
9 5     5   2368 use Statistics::Descriptive;
  5         61017  
  5         145  
10              
11 5     5   28 use Exporter 'import';
  5         5  
  5         255  
12             our @EXPORT =
13             qw(hd_fast jaccard_similarity jaccard_similarity_formatted estimate_hamming_stats z_score p_value_from_z_score _p_value add_stats);
14              
15 5     5   19 use constant DEVEL_MODE => 0;
  5         7  
  5         2652  
16              
17             sub hd_fast {
18              
19             # Hamming Distance
20 6746     6746 0 8231 return ( $_[0] ^ $_[1] ) =~ tr/\001-\255//;
21             }
22              
23             sub jaccard_similarity {
24              
25 702     702 0 685 my ( $str1, $str2 ) = @_;
26              
27             # Initialize intersection and union counts
28 702         647 my ( $intersection, $union ) = ( 0, 0 );
29              
30             # Ensure both strings are of the same length
31 702         571 my $length = length($str1);
32 702 50       835 die "Strings must be of the same length" unless $length == length($str2);
33              
34 702         779 for my $i ( 0 .. $length - 1 ) {
35 111906         89218 my $char1 = substr( $str1, $i, 1 );
36 111906         90645 my $char2 = substr( $str2, $i, 1 );
37              
38             # Increment union if either character is '1'
39 111906 100 100     159237 if ( $char1 eq '1' || $char2 eq '1' ) {
40 56822         38429 $union++;
41             # Increment intersection if both characters are '1'
42 56822 100 100     92357 $intersection++ if ( $char1 eq '1' && $char2 eq '1' );
43             }
44             }
45              
46             # Calculate Jaccard similarity
47 702 50       939 my $jaccard = $union == 0 ? 0 : $intersection / $union;
48              
49             # Return both intersection and Jaccard similarity
50 702         1084 return ($jaccard, $intersection);
51             }
52              
53             sub jaccard_similarity_formatted {
54              
55             # *** IMPORTANT ****
56             # mrueda Dec-27-23
57             # Direct formatting in jaccard_similarity adds minor overhead (verified by testing),
58             # but prevents errors on some CPAN FreeBSD architectures.
59 630     630 0 699 my ($result, undef) = jaccard_similarity(@_);
60 630         1874 return sprintf( "%.6f", $result );
61             }
62              
63             sub estimate_hamming_stats {
64              
65             # Estimate Hamming stats using a binomial distribution model. Assumes each bit position
66             # in the binary strings has an independent 50% chance of mismatch, to calculate
67             # the mean and standard deviation of the Hamming distance.
68              
69 72     72 0 85 my $length = shift;
70 72         75 my $probability_mismatch = 0.5;
71 72         91 my $estimated_average = $length * $probability_mismatch;
72 72         91 my $estimated_std_dev =
73             sqrt( $length * $probability_mismatch * ( 1 - $probability_mismatch ) );
74 72         174 return $estimated_average, $estimated_std_dev;
75             }
76              
77             sub z_score {
78              
79 216     216 0 219 my ( $observed_value, $expected_value, $std_dev ) = @_;
80 216 50       229 return 0 if $std_dev == 0;
81 216         265 return ( $observed_value - $expected_value ) / $std_dev;
82             }
83              
84             sub p_value_from_z_score {
85              
86 144     144 0 393 return pnorm(shift) # One-tailed test
87             }
88              
89             #sub _p_value {
90             #
91             # my ( $hamming_distance, $string_length ) = @_;
92             # my $probability_mismatch = 0.5;
93             # return 2 * (1 - pbinom($hamming_distance - 1, $string_length, $probability_mismatch))
94             #}
95              
96             sub add_stats {
97              
98 4     4 0 8 my $array = shift;
99 4         5 my $hash_out;
100 4         33 my $stat = Statistics::Descriptive::Full->new();
101 4         285 $stat->add_data($array);
102 4         555 $hash_out->{mean} = $stat->mean();
103 4         27 $hash_out->{sd} = $stat->standard_deviation();
104 4         142 $hash_out->{count} = $stat->count();
105 4         21 $hash_out->{per25} = $stat->percentile(25);
106 4         612 $hash_out->{per75} = $stat->percentile(75);
107 4         86 $hash_out->{min} = $stat->min();
108 4         21 $hash_out->{max} = $stat->max();
109 4         19 $hash_out->{median} = $stat->median();
110 4         172 $hash_out->{sum} = $stat->sum();
111              
112 4         36 return $hash_out;
113             }
114             1;