File Coverage

lib/Pheno/Ranker/Metrics.pm

Criterion	Covered	Total	%
statement	73	85	85.8
branch	3	14	21.4
condition	0	6	0.0
subroutine	20	22	90.9
pod	0	7	0.0
total	96	134	71.6

line	stmt	bran	cond	sub	pod	time	code
1							package Pheno::Ranker::Metrics;
2
3	5			5		35	use strict;
	5					10
	5					227
4	5			5		25	use warnings;
	5					9
	5					250
5	5			5		23	use autodie;
	5					12
	5					46
6	5			5		33425	use feature qw(say);
	5					11
	5					937
7
8							#use Data::Dumper;
9	5			5		3510	use File::HomeDir;
	5					38443
	5					517
10	5			5		46	use File::Path qw(make_path);
	5					9
	5					370
11	5			5		34	use File::Spec::Functions qw(catdir);
	5					12
	5					315
12	5			5		3159	use Math::CDF qw(pnorm pbinom);
	5					20145
	5					609
13	5			5		3428	use Statistics::Descriptive;
	5					83331
	5					243
14
15	5			5		52	use Exporter 'import';
	5					9
	5					395
16							our @EXPORT =
17							qw(hd_fast jaccard_similarity jaccard_similarity_formatted estimate_hamming_stats z_score p_value_from_z_score _p_value add_stats);
18
19	5			5		34	use constant DEVEL_MODE => 0;
	5					29
	5					1171
20
21							# Define a hidden directory in the user's home for Inline's compiled code
22							my $inline_dir = catdir( File::HomeDir->my_home, '.Inline' );
23
24							# Create the directory if it does not exist
25							unless ( -d $inline_dir ) {
26							make_path($inline_dir) or die "Cannot create directory $inline_dir: $!";
27							}
28
29							# Configure Inline C to use this directory
30	5			5		4410	use Inline C => Config => directory => $inline_dir;
	5					309683
	5					49
31
32							# Inline C implementation using XS style (old-style syntax)
33	5			5		1403	use Inline C => <<'END_C';
	5					12
	5					47
34							#include "EXTERN.h"
35							#include "perl.h"
36							#include "XSUB.h"
37
38							SV* c_jaccard_similarity(char* str1, char* str2, int length) {
39							int union_count = 0, intersection = 0, i;
40							for(i = 0; i < length; i++){
41							if(str1[i] == '1' \|\| str2[i] == '1'){
42							union_count++;
43							if(str1[i] == '1' && str2[i] == '1')
44							intersection++;
45							}
46							}
47							double similarity = union_count ? ((double)intersection) / union_count : 0.0;
48
49							/* Create a new array (AV) and push the two results */
50							AV* av = newAV();
51							av_push(av, newSVnv(similarity)); /* push similarity (double) */
52							av_push(av, newSViv(intersection)); /* push intersection (int) */
53
54							/* Wrap the array in a reference and increment the reference count.
55							Do not call sv_2mortal on the resulting RV. */
56							SV* rv = newRV_inc((SV*)av);
57							return rv;
58							}
59
60							/*
61							This function computes the Hamming distance between two strings,
62							assuming they are both composed of '0' and '1' characters and have equal length.
63							It is defined as c_hd_fast to avoid a naming conflict with the Perl wrapper.
64							*/
65							int c_hd_fast(char* s1, char* s2, int len) {
66							int diff = 0;
67							int i;
68							for(i = 0; i < len; i++){
69							if (s1[i] != s2[i])
70							diff++;
71							}
72							return diff;
73							}
74							END_C
75
76							###########
77							# HAMMING #
78							###########
79
80							# Perl wrapper: calls the Inline C function "c_hd_fast"
81							sub hd_fast {
82	6746			6746	0	10020	my ( $s1, $s2 ) = @_;
83	6746	50				11410	die "Strings must be the same length" if length($s1) != length($s2);
84	6746					18862	return c_hd_fast( $s1, $s2, length($s1) );
85							}
86
87							# Original
88							sub _hd_fast {
89
90							# Hamming Distance
91	0			0		0	return ( $_[0] ^ $_[1] ) =~ tr/\001-\255//;
92							}
93
94							###########
95							# JACCARD #
96							###########
97
98							# Perl wrapper: calls the Inline C function "c_jaccard_similarity"
99							sub jaccard_similarity {
100	702			702	0	781	my ( $str1, $str2 ) = @_;
101	702					568	my $len = length($str1);
102	702	50				805	die "Strings must be of equal length" if $len != length($str2);
103							my ( $jaccard, $intersection ) =
104	702					528	@{ c_jaccard_similarity( $str1, $str2, $len ) };
	702					1513
105	702					918	return ( $jaccard, $intersection );
106							}
107
108							# Original (using vec)
109							sub _jaccard_similarity {
110	0			0		0	my ( $str1, $str2 ) = @_;
111	0					0	my $len = length($str1);
112	0	0				0	die "Strings must be of equal length" if $len != length($str2);
113	0					0	my ( $intersection, $union ) = ( 0, 0 );
114	0					0	for my $i ( 0 .. $len - 1 ) {
115	0					0	my $b1 = vec( $str1, $i, 8 );
116	0					0	my $b2 = vec( $str2, $i, 8 );
117	0	0	0			0	if ( $b1 == ord('1') \|\| $b2 == ord('1') ) {
118	0					0	$union++;
119	0	0	0			0	$intersection++ if ( $b1 == ord('1') && $b2 == ord('1') );
120							}
121							}
122	0	0				0	return $union == 0 ? ( 0, 0 ) : ( $intersection / $union, $intersection );
123							}
124
125							sub jaccard_similarity_formatted {
126
127							# * IMPORTANT **
128							# mrueda Dec-27-23
129							# Direct formatting in jaccard_similarity adds minor overhead (verified by testing),
130							# but prevents errors on some CPAN FreeBSD architectures.
131	630			630	0	668	my ( $result, undef ) = jaccard_similarity(@_);
132	630					1246	return sprintf( "%.6f", $result );
133							}
134
135							#########
136							# STATS #
137							#########
138
139							sub estimate_hamming_stats {
140
141							# Estimate Hamming stats using a binomial distribution model. Assumes each bit position
142							# in the binary strings has an independent 50% chance of mismatch, to calculate
143							# the mean and standard deviation of the Hamming distance.
144
145	72			72	0	144	my $length = shift;
146	72					97	my $probability_mismatch = 0.5;
147	72					158	my $estimated_average = $length * $probability_mismatch;
148	72					143	my $estimated_std_dev =
149							sqrt( $length * $probability_mismatch * ( 1 - $probability_mismatch ) );
150	72					391	return $estimated_average, $estimated_std_dev;
151							}
152
153							sub z_score {
154	216			216	0	375	my ( $observed_value, $expected_value, $std_dev ) = @_;
155	216	50				433	return 0 if $std_dev == 0;
156	216					438	return ( $observed_value - $expected_value ) / $std_dev;
157							}
158
159							sub p_value_from_z_score {
160	144			144	0	658	return pnorm(shift) # One-tailed test
161							}
162
163							#sub _p_value {
164							# my ( $hamming_distance, $string_length ) = @_;
165							# my $probability_mismatch = 0.5;
166							# return 2 * (1 - pbinom($hamming_distance - 1, $string_length, $probability_mismatch))
167							#}
168
169							sub add_stats {
170	4			4	0	7	my $array = shift;
171	4					6	my %hash_out;
172	4					36	my $stat = Statistics::Descriptive::Full->new();
173	4					392	$stat->add_data($array);
174	4					691	$hash_out{mean} = $stat->mean();
175	4					33	$hash_out{sd} = $stat->standard_deviation();
176	4					233	$hash_out{count} = $stat->count();
177	4					38	$hash_out{per25} = $stat->percentile(25);
178	4					863	$hash_out{per75} = $stat->percentile(75);
179	4					123	$hash_out{min} = $stat->min();
180	4					29	$hash_out{max} = $stat->max();
181	4					37	$hash_out{median} = $stat->median();
182	4					219	$hash_out{sum} = $stat->sum();
183
184	4					56	return \%hash_out;
185							}
186							1;