| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#ifndef _GNU_SOURCE |
|
2
|
|
|
|
|
|
|
#define _GNU_SOURCE /* glibc / Linux */ |
|
3
|
|
|
|
|
|
|
#endif |
|
4
|
|
|
|
|
|
|
#ifndef __EXTENSIONS__ |
|
5
|
|
|
|
|
|
|
#define __EXTENSIONS__ 1 /* Solaris/illumos: expose off64_t, sigjmp_buf under -std=c99 */ |
|
6
|
|
|
|
|
|
|
#endif |
|
7
|
|
|
|
|
|
|
#define PERL_NO_GET_CONTEXT |
|
8
|
|
|
|
|
|
|
#include "EXTERN.h" |
|
9
|
|
|
|
|
|
|
#include "perl.h" |
|
10
|
|
|
|
|
|
|
#include "XSUB.h" |
|
11
|
|
|
|
|
|
|
#include "ppport.h" |
|
12
|
|
|
|
|
|
|
#include |
|
13
|
|
|
|
|
|
|
#include |
|
14
|
|
|
|
|
|
|
#include |
|
15
|
|
|
|
|
|
|
#include |
|
16
|
|
|
|
|
|
|
#include |
|
17
|
|
|
|
|
|
|
#include |
|
18
|
|
|
|
|
|
|
#include /* uint64_t — harmless if perl.h already pulled it in */ |
|
19
|
|
|
|
|
|
|
/* |
|
20
|
|
|
|
|
|
|
XS words: |
|
21
|
|
|
|
|
|
|
SvROK = scalar value reference is OK |
|
22
|
|
|
|
|
|
|
*/ |
|
23
|
|
|
|
|
|
|
/* sample(): private splitmix64 PRNG |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
sample() gets its own PRNG state, completely separate from Drand01. |
|
26
|
|
|
|
|
|
|
That means generate_binomial(), ruif(), rbinom(), and every other caller |
|
27
|
|
|
|
|
|
|
of Drand01() are unaffected — their streams are never advanced or reseeded |
|
28
|
|
|
|
|
|
|
by anything sample() does. |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Seeding is lazy (first call) and reads from /dev/urandom; falls back to |
|
31
|
|
|
|
|
|
|
time()^PID on systems without it. No aTHX needed: all calls are plain C. |
|
32
|
|
|
|
|
|
|
PERL_NO_GET_CONTEXT is therefore not a concern here. */ |
|
33
|
|
|
|
|
|
|
static uint64_t sample__state = 0; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
PERL_STATIC_INLINE uint64_t |
|
36
|
|
|
|
|
|
|
sample__mix64(void) |
|
37
|
|
|
|
|
|
|
{ |
|
38
|
|
|
|
|
|
|
uint64_t z = (sample__state += UINT64_C(0x9e3779b97f4a7c15)); |
|
39
|
|
|
|
|
|
|
z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9); |
|
40
|
|
|
|
|
|
|
z = (z ^ (z >> 27)) * UINT64_C(0x94d049bb133111eb); |
|
41
|
|
|
|
|
|
|
return z ^ (z >> 31); |
|
42
|
|
|
|
|
|
|
} |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
/* * Helper function to increment the count for a given SV. |
|
45
|
|
|
|
|
|
|
* Skips NULL or Undefined values as requested. */ |
|
46
|
26
|
|
|
|
|
|
static void increment_count(pTHX_ HV* counts_hv, SV* val) { |
|
47
|
|
|
|
|
|
|
/* Skip null pointers or undef (non-OK) values */ |
|
48
|
26
|
50
|
|
|
|
|
if (!val || !SvOK(val)) return; |
|
|
|
50
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
STRLEN len; |
|
50
|
|
|
|
|
|
|
// SvPV forces stringification (so numbers become string keys) |
|
51
|
26
|
|
|
|
|
|
char*restrict str = SvPV(val, len); |
|
52
|
|
|
|
|
|
|
// hv_fetch with lval=1 creates the key if it doesn't exist |
|
53
|
26
|
|
|
|
|
|
SV**restrict svp = hv_fetch(counts_hv, str, len, 1); |
|
54
|
26
|
50
|
|
|
|
|
if (svp) { |
|
55
|
26
|
100
|
|
|
|
|
if (!SvOK(*svp)) { |
|
56
|
17
|
|
|
|
|
|
sv_setuv(*svp, 1);// Initialize count to 1 as an Unsigned Value (UV) |
|
57
|
|
|
|
|
|
|
} else { |
|
58
|
9
|
|
|
|
|
|
sv_setuv(*svp, SvUV(*svp) + 1);// Increment existing Unsigned Value |
|
59
|
|
|
|
|
|
|
} |
|
60
|
|
|
|
|
|
|
} |
|
61
|
|
|
|
|
|
|
} |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
// Uniform integer in [0, upper) — rejection loop, no modulo bias |
|
64
|
|
|
|
|
|
|
PERL_STATIC_INLINE size_t |
|
65
|
|
|
|
|
|
|
sample__rand(size_t upper) { |
|
66
|
|
|
|
|
|
|
const uint64_t u = (uint64_t)upper; |
|
67
|
|
|
|
|
|
|
const uint64_t t = (uint64_t)(-(uint64_t)u) % u; |
|
68
|
|
|
|
|
|
|
uint64_t r; |
|
69
|
|
|
|
|
|
|
do { r = sample__mix64(); } while (r < t); |
|
70
|
|
|
|
|
|
|
return (size_t)(r % u); |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
// end sample() private PRNG |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
// Ensure Perl's PRNG is seeded, matching the lazy-evaluation of Perl's rand() |
|
75
|
|
|
|
|
|
|
#define AUTO_SEED_PRNG() \ |
|
76
|
|
|
|
|
|
|
do { \ |
|
77
|
|
|
|
|
|
|
if (!PL_srand_called) { \ |
|
78
|
|
|
|
|
|
|
(void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); \ |
|
79
|
|
|
|
|
|
|
PL_srand_called = TRUE; \ |
|
80
|
|
|
|
|
|
|
} \ |
|
81
|
|
|
|
|
|
|
} while (0) |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
// Helpers for Random Number Generation |
|
84
|
|
|
|
|
|
|
#ifndef M_PI |
|
85
|
|
|
|
|
|
|
#define M_PI 3.14159265358979323846 |
|
86
|
|
|
|
|
|
|
#endif |
|
87
|
|
|
|
|
|
|
// C helper for EXACT Non-central T-distribution CDF via Numerical Integration. |
|
88
|
|
|
|
|
|
|
// This perfectly replicates R's pt(..., ncp) exactness without requiring complex Beta functions. |
|
89
|
229
|
|
|
|
|
|
static double exact_pnt(double t, double df, double ncp) { |
|
90
|
229
|
50
|
|
|
|
|
if (df <= 0.0) return 0.0; |
|
91
|
229
|
|
|
|
|
|
unsigned short int n_steps = 30000; |
|
92
|
229
|
|
|
|
|
|
double step = 1.0 / n_steps; |
|
93
|
229
|
|
|
|
|
|
double integral = 0.0, half_df = df / 2.0; |
|
94
|
229
|
|
|
|
|
|
double log_coef = log(2.0) + half_df * log(half_df) - lgamma(half_df); |
|
95
|
229
|
|
|
|
|
|
double root_half = 0.70710678118654752440; // 1 / sqrt(2) |
|
96
|
6870000
|
100
|
|
|
|
|
for (unsigned short i = 1; i < n_steps; i++) { |
|
97
|
6869771
|
|
|
|
|
|
double u = i * step; |
|
98
|
6869771
|
|
|
|
|
|
double w = u / (1.0 - u); |
|
99
|
|
|
|
|
|
|
// Scaled Chi-distribution log-density |
|
100
|
6869771
|
|
|
|
|
|
double log_M = log_coef + (df - 1.0) * log(w) - half_df * w * w; |
|
101
|
6869771
|
|
|
|
|
|
double M = exp(log_M); |
|
102
|
|
|
|
|
|
|
// Exact Normal CDF using the C standard library's erfc function |
|
103
|
6869771
|
|
|
|
|
|
double z = t * w - ncp; |
|
104
|
6869771
|
|
|
|
|
|
double pnorm_val = 0.5 * erfc(-z * root_half); |
|
105
|
6869771
|
100
|
|
|
|
|
double weight = (i % 2 != 0) ? 4.0 : 2.0; |
|
106
|
6869771
|
|
|
|
|
|
integral += weight * (pnorm_val * M / ((1.0 - u) * (1.0 - u))); |
|
107
|
|
|
|
|
|
|
} |
|
108
|
229
|
|
|
|
|
|
return integral * (step / 3.0); |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
// --- Math Helpers for P-values and Confidence Intervals --- |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
// Ranking helper with tie adjustment (matches R's tie handling) |
|
113
|
|
|
|
|
|
|
typedef struct { double val; size_t idx; double rank; } RankInfo; |
|
114
|
75
|
|
|
|
|
|
static int compare_rank(const void *restrict a, const void *restrict b) { |
|
115
|
75
|
|
|
|
|
|
double diff = ((RankInfo*)a)->val - ((RankInfo*)b)->val; |
|
116
|
75
|
|
|
|
|
|
return (diff > 0) - (diff < 0); |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
|
|
119
|
75
|
|
|
|
|
|
static int compare_index(const void *restrict a, const void *restrict b) { |
|
120
|
75
|
|
|
|
|
|
return ((RankInfo*)a)->idx - ((RankInfo*)b)->idx; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
6
|
|
|
|
|
|
static void compute_ranks(double *restrict data, double *restrict ranks, size_t n) { |
|
124
|
6
|
|
|
|
|
|
RankInfo *restrict items = safemalloc(n * sizeof(RankInfo)); |
|
125
|
56
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
126
|
50
|
|
|
|
|
|
items[i].val = data[i]; |
|
127
|
50
|
|
|
|
|
|
items[i].idx = i; |
|
128
|
|
|
|
|
|
|
} |
|
129
|
6
|
|
|
|
|
|
qsort(items, n, sizeof(RankInfo), compare_rank); |
|
130
|
|
|
|
|
|
|
// Handle ties by averaging ranks |
|
131
|
56
|
100
|
|
|
|
|
for (size_t i = 0; i < n; ) { |
|
132
|
50
|
|
|
|
|
|
size_t j = i + 1; |
|
133
|
50
|
100
|
|
|
|
|
while (j < n && items[j].val == items[i].val) j++; |
|
|
|
50
|
|
|
|
|
|
|
134
|
50
|
|
|
|
|
|
double avg_rank = (i + 1 + j) / 2.0; |
|
135
|
100
|
100
|
|
|
|
|
for (size_t k = i; k < j; k++) items[k].rank = avg_rank; |
|
136
|
50
|
|
|
|
|
|
i = j; |
|
137
|
|
|
|
|
|
|
} |
|
138
|
6
|
|
|
|
|
|
qsort(items, n, sizeof(RankInfo), compare_index); |
|
139
|
56
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) ranks[i] = items[i].rank; |
|
140
|
6
|
|
|
|
|
|
Safefree(items); |
|
141
|
6
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
// Generates a single binomial random variate. |
|
143
|
|
|
|
|
|
|
//Uses the standard Bernoulli trial loop. Drand01() taps into Perl's PRNG. |
|
144
|
20499
|
|
|
|
|
|
static size_t generate_binomial(pTHX_ const size_t size, const double prob) { |
|
145
|
20499
|
100
|
|
|
|
|
if (prob <= 0.0) return 0; |
|
146
|
20399
|
100
|
|
|
|
|
if (prob >= 1.0) return size; |
|
147
|
|
|
|
|
|
|
|
|
148
|
20299
|
|
|
|
|
|
size_t successes = 0; |
|
149
|
312290
|
100
|
|
|
|
|
for (size_t i = 0; i < size; i++) { |
|
150
|
291991
|
100
|
|
|
|
|
if (Drand01() <= prob) successes++; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
20299
|
|
|
|
|
|
return successes; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
#define FT_EPS 2.220446049250313e-16 |
|
156
|
|
|
|
|
|
|
#define FT_TOL 0.0001220703125 /* .Machine$double.eps^0.25, R uniroot default */ |
|
157
|
|
|
|
|
|
|
|
|
158
|
198
|
|
|
|
|
|
static double ft_lchoose(long n, long k) { |
|
159
|
198
|
50
|
|
|
|
|
if (k < 0 || k > n || n < 0) return -INFINITY; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
160
|
198
|
|
|
|
|
|
return lgamma((double)n + 1) - lgamma((double)k + 1) - lgamma((double)(n - k) + 1); |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
typedef struct { |
|
164
|
|
|
|
|
|
|
long lo, hi, ns, m, n, k, x; |
|
165
|
|
|
|
|
|
|
double *logdc; /* central log hypergeometric density over the support */ |
|
166
|
|
|
|
|
|
|
} ft_support; |
|
167
|
|
|
|
|
|
|
|
|
168
|
10
|
|
|
|
|
|
static int ft_init(ft_support *S, long a, long b, long c, long d) { |
|
169
|
10
|
|
|
|
|
|
S->m = a + c; S->n = b + d; S->k = a + b; S->x = a; |
|
170
|
10
|
|
|
|
|
|
S->lo = (S->k - S->n > 0) ? (S->k - S->n) : 0; |
|
171
|
10
|
|
|
|
|
|
S->hi = (S->k < S->m) ? S->k : S->m; |
|
172
|
10
|
|
|
|
|
|
S->ns = S->hi - S->lo + 1; |
|
173
|
10
|
50
|
|
|
|
|
if (S->ns <= 0) { S->logdc = NULL; return 0; } |
|
174
|
10
|
50
|
|
|
|
|
Newx(S->logdc, S->ns, double); |
|
175
|
76
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) { |
|
176
|
66
|
|
|
|
|
|
long j = S->lo + i; |
|
177
|
66
|
|
|
|
|
|
S->logdc[i] = ft_lchoose(S->m, j) + ft_lchoose(S->n, S->k - j) |
|
178
|
66
|
|
|
|
|
|
- ft_lchoose(S->m + S->n, S->k); |
|
179
|
|
|
|
|
|
|
} |
|
180
|
10
|
|
|
|
|
|
return 1; |
|
181
|
|
|
|
|
|
|
} |
|
182
|
10
|
|
|
|
|
|
static void ft_free(ft_support *S) { Safefree(S->logdc); S->logdc = NULL; } |
|
183
|
|
|
|
|
|
|
|
|
184
|
90
|
|
|
|
|
|
static void ft_dnhyper(const ft_support *S, double ncp, double *out) { |
|
185
|
90
|
|
|
|
|
|
double lncp = log(ncp), mx = -INFINITY; |
|
186
|
780
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) { |
|
187
|
690
|
|
|
|
|
|
out[i] = S->logdc[i] + lncp * (double)(S->lo + i); |
|
188
|
690
|
100
|
|
|
|
|
if (out[i] > mx) mx = out[i]; |
|
189
|
|
|
|
|
|
|
} |
|
190
|
90
|
|
|
|
|
|
double s = 0; |
|
191
|
780
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) { out[i] = exp(out[i] - mx); s += out[i]; } |
|
192
|
780
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) out[i] /= s; |
|
193
|
90
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
37
|
|
|
|
|
|
static double ft_mnhyper(const ft_support *S, double ncp, double *scratch) { |
|
196
|
37
|
50
|
|
|
|
|
if (ncp == 0) return (double)S->lo; |
|
197
|
37
|
50
|
|
|
|
|
if (isinf(ncp)) return (double)S->hi; |
|
198
|
37
|
|
|
|
|
|
ft_dnhyper(S, ncp, scratch); |
|
199
|
37
|
|
|
|
|
|
double mu = 0; |
|
200
|
302
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) mu += (double)(S->lo + i) * scratch[i]; |
|
201
|
37
|
|
|
|
|
|
return mu; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
/* upper != 0 => P(X >= q), upper == 0 => P(X <= q) */ |
|
205
|
68
|
|
|
|
|
|
static double ft_pnhyper(const ft_support *S, long q, double ncp, int upper, double *scratch) { |
|
206
|
68
|
100
|
|
|
|
|
if (ncp == 1.0) { |
|
207
|
16
|
|
|
|
|
|
double s = 0; |
|
208
|
128
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) { |
|
209
|
112
|
|
|
|
|
|
long j = S->lo + i; |
|
210
|
112
|
100
|
|
|
|
|
if (upper ? (j >= q) : (j <= q)) s += exp(S->logdc[i]); |
|
|
|
100
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
} |
|
212
|
16
|
|
|
|
|
|
return s; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
52
|
100
|
|
|
|
|
if (ncp == 0.0) return upper ? (double)(q <= S->lo) : (double)(q >= S->lo); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
215
|
50
|
50
|
|
|
|
|
if (isinf(ncp)) return upper ? (double)(q <= S->hi) : (double)(q >= S->hi); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
216
|
50
|
|
|
|
|
|
ft_dnhyper(S, ncp, scratch); |
|
217
|
50
|
|
|
|
|
|
double s = 0; |
|
218
|
452
|
100
|
|
|
|
|
for (long i = 0; i < S->ns; i++) { |
|
219
|
402
|
|
|
|
|
|
long j = S->lo + i; |
|
220
|
402
|
100
|
|
|
|
|
if (upper ? (j >= q) : (j <= q)) s += scratch[i]; |
|
|
|
100
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
} |
|
222
|
50
|
|
|
|
|
|
return s; |
|
223
|
|
|
|
|
|
|
} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
/* R's src/library/stats/src/zeroin.c (Brent-Dekker) */ |
|
226
|
|
|
|
|
|
|
typedef double (*ft_fn)(double t, void *ctx); |
|
227
|
11
|
|
|
|
|
|
static double ft_zeroin(double ax, double bx, ft_fn f, void *ctx, double tol, int maxit) { |
|
228
|
11
|
|
|
|
|
|
double a = ax, b = bx, fa = f(a, ctx), fb = f(b, ctx), c = a, fc = fa; |
|
229
|
81
|
50
|
|
|
|
|
while (maxit-- > 0) { |
|
230
|
81
|
|
|
|
|
|
double prev = b - a; |
|
231
|
81
|
100
|
|
|
|
|
if (fabs(fc) < fabs(fb)) { a = b; b = c; c = a; fa = fb; fb = fc; fc = fa; } |
|
232
|
81
|
|
|
|
|
|
double tol_act = 2 * FT_EPS * fabs(b) + tol / 2; |
|
233
|
81
|
|
|
|
|
|
double step = (c - b) / 2; |
|
234
|
81
|
100
|
|
|
|
|
if (fabs(step) <= tol_act || fb == 0.0) return b; |
|
|
|
50
|
|
|
|
|
|
|
235
|
70
|
50
|
|
|
|
|
if (fabs(prev) >= tol_act && fabs(fa) > fabs(fb)) { |
|
|
|
50
|
|
|
|
|
|
|
236
|
70
|
|
|
|
|
|
double cb = c - b, p, q; |
|
237
|
70
|
100
|
|
|
|
|
if (a == c) { double t1 = fb / fa; p = cb * t1; q = 1.0 - t1; } |
|
238
|
|
|
|
|
|
|
else { |
|
239
|
25
|
|
|
|
|
|
double q0 = fa / fc, t1 = fb / fc, t2 = fb / fa; |
|
240
|
25
|
|
|
|
|
|
p = t2 * (cb * q0 * (q0 - t1) - (b - a) * (t1 - 1.0)); |
|
241
|
25
|
|
|
|
|
|
q = (q0 - 1.0) * (t1 - 1.0) * (t2 - 1.0); |
|
242
|
|
|
|
|
|
|
} |
|
243
|
70
|
100
|
|
|
|
|
if (p > 0) q = -q; else p = -p; |
|
244
|
70
|
100
|
|
|
|
|
if (p < 0.75 * cb * q - fabs(tol_act * q) / 2 && p < fabs(prev * q / 2)) step = p / q; |
|
|
|
100
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
} |
|
246
|
70
|
100
|
|
|
|
|
if (fabs(step) < tol_act) step = step > 0 ? tol_act : -tol_act; |
|
|
|
100
|
|
|
|
|
|
|
247
|
70
|
|
|
|
|
|
a = b; fa = fb; b += step; fb = f(b, ctx); |
|
248
|
70
|
100
|
|
|
|
|
if ((fb > 0) == (fc > 0)) { c = a; fc = fa; } |
|
249
|
|
|
|
|
|
|
} |
|
250
|
0
|
|
|
|
|
|
return b; |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
typedef struct { const ft_support *S; double target; double *scratch; int mode; } ft_rc; |
|
254
|
|
|
|
|
|
|
/* mode 0: mnhyper(t)-target 1: mnhyper(1/t)-target |
|
255
|
|
|
|
|
|
|
mode 2: pnhyper(x,t,low)-tgt 3: pnhyper(x,1/t,low)-tgt |
|
256
|
|
|
|
|
|
|
mode 4: pnhyper(x,t,up)-tgt 5: pnhyper(x,1/t,up)-tgt */ |
|
257
|
92
|
|
|
|
|
|
static double ft_rootf(double t, void *ctx) { |
|
258
|
92
|
|
|
|
|
|
ft_rc *r = (ft_rc *)ctx; const ft_support *S = r->S; |
|
259
|
92
|
|
|
|
|
|
switch (r->mode) { |
|
260
|
0
|
|
|
|
|
|
case 0: return ft_mnhyper(S, t, r->scratch) - r->target; |
|
261
|
33
|
|
|
|
|
|
case 1: return ft_mnhyper(S, 1.0 / t, r->scratch) - r->target; |
|
262
|
0
|
|
|
|
|
|
case 2: return ft_pnhyper(S, S->x, t, 0, r->scratch) - r->target; |
|
263
|
22
|
|
|
|
|
|
case 3: return ft_pnhyper(S, S->x, 1.0 / t, 0, r->scratch) - r->target; |
|
264
|
17
|
|
|
|
|
|
case 4: return ft_pnhyper(S, S->x, t, 1, r->scratch) - r->target; |
|
265
|
20
|
|
|
|
|
|
default:return ft_pnhyper(S, S->x, 1.0 / t, 1, r->scratch) - r->target; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
|
|
269
|
5
|
|
|
|
|
|
static double exact_p_value(long a, long b, long c, long d, const char *alt) { |
|
270
|
|
|
|
|
|
|
ft_support S; |
|
271
|
5
|
50
|
|
|
|
|
if (!ft_init(&S, a, b, c, d)) return 1.0; |
|
272
|
5
|
50
|
|
|
|
|
double *restrict sc; Newx(sc, S.ns, double); |
|
273
|
|
|
|
|
|
|
double p; |
|
274
|
5
|
100
|
|
|
|
|
if (!strcmp(alt, "less")) p = ft_pnhyper(&S, S.x, 1.0, 0, sc); |
|
275
|
4
|
100
|
|
|
|
|
else if (!strcmp(alt, "greater")) p = ft_pnhyper(&S, S.x, 1.0, 1, sc); |
|
276
|
|
|
|
|
|
|
else { |
|
277
|
3
|
|
|
|
|
|
ft_dnhyper(&S, 1.0, sc); |
|
278
|
3
|
|
|
|
|
|
double dx = sc[S.x - S.lo], relErr = 1 + 1e-7, s = 0; |
|
279
|
26
|
100
|
|
|
|
|
for (long i = 0; i < S.ns; i++) if (sc[i] <= dx * relErr) s += sc[i]; |
|
|
|
100
|
|
|
|
|
|
|
280
|
3
|
|
|
|
|
|
p = s; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
5
|
50
|
|
|
|
|
if (p < 0) p = 0; if (p > 1) p = 1; |
|
|
|
50
|
|
|
|
|
|
|
283
|
5
|
|
|
|
|
|
Safefree(sc); ft_free(&S); |
|
284
|
5
|
|
|
|
|
|
return p; |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
5
|
|
|
|
|
|
static void calculate_exact_stats(long a, long b, long c, long d, double conf, |
|
288
|
|
|
|
|
|
|
const char *alt, double *orp, double *lop, double *hip) { |
|
289
|
|
|
|
|
|
|
ft_support S; |
|
290
|
5
|
50
|
|
|
|
|
if (!ft_init(&S, a, b, c, d)) { *orp = NAN; *lop = NAN; *hip = NAN; return; } |
|
291
|
5
|
50
|
|
|
|
|
double *restrict sc; Newx(sc, S.ns, double); |
|
292
|
5
|
|
|
|
|
|
long x = S.x, lo = S.lo, hi = S.hi; |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
/* conditional MLE of the odds ratio */ |
|
295
|
|
|
|
|
|
|
double est; |
|
296
|
5
|
50
|
|
|
|
|
if (x == lo) est = 0.0; |
|
297
|
5
|
100
|
|
|
|
|
else if (x == hi) est = INFINITY; |
|
298
|
|
|
|
|
|
|
else { |
|
299
|
4
|
|
|
|
|
|
double mu = ft_mnhyper(&S, 1.0, sc); |
|
300
|
4
|
|
|
|
|
|
ft_rc r = { &S, (double)x, sc, 0 }; |
|
301
|
4
|
50
|
|
|
|
|
if (mu > x) { r.mode = 0; est = ft_zeroin(0, 1, ft_rootf, &r, FT_TOL, 1000); } |
|
302
|
4
|
50
|
|
|
|
|
else if (mu < x) { r.mode = 1; est = 1.0 / ft_zeroin(FT_EPS, 1, ft_rootf, &r, FT_TOL, 1000); } |
|
303
|
0
|
|
|
|
|
|
else est = 1.0; |
|
304
|
|
|
|
|
|
|
} |
|
305
|
5
|
|
|
|
|
|
*orp = est; |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
/* confidence interval via inversion of the noncentral hypergeometric */ |
|
308
|
|
|
|
|
|
|
double clo, chi; |
|
309
|
5
|
|
|
|
|
|
ft_rc r = { &S, 0, sc, 0 }; |
|
310
|
|
|
|
|
|
|
#define FT_NCP_L(alpha, dst) do { \ |
|
311
|
|
|
|
|
|
|
if (x == lo) { dst = 0.0; } else { \ |
|
312
|
|
|
|
|
|
|
double p = ft_pnhyper(&S, x, 1.0, 1, sc); \ |
|
313
|
|
|
|
|
|
|
if (p > (alpha)) { r.mode = 4; r.target = (alpha); dst = ft_zeroin(0, 1, ft_rootf, &r, FT_TOL, 1000); } \ |
|
314
|
|
|
|
|
|
|
else if (p < (alpha)) { r.mode = 5; r.target = (alpha); dst = 1.0 / ft_zeroin(FT_EPS, 1, ft_rootf, &r, FT_TOL, 1000); } \ |
|
315
|
|
|
|
|
|
|
else dst = 1.0; } } while (0) |
|
316
|
|
|
|
|
|
|
#define FT_NCP_U(alpha, dst) do { \ |
|
317
|
|
|
|
|
|
|
if (x == hi) { dst = INFINITY; } else { \ |
|
318
|
|
|
|
|
|
|
double p = ft_pnhyper(&S, x, 1.0, 0, sc); \ |
|
319
|
|
|
|
|
|
|
if (p < (alpha)) { r.mode = 2; r.target = (alpha); dst = ft_zeroin(0, 1, ft_rootf, &r, FT_TOL, 1000); } \ |
|
320
|
|
|
|
|
|
|
else if (p > (alpha)) { r.mode = 3; r.target = (alpha); dst = 1.0 / ft_zeroin(FT_EPS, 1, ft_rootf, &r, FT_TOL, 1000); } \ |
|
321
|
|
|
|
|
|
|
else dst = 1.0; } } while (0) |
|
322
|
|
|
|
|
|
|
|
|
323
|
5
|
100
|
|
|
|
|
if (!strcmp(alt, "less")) { clo = 0.0; FT_NCP_U(1 - conf, chi); } |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
324
|
4
|
100
|
|
|
|
|
else if (!strcmp(alt, "greater")) { FT_NCP_L(1 - conf, clo); chi = INFINITY; } |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
325
|
3
|
50
|
|
|
|
|
else { double al = (1 - conf) / 2; FT_NCP_L(al, clo); FT_NCP_U(al, chi); } |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
|
|
327
|
5
|
|
|
|
|
|
*lop = clo; *hip = chi; |
|
328
|
5
|
|
|
|
|
|
Safefree(sc); ft_free(&S); |
|
329
|
|
|
|
|
|
|
} |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
// small helper: fetch a nonnegative integer cell from an SV, with validation |
|
332
|
20
|
|
|
|
|
|
static long ft_cell(pTHX_ SV *sv, const char *what) { |
|
333
|
20
|
50
|
|
|
|
|
if (!sv || !SvOK(sv)) croak("fisher_test: %s is undef", what); |
|
|
|
50
|
|
|
|
|
|
|
334
|
20
|
50
|
|
|
|
|
if (!looks_like_number(sv)) croak("fisher_test: %s is not a number", what); |
|
335
|
20
|
|
|
|
|
|
IV v = SvIV(sv); |
|
336
|
20
|
50
|
|
|
|
|
if (v < 0) croak("fisher_test: %s must be nonnegative (got %" IVdf ")", what, v); |
|
337
|
20
|
|
|
|
|
|
return (long)v; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
/*Helpers for lm Linear Regression: OLS Matrix Math & Formula Parsing |
|
341
|
|
|
|
|
|
|
* ----------------------------------------------------------------------- |
|
342
|
|
|
|
|
|
|
Sweep operator for symmetric positive-definite matrices (e.g., XtX). |
|
343
|
|
|
|
|
|
|
This gracefully handles collinearity by bypassing aliased columns. |
|
344
|
|
|
|
|
|
|
Utilizes a relative tolerance check to prevent dropping micro-variance features.*/ |
|
345
|
70
|
|
|
|
|
|
static int sweep_matrix_ols(double *restrict A, size_t n, bool *restrict aliased) { |
|
346
|
70
|
|
|
|
|
|
int rank = 0; |
|
347
|
70
|
|
|
|
|
|
double *restrict orig_diag = (double*)safemalloc(n * sizeof(double)); |
|
348
|
|
|
|
|
|
|
// Save the original diagonal values to use as a baseline for relative variance |
|
349
|
246
|
100
|
|
|
|
|
for (size_t k = 0; k < n; k++) { |
|
350
|
176
|
|
|
|
|
|
aliased[k] = FALSE; |
|
351
|
176
|
|
|
|
|
|
orig_diag[k] = A[k * n + k]; |
|
352
|
|
|
|
|
|
|
} |
|
353
|
246
|
100
|
|
|
|
|
for (size_t k = 0; k < n; k++) { |
|
354
|
|
|
|
|
|
|
// Check pivot for collinearity using a RELATIVE tolerance |
|
355
|
|
|
|
|
|
|
// (Fallback to a tiny absolute tolerance of 1e-24 to catch literal zero vectors) |
|
356
|
176
|
100
|
|
|
|
|
if (fabs(A[k * n + k]) <= 1e-10 * orig_diag[k] || fabs(A[k * n + k]) < 1e-24) { |
|
|
|
50
|
|
|
|
|
|
|
357
|
1
|
|
|
|
|
|
aliased[k] = TRUE; |
|
358
|
|
|
|
|
|
|
// Isolate this column so it doesn't affect the rest of the matrix |
|
359
|
4
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
360
|
3
|
|
|
|
|
|
A[k * n + i] = 0.0; |
|
361
|
3
|
|
|
|
|
|
A[i * n + k] = 0.0; |
|
362
|
|
|
|
|
|
|
} |
|
363
|
1
|
|
|
|
|
|
continue; |
|
364
|
|
|
|
|
|
|
} |
|
365
|
175
|
|
|
|
|
|
rank++; |
|
366
|
175
|
|
|
|
|
|
double pivot = 1.0 / A[k * n + k]; |
|
367
|
175
|
|
|
|
|
|
A[k * n + k] = 1.0; |
|
368
|
640
|
100
|
|
|
|
|
for (size_t j = 0; j < n; j++) A[k * n + j] *= pivot; |
|
369
|
640
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
370
|
465
|
100
|
|
|
|
|
if (i != k && A[i * n + k] != 0.0) { |
|
|
|
100
|
|
|
|
|
|
|
371
|
284
|
|
|
|
|
|
double factor = A[i * n + k]; |
|
372
|
284
|
|
|
|
|
|
A[i * n + k] = 0.0; |
|
373
|
1090
|
100
|
|
|
|
|
for (size_t j = 0; j < n; j++) { |
|
374
|
806
|
|
|
|
|
|
A[i * n + j] -= factor * A[k * n + j]; |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
} |
|
378
|
|
|
|
|
|
|
} |
|
379
|
70
|
|
|
|
|
|
Safefree(orig_diag); |
|
380
|
70
|
|
|
|
|
|
return rank; |
|
381
|
|
|
|
|
|
|
} |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
// Internal extractor resolving single data values. Returns NAN on missing or non-numeric. |
|
384
|
1805
|
|
|
|
|
|
static double get_data_value(pTHX_ HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict var) { |
|
385
|
1805
|
|
|
|
|
|
SV **restrict val = NULL; |
|
386
|
1805
|
100
|
|
|
|
|
if (row_hashes) { |
|
387
|
1184
|
|
|
|
|
|
val = hv_fetch(row_hashes[i], var, strlen(var), 0); |
|
388
|
1184
|
50
|
|
|
|
|
if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
389
|
1184
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*val); |
|
390
|
1184
|
|
|
|
|
|
val = av_fetch(av, 0, 0); |
|
391
|
|
|
|
|
|
|
} |
|
392
|
621
|
50
|
|
|
|
|
} else if (data_hoa) { |
|
393
|
621
|
|
|
|
|
|
SV**restrict col = hv_fetch(data_hoa, var, strlen(var), 0); |
|
394
|
621
|
50
|
|
|
|
|
if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
395
|
621
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*col); |
|
396
|
621
|
|
|
|
|
|
val = av_fetch(av, i, 0); |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
} |
|
399
|
1805
|
50
|
|
|
|
|
if (val && SvOK(*val)) { |
|
|
|
100
|
|
|
|
|
|
|
400
|
1802
|
100
|
|
|
|
|
if (looks_like_number(*val)) return SvNV(*val); |
|
401
|
49
|
|
|
|
|
|
return NAN; // Catch strings like "blue" |
|
402
|
|
|
|
|
|
|
} |
|
403
|
3
|
|
|
|
|
|
return NAN; // Catch undef/missing keys |
|
404
|
|
|
|
|
|
|
} |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
// Helper: Get all available columns for the '.' operator expansion |
|
407
|
9
|
|
|
|
|
|
static AV* get_all_columns(pTHX_ HV *restrict data_hoa, HV **restrict row_hashes, size_t n) { |
|
408
|
9
|
|
|
|
|
|
AV *restrict cols = newAV(); |
|
409
|
9
|
50
|
|
|
|
|
if (data_hoa) { |
|
410
|
9
|
|
|
|
|
|
hv_iterinit(data_hoa); |
|
411
|
|
|
|
|
|
|
HE *restrict entry; |
|
412
|
33
|
100
|
|
|
|
|
while ((entry = hv_iternext(data_hoa))) { |
|
413
|
24
|
|
|
|
|
|
av_push(cols, newSVsv(hv_iterkeysv(entry))); |
|
414
|
|
|
|
|
|
|
} |
|
415
|
0
|
0
|
|
|
|
|
} else if (row_hashes && n > 0 && row_hashes[0]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
416
|
0
|
|
|
|
|
|
hv_iterinit(row_hashes[0]); |
|
417
|
|
|
|
|
|
|
HE *restrict entry; |
|
418
|
0
|
0
|
|
|
|
|
while ((entry = hv_iternext(row_hashes[0]))) { |
|
419
|
0
|
|
|
|
|
|
av_push(cols, newSVsv(hv_iterkeysv(entry))); |
|
420
|
|
|
|
|
|
|
} |
|
421
|
|
|
|
|
|
|
} |
|
422
|
9
|
|
|
|
|
|
return cols; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
// Recursive formula resolver with tightened NaN and Null handling |
|
426
|
1837
|
|
|
|
|
|
static double evaluate_term(pTHX_ HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict term) { |
|
427
|
1837
|
50
|
|
|
|
|
if (!term || term[0] == '\0') return NAN; |
|
|
|
50
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
|
|
429
|
1837
|
|
|
|
|
|
char *restrict term_cpy = savepv(term); |
|
430
|
1837
|
|
|
|
|
|
char *restrict colon = strchr(term_cpy, ':'); |
|
431
|
1837
|
100
|
|
|
|
|
if (colon) { |
|
432
|
32
|
|
|
|
|
|
*colon = '\0'; |
|
433
|
32
|
|
|
|
|
|
double left = evaluate_term(aTHX_ data_hoa, row_hashes, i, term_cpy); |
|
434
|
32
|
|
|
|
|
|
double right = evaluate_term(aTHX_ data_hoa, row_hashes, i, colon + 1); |
|
435
|
32
|
|
|
|
|
|
Safefree(term_cpy); |
|
436
|
32
|
50
|
|
|
|
|
if (isnan(left) || isnan(right)) return NAN; |
|
|
|
50
|
|
|
|
|
|
|
437
|
32
|
|
|
|
|
|
return left * right; |
|
438
|
|
|
|
|
|
|
} |
|
439
|
1805
|
50
|
|
|
|
|
if (strncmp(term_cpy, "I(", 2) == 0) { |
|
440
|
0
|
|
|
|
|
|
char *restrict end = strrchr(term_cpy, ')'); |
|
441
|
0
|
0
|
|
|
|
|
if (end) *end = '\0'; |
|
442
|
0
|
|
|
|
|
|
char *restrict inner = term_cpy + 2; |
|
443
|
0
|
|
|
|
|
|
char *restrict caret = strchr(inner, '^'); |
|
444
|
0
|
|
|
|
|
|
int power = 1; |
|
445
|
0
|
0
|
|
|
|
|
if (caret) { |
|
446
|
0
|
|
|
|
|
|
*caret = '\0'; |
|
447
|
0
|
|
|
|
|
|
power = atoi(caret + 1); |
|
448
|
|
|
|
|
|
|
} |
|
449
|
0
|
|
|
|
|
|
double v = get_data_value(aTHX_ data_hoa, row_hashes, i, inner); |
|
450
|
0
|
|
|
|
|
|
Safefree(term_cpy); |
|
451
|
|
|
|
|
|
|
|
|
452
|
0
|
0
|
|
|
|
|
if (isnan(v)) return NAN; |
|
453
|
0
|
0
|
|
|
|
|
return power == 1 ? v : pow(v, power); |
|
454
|
|
|
|
|
|
|
} |
|
455
|
1805
|
|
|
|
|
|
double result = get_data_value(aTHX_ data_hoa, row_hashes, i, term_cpy); |
|
456
|
1805
|
|
|
|
|
|
Safefree(term_cpy); |
|
457
|
1805
|
|
|
|
|
|
return result; |
|
458
|
|
|
|
|
|
|
} |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
// Helper to infer column type from its first valid element |
|
461
|
58
|
|
|
|
|
|
static bool is_column_categorical(pTHX_ HV *restrict data_hoa, HV **restrict row_hashes, size_t n, const char *restrict var) { |
|
462
|
90
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
463
|
89
|
|
|
|
|
|
SV **restrict val = NULL; |
|
464
|
89
|
100
|
|
|
|
|
if (row_hashes) { |
|
465
|
55
|
|
|
|
|
|
val = hv_fetch(row_hashes[i], var, strlen(var), 0); |
|
466
|
55
|
100
|
|
|
|
|
if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
467
|
23
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*val); |
|
468
|
23
|
|
|
|
|
|
val = av_fetch(av, 0, 0); |
|
469
|
|
|
|
|
|
|
} |
|
470
|
34
|
50
|
|
|
|
|
} else if (data_hoa) { |
|
471
|
34
|
|
|
|
|
|
SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0); |
|
472
|
34
|
50
|
|
|
|
|
if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
473
|
34
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*col); |
|
474
|
34
|
|
|
|
|
|
val = av_fetch(av, i, 0); |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
} |
|
477
|
89
|
100
|
|
|
|
|
if (val && SvOK(*val)) { |
|
|
|
50
|
|
|
|
|
|
|
478
|
57
|
100
|
|
|
|
|
if (looks_like_number(*val)) return FALSE; // First valid is number -> Numeric Column |
|
479
|
10
|
|
|
|
|
|
return TRUE; // First valid is string -> Categorical Column |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
} |
|
482
|
1
|
|
|
|
|
|
return FALSE; |
|
483
|
|
|
|
|
|
|
} |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
/* Internal extractor resolving single data string values using dynamic allocation. */ |
|
486
|
371
|
|
|
|
|
|
static char* get_data_string_alloc(pTHX_ HV *restrict data_hoa, HV **restrict row_hashes, size_t i, const char *restrict var) { |
|
487
|
371
|
|
|
|
|
|
SV **restrict val = NULL; |
|
488
|
371
|
50
|
|
|
|
|
if (row_hashes) { |
|
489
|
0
|
|
|
|
|
|
val = hv_fetch(row_hashes[i], var, strlen(var), 0); |
|
490
|
0
|
0
|
|
|
|
|
if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
491
|
0
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*val); |
|
492
|
0
|
|
|
|
|
|
val = av_fetch(av, 0, 0); |
|
493
|
|
|
|
|
|
|
} |
|
494
|
371
|
50
|
|
|
|
|
} else if (data_hoa) { |
|
495
|
371
|
|
|
|
|
|
SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0); |
|
496
|
371
|
50
|
|
|
|
|
if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
497
|
371
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*col); |
|
498
|
371
|
|
|
|
|
|
val = av_fetch(av, i, 0); |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
} |
|
501
|
371
|
50
|
|
|
|
|
if (val && SvOK(*val)) { |
|
|
|
50
|
|
|
|
|
|
|
502
|
371
|
|
|
|
|
|
return savepv(SvPV_nolen(*val)); /* Allocates and returns string */ |
|
503
|
|
|
|
|
|
|
} |
|
504
|
0
|
|
|
|
|
|
return NULL; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
// Struct for sorting p-values while remembering their original index |
|
508
|
|
|
|
|
|
|
typedef struct { |
|
509
|
|
|
|
|
|
|
double p; |
|
510
|
|
|
|
|
|
|
size_t orig_idx; |
|
511
|
|
|
|
|
|
|
} PVal; |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
// Comparator for qsort |
|
514
|
1519
|
|
|
|
|
|
static int cmp_pval(const void *restrict a, const void *restrict b) { |
|
515
|
1519
|
|
|
|
|
|
double diff = ((PVal*)a)->p - ((PVal*)b)->p; |
|
516
|
1519
|
100
|
|
|
|
|
if (diff < 0) return -1; |
|
517
|
812
|
50
|
|
|
|
|
if (diff > 0) return 1; |
|
518
|
|
|
|
|
|
|
/* Stabilize sort by falling back to original index */ |
|
519
|
0
|
|
|
|
|
|
return ((PVal*)a)->orig_idx - ((PVal*)b)->orig_idx; |
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------- |
|
522
|
|
|
|
|
|
|
* Helpers for cor(): ranking (Spearman), Pearson r, Kendall tau-b |
|
523
|
|
|
|
|
|
|
* ----------------------------------------------------------------------- */ |
|
524
|
|
|
|
|
|
|
/* Item used to sort values while remembering their original index, |
|
525
|
|
|
|
|
|
|
* needed for average-rank tie-breaking in Spearman correlation. */ |
|
526
|
|
|
|
|
|
|
typedef struct { |
|
527
|
|
|
|
|
|
|
double val; |
|
528
|
|
|
|
|
|
|
size_t idx; |
|
529
|
|
|
|
|
|
|
} RankItem; |
|
530
|
|
|
|
|
|
|
|
|
531
|
57
|
|
|
|
|
|
static int cmp_rank_item(const void *restrict a, const void *restrict b) { |
|
532
|
57
|
|
|
|
|
|
double diff = ((RankItem*)a)->val - ((RankItem*)b)->val; |
|
533
|
57
|
100
|
|
|
|
|
if (diff < 0) return -1; |
|
534
|
4
|
100
|
|
|
|
|
if (diff > 0) return 1; |
|
535
|
1
|
|
|
|
|
|
return 0; |
|
536
|
|
|
|
|
|
|
} |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
/* Compute 1-based average ranks with tie-breaking into out[]. |
|
539
|
|
|
|
|
|
|
* in[] is not modified. */ |
|
540
|
8
|
|
|
|
|
|
static void rank_data(const double *restrict in, double *restrict out, size_t n) { |
|
541
|
|
|
|
|
|
|
RankItem *restrict ri; |
|
542
|
8
|
50
|
|
|
|
|
Newx(ri, n, RankItem); |
|
543
|
56
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { ri[i].val = in[i]; ri[i].idx = i; } |
|
544
|
8
|
|
|
|
|
|
qsort(ri, n, sizeof(RankItem), cmp_rank_item); |
|
545
|
|
|
|
|
|
|
|
|
546
|
8
|
|
|
|
|
|
size_t i = 0; |
|
547
|
55
|
100
|
|
|
|
|
while (i < n) { |
|
548
|
47
|
|
|
|
|
|
size_t j = i; |
|
549
|
|
|
|
|
|
|
/* Find the full extent of this tie group */ |
|
550
|
48
|
100
|
|
|
|
|
while (j + 1 < n && ri[j + 1].val == ri[j].val) j++; |
|
|
|
100
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
/* All members get the average of ranks i+1 … j+1 (1-based) */ |
|
552
|
47
|
|
|
|
|
|
double avg = (double)(i + j) / 2.0 + 1.0; |
|
553
|
95
|
100
|
|
|
|
|
for (size_t k = i; k <= j; k++) out[ri[k].idx] = avg; |
|
554
|
47
|
|
|
|
|
|
i = j + 1; |
|
555
|
|
|
|
|
|
|
} |
|
556
|
8
|
|
|
|
|
|
Safefree(ri); |
|
557
|
8
|
|
|
|
|
|
} |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
/* Pearson product-moment r between two n-element arrays. |
|
560
|
|
|
|
|
|
|
* Returns NAN when either variable has zero variance (matches R). */ |
|
561
|
61
|
|
|
|
|
|
static double pearson_corr(const double *restrict x, const double *restrict y, size_t n) { |
|
562
|
61
|
|
|
|
|
|
double sx = 0, sy = 0, sxy = 0, sx2 = 0, sy2 = 0; |
|
563
|
364
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
564
|
303
|
|
|
|
|
|
sx += x[i]; sy += y[i]; |
|
565
|
303
|
|
|
|
|
|
sxy += x[i]*y[i]; sx2 += x[i]*x[i]; sy2 += y[i]*y[i]; |
|
566
|
|
|
|
|
|
|
} |
|
567
|
61
|
|
|
|
|
|
double num = (double)n * sxy - sx * sy; |
|
568
|
61
|
|
|
|
|
|
double den = sqrt(((double)n * sx2 - sx*sx) * ((double)n * sy2 - sy*sy)); |
|
569
|
61
|
50
|
|
|
|
|
if (den == 0.0) return NAN; |
|
570
|
61
|
|
|
|
|
|
return num / den; |
|
571
|
|
|
|
|
|
|
} |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
/* Kendall's tau-b between two n-element arrays. |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
* tau-b = (C − D) / sqrt((C + D + T_x)(C + D + T_y)) |
|
576
|
|
|
|
|
|
|
* |
|
577
|
|
|
|
|
|
|
* where C = concordant pairs, D = discordant, T_x = pairs tied only on |
|
578
|
|
|
|
|
|
|
* x, T_y = pairs tied only on y. Joint ties (both zero) are excluded |
|
579
|
|
|
|
|
|
|
* from numerator and denominator, matching R's cor(method="kendall"). |
|
580
|
|
|
|
|
|
|
* Returns NAN when the denominator is zero. */ |
|
581
|
1
|
|
|
|
|
|
static double kendall_tau_b(const double *restrict x, const double *restrict y, unsigned int n) { |
|
582
|
1
|
|
|
|
|
|
size_t C = 0, D = 0, tie_x = 0, tie_y = 0; |
|
583
|
9
|
100
|
|
|
|
|
for (size_t i = 0; i < n - 1; i++) { |
|
584
|
44
|
100
|
|
|
|
|
for (size_t j = i + 1; j < n; j++) { |
|
585
|
36
|
|
|
|
|
|
int sx = (x[i] > x[j]) - (x[i] < x[j]); /* sign of x[i]-x[j] */ |
|
586
|
36
|
|
|
|
|
|
int sy = (y[i] > y[j]) - (y[i] < y[j]); |
|
587
|
36
|
100
|
|
|
|
|
if (sx == 0 && sy == 0) { /* joint tie — not counted */ } |
|
|
|
50
|
|
|
|
|
|
|
588
|
36
|
100
|
|
|
|
|
else if (sx == 0) tie_x++; |
|
589
|
35
|
50
|
|
|
|
|
else if (sy == 0) tie_y++; |
|
590
|
35
|
50
|
|
|
|
|
else if (sx == sy) C++; |
|
591
|
0
|
|
|
|
|
|
else D++; |
|
592
|
|
|
|
|
|
|
} |
|
593
|
|
|
|
|
|
|
} |
|
594
|
1
|
|
|
|
|
|
double denom = sqrt((double)(C + D + tie_x) * (double)(C + D + tie_y)); |
|
595
|
1
|
50
|
|
|
|
|
if (denom == 0.0) return NAN; |
|
596
|
1
|
|
|
|
|
|
return (double)(C - D) / denom; |
|
597
|
|
|
|
|
|
|
} |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
/* Single dispatch: compute correlation according to method string. |
|
600
|
|
|
|
|
|
|
* Allocates and frees temporary rank arrays internally for Spearman. */ |
|
601
|
62
|
|
|
|
|
|
static double compute_cor(const double *restrict x, const double *restrict y, |
|
602
|
|
|
|
|
|
|
size_t n, const char *restrict method) { |
|
603
|
62
|
100
|
|
|
|
|
if (strcmp(method, "spearman") == 0) { |
|
604
|
|
|
|
|
|
|
double *restrict rx, *restrict ry; |
|
605
|
3
|
50
|
|
|
|
|
Newx(rx, n, double); Newx(ry, n, double); |
|
|
|
50
|
|
|
|
|
|
|
606
|
3
|
|
|
|
|
|
rank_data(x, rx, n); |
|
607
|
3
|
|
|
|
|
|
rank_data(y, ry, n); |
|
608
|
3
|
|
|
|
|
|
double r = pearson_corr(rx, ry, n); |
|
609
|
3
|
|
|
|
|
|
Safefree(rx); Safefree(ry); |
|
610
|
3
|
|
|
|
|
|
return r; |
|
611
|
|
|
|
|
|
|
} |
|
612
|
59
|
100
|
|
|
|
|
if (strcmp(method, "kendall") == 0) |
|
613
|
1
|
|
|
|
|
|
return kendall_tau_b(x, y, n); |
|
614
|
|
|
|
|
|
|
/* default: pearson */ |
|
615
|
58
|
|
|
|
|
|
return pearson_corr(x, y, n); |
|
616
|
|
|
|
|
|
|
} |
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
// Math macros |
|
619
|
|
|
|
|
|
|
#define MAX_ITER 500 |
|
620
|
|
|
|
|
|
|
#define EPS 3.0e-15 |
|
621
|
|
|
|
|
|
|
#define FPMIN 1.0e-30 |
|
622
|
|
|
|
|
|
|
|
|
623
|
8623
|
|
|
|
|
|
static double _incbeta_cf(double a, double b, double x) { |
|
624
|
|
|
|
|
|
|
int m; |
|
625
|
|
|
|
|
|
|
double aa, c, d, del, h, qab, qam, qap; |
|
626
|
8623
|
|
|
|
|
|
qab = a + b; qap = a + 1.0; qam = a - 1.0; |
|
627
|
8623
|
|
|
|
|
|
c = 1.0; d = 1.0 - qab * x / qap; |
|
628
|
8623
|
50
|
|
|
|
|
if (fabs(d) < FPMIN) d = FPMIN; |
|
629
|
8623
|
|
|
|
|
|
d = 1.0 / d; h = d; |
|
630
|
183201
|
50
|
|
|
|
|
for (m = 1; m <= MAX_ITER; m++) { |
|
631
|
183201
|
|
|
|
|
|
int m2 = 2 * m; |
|
632
|
183201
|
|
|
|
|
|
aa = m * (b - m) * x / ((qam + m2) * (a + m2)); |
|
633
|
183201
|
|
|
|
|
|
d = 1.0 + aa * d; |
|
634
|
183201
|
50
|
|
|
|
|
if (fabs(d) < FPMIN) d = FPMIN; |
|
635
|
183201
|
|
|
|
|
|
c = 1.0 + aa / c; |
|
636
|
183201
|
50
|
|
|
|
|
if (fabs(c) < FPMIN) c = FPMIN; |
|
637
|
183201
|
|
|
|
|
|
d = 1.0 / d; h *= d * c; |
|
638
|
183201
|
|
|
|
|
|
aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2)); |
|
639
|
183201
|
|
|
|
|
|
d = 1.0 + aa * d; |
|
640
|
183201
|
50
|
|
|
|
|
if (fabs(d) < FPMIN) d = FPMIN; |
|
641
|
183201
|
|
|
|
|
|
c = 1.0 + aa / c; |
|
642
|
183201
|
50
|
|
|
|
|
if (fabs(c) < FPMIN) c = FPMIN; |
|
643
|
183201
|
|
|
|
|
|
d = 1.0 / d; del = d * c; h *= del; |
|
644
|
183201
|
100
|
|
|
|
|
if (fabs(del - 1.0) < EPS) break; |
|
645
|
|
|
|
|
|
|
} |
|
646
|
8623
|
|
|
|
|
|
return h; |
|
647
|
|
|
|
|
|
|
} |
|
648
|
|
|
|
|
|
|
|
|
649
|
8669
|
|
|
|
|
|
static double incbeta(double a, double b, double x) { |
|
650
|
8669
|
100
|
|
|
|
|
if (x <= 0.0) return 0.0; |
|
651
|
8664
|
100
|
|
|
|
|
if (x >= 1.0) return 1.0; |
|
652
|
8623
|
|
|
|
|
|
double bt = exp(lgamma(a + b) - lgamma(a) - lgamma(b) + a * log(x) + b * log(1.0 - x)); |
|
653
|
8623
|
100
|
|
|
|
|
if (x < (a + 1.0) / (a + b + 2.0)) return bt * _incbeta_cf(a, b, x) / a; |
|
654
|
1589
|
|
|
|
|
|
return 1.0 - bt * _incbeta_cf(b, a, 1.0 - x) / b; |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
|
|
657
|
8365
|
|
|
|
|
|
static double get_t_pvalue(double t, double df, const char*restrict alt) { |
|
658
|
8365
|
|
|
|
|
|
double x = df / (df + t * t); |
|
659
|
8365
|
|
|
|
|
|
double prob_2tail = incbeta(df / 2.0, 0.5, x); |
|
660
|
8365
|
100
|
|
|
|
|
if (strcmp(alt, "less") == 0) return (t < 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail; |
|
|
|
100
|
|
|
|
|
|
|
661
|
8363
|
100
|
|
|
|
|
if (strcmp(alt, "greater") == 0) return (t > 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail; |
|
|
|
50
|
|
|
|
|
|
|
662
|
115
|
|
|
|
|
|
return prob_2tail; |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
// Bisection algorithm to find the inverse t-distribution (Critical t-value) |
|
666
|
277
|
|
|
|
|
|
static double qt_tail(double df, double p_tail) { |
|
667
|
277
|
|
|
|
|
|
double low = 0.0, high = 1.0; |
|
668
|
|
|
|
|
|
|
// Find upper bound |
|
669
|
661
|
100
|
|
|
|
|
while (get_t_pvalue(high, df, "greater") > p_tail) { |
|
670
|
384
|
|
|
|
|
|
low = high; |
|
671
|
384
|
|
|
|
|
|
high *= 2.0; |
|
672
|
384
|
50
|
|
|
|
|
if (high > 1000000.0) break; /* Fallback limit */ |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
// Bisect to find the root |
|
675
|
7586
|
50
|
|
|
|
|
for (unsigned short int i = 0; i < 100; i++) { |
|
676
|
7586
|
|
|
|
|
|
double mid = (low + high) / 2.0; |
|
677
|
7586
|
|
|
|
|
|
double p_mid = get_t_pvalue(mid, df, "greater"); |
|
678
|
7586
|
100
|
|
|
|
|
if (p_mid > p_tail) { |
|
679
|
3711
|
|
|
|
|
|
low = mid; |
|
680
|
|
|
|
|
|
|
} else { |
|
681
|
3875
|
|
|
|
|
|
high = mid; |
|
682
|
|
|
|
|
|
|
} |
|
683
|
7586
|
100
|
|
|
|
|
if (high - low < 1e-8) break; |
|
684
|
|
|
|
|
|
|
} |
|
685
|
277
|
|
|
|
|
|
return (low + high) / 2.0; |
|
686
|
|
|
|
|
|
|
} |
|
687
|
|
|
|
|
|
|
|
|
688
|
2335
|
|
|
|
|
|
int compare_doubles(const void *restrict a, const void *restrict b) { |
|
689
|
2335
|
|
|
|
|
|
double da = *(const double*restrict)a; |
|
690
|
2335
|
|
|
|
|
|
double db = *(const double*restrict)b; |
|
691
|
2335
|
|
|
|
|
|
return (da > db) - (da < db); |
|
692
|
|
|
|
|
|
|
} |
|
693
|
|
|
|
|
|
|
/* Helper to calculate the number of bins using Sturges' formula: log2(n) + 1 */ |
|
694
|
0
|
|
|
|
|
|
static size_t calculate_sturges_bins(size_t n) { |
|
695
|
0
|
0
|
|
|
|
|
if (n == 0) return 1; |
|
696
|
0
|
|
|
|
|
|
return (size_t)(log((double)n) / log(2.0) + 1.0); |
|
697
|
|
|
|
|
|
|
} |
|
698
|
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
// Logic for distributing data into bins (Optimized to O(N)) |
|
700
|
5
|
|
|
|
|
|
static void compute_hist_logic(double *restrict x, size_t n, double *restrict breaks, size_t n_bins, |
|
701
|
|
|
|
|
|
|
size_t *restrict counts, double *restrict mids, double *restrict density) { |
|
702
|
5
|
|
|
|
|
|
double total_n = (double)n; |
|
703
|
5
|
|
|
|
|
|
double min_val = breaks[0]; |
|
704
|
5
|
50
|
|
|
|
|
double step = (n_bins > 0) ? (breaks[1] - breaks[0]) : 0.0; |
|
705
|
|
|
|
|
|
|
// Initialize counts and compute midpoints |
|
706
|
23
|
100
|
|
|
|
|
for (size_t i = 0; i < n_bins; i++) { |
|
707
|
18
|
|
|
|
|
|
counts[i] = 0; |
|
708
|
18
|
|
|
|
|
|
mids[i] = (breaks[i] + breaks[i+1]) / 2.0; |
|
709
|
|
|
|
|
|
|
} |
|
710
|
|
|
|
|
|
|
// Single O(N) pass to assign elements to bins |
|
711
|
5
|
100
|
|
|
|
|
if (step > 0.0) { |
|
712
|
2017
|
100
|
|
|
|
|
for (size_t j = 0; j < n; j++) { |
|
713
|
2014
|
|
|
|
|
|
double val = x[j]; |
|
714
|
|
|
|
|
|
|
// Ignore out-of-bounds or invalid values |
|
715
|
2014
|
50
|
|
|
|
|
if (isnan(val) || isinf(val) || val < min_val) continue; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
// Calculate initial bin index mathematically |
|
717
|
2014
|
|
|
|
|
|
size_t idx = (size_t)((val - min_val) / step); |
|
718
|
|
|
|
|
|
|
// Clamp to valid array bounds first to prevent overflow */ |
|
719
|
2014
|
100
|
|
|
|
|
if (idx >= n_bins) { |
|
720
|
3
|
|
|
|
|
|
idx = n_bins - 1; |
|
721
|
|
|
|
|
|
|
} |
|
722
|
|
|
|
|
|
|
/* Adjust for exact boundaries (R's right-inclusive default: (a, b]) */ |
|
723
|
|
|
|
|
|
|
/* If value is exactly on or slightly below the lower boundary of the assigned bin, |
|
724
|
|
|
|
|
|
|
it belongs in the previous bin. (First bin [a, b] is inclusive on both ends) */ |
|
725
|
2023
|
100
|
|
|
|
|
while (idx > 0 && val <= breaks[idx]) { |
|
|
|
100
|
|
|
|
|
|
|
726
|
9
|
|
|
|
|
|
idx--; |
|
727
|
|
|
|
|
|
|
} |
|
728
|
|
|
|
|
|
|
// Conversely, if floating-point truncation placed it too low, push it up |
|
729
|
2014
|
100
|
|
|
|
|
while (idx < n_bins - 1 && val > breaks[idx + 1]) { |
|
|
|
50
|
|
|
|
|
|
|
730
|
0
|
|
|
|
|
|
idx++; |
|
731
|
|
|
|
|
|
|
} |
|
732
|
2014
|
|
|
|
|
|
counts[idx]++; |
|
733
|
|
|
|
|
|
|
} |
|
734
|
2
|
50
|
|
|
|
|
} else if (n_bins > 0) { |
|
735
|
|
|
|
|
|
|
// Edge case: All data points have the exact same value (step == 0) |
|
736
|
2
|
|
|
|
|
|
counts[0] = n; |
|
737
|
|
|
|
|
|
|
} |
|
738
|
|
|
|
|
|
|
// Compute densities |
|
739
|
23
|
100
|
|
|
|
|
for (size_t i = 0; i < n_bins; i++) { |
|
740
|
18
|
|
|
|
|
|
double bin_width = breaks[i+1] - breaks[i]; |
|
741
|
18
|
100
|
|
|
|
|
if (bin_width > 0) { |
|
742
|
16
|
|
|
|
|
|
density[i] = (double)counts[i] / (total_n * bin_width); |
|
743
|
|
|
|
|
|
|
} else { |
|
744
|
2
|
50
|
|
|
|
|
density[i] = (n_bins == 1) ? 1.0 : 0.0; |
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
} |
|
747
|
5
|
|
|
|
|
|
} |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
// Standard Normal CDF approximation |
|
750
|
59
|
|
|
|
|
|
double approx_pnorm(double x) { |
|
751
|
59
|
|
|
|
|
|
return 0.5 * erfc(-x * 0.70710678118654752440); // 0.707... = 1/sqrt(2) |
|
752
|
|
|
|
|
|
|
} |
|
753
|
|
|
|
|
|
|
#ifndef M_SQRT1_2 |
|
754
|
|
|
|
|
|
|
#define M_SQRT1_2 0.70710678118654752440 |
|
755
|
|
|
|
|
|
|
#endif |
|
756
|
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
/* Macro for exact Wilcoxon 3D array indexing */ |
|
758
|
|
|
|
|
|
|
#define DP_INDEX(i, j, k, n2, max_u) ((i) * ((n2) + 1) * ((max_u) + 1) + (j) * ((max_u) + 1) + (k)) |
|
759
|
30
|
|
|
|
|
|
static double inverse_normal_cdf(double p) { |
|
760
|
30
|
|
|
|
|
|
double a[4] = {2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637}; |
|
761
|
30
|
|
|
|
|
|
double b[4] = {-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833}; |
|
762
|
30
|
|
|
|
|
|
double c[9] = {0.3374754822726147, 0.9761690190917186, 0.1607979714918209, |
|
763
|
|
|
|
|
|
|
0.0276438810333863, 0.0038405729373609, 0.0003951896511919, |
|
764
|
|
|
|
|
|
|
0.0000321767881768, 0.0000002888167364, 0.0000003960315187}; |
|
765
|
|
|
|
|
|
|
double x, r, y; |
|
766
|
30
|
|
|
|
|
|
y = p - 0.5; |
|
767
|
30
|
100
|
|
|
|
|
if (fabs(y) < 0.42) { |
|
768
|
22
|
|
|
|
|
|
r = y * y; |
|
769
|
22
|
|
|
|
|
|
x = y * (((a[3]*r + a[2])*r + a[1])*r + a[0]) / |
|
770
|
22
|
|
|
|
|
|
((((b[3]*r + b[2])*r + b[1])*r + b[0])*r + 1.0); |
|
771
|
|
|
|
|
|
|
} else { |
|
772
|
8
|
|
|
|
|
|
r = p; |
|
773
|
8
|
100
|
|
|
|
|
if (y > 0) r = 1.0 - p; |
|
774
|
8
|
|
|
|
|
|
r = log(-log(r)); |
|
775
|
8
|
|
|
|
|
|
x = c[0] + r * (c[1] + r * (c[2] + r * (c[3] + r * (c[4] + |
|
776
|
8
|
|
|
|
|
|
r * (c[5] + r * (c[6] + r * (c[7] + r * c[8]))))))); |
|
777
|
8
|
100
|
|
|
|
|
if (y < 0) x = -x; |
|
778
|
|
|
|
|
|
|
} |
|
779
|
30
|
|
|
|
|
|
return x; |
|
780
|
|
|
|
|
|
|
} |
|
781
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------- |
|
782
|
|
|
|
|
|
|
* Exact Spearman p-value via exhaustive permutation enumeration. |
|
783
|
|
|
|
|
|
|
* |
|
784
|
|
|
|
|
|
|
* Under H0, all n! orderings of ranks are equally probable. We visit |
|
785
|
|
|
|
|
|
|
* every permutation of {1..n} with Heap's algorithm (O(n!), no allocs |
|
786
|
|
|
|
|
|
|
* inside the loop) and count how many yield S ≤ s_obs ("lower tail", |
|
787
|
|
|
|
|
|
|
* i.e. rho ≥ rho_obs) and how many yield S ≥ s_obs ("upper tail"). |
|
788
|
|
|
|
|
|
|
* |
|
789
|
|
|
|
|
|
|
* Mirrors R's default: exact = (n < 10) with no ties. |
|
790
|
|
|
|
|
|
|
* Valid up to n = 9 (362 880 iterations — negligible cost). |
|
791
|
|
|
|
|
|
|
* ----------------------------------------------------------------------- */ |
|
792
|
1
|
|
|
|
|
|
static double spearman_exact_pvalue(double s_obs, size_t n, const char *restrict alt) { |
|
793
|
1
|
|
|
|
|
|
int *restrict perm = (int*)safemalloc(n * sizeof(int)); |
|
794
|
1
|
|
|
|
|
|
int *restrict c = (int*)safemalloc(n * sizeof(int)); |
|
795
|
6
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { perm[i] = i + 1; c[i] = 0; } |
|
796
|
|
|
|
|
|
|
|
|
797
|
1
|
|
|
|
|
|
long count_le = 0, count_ge = 0, total = 0; |
|
798
|
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
#define TALLY_PERM() do { \ |
|
800
|
|
|
|
|
|
|
double s_ = 0.0; \ |
|
801
|
|
|
|
|
|
|
for (int ii = 0; ii < n; ii++) { \ |
|
802
|
|
|
|
|
|
|
double d_ = (double)(ii + 1) - (double)perm[ii];\ |
|
803
|
|
|
|
|
|
|
s_ += d_ * d_; \ |
|
804
|
|
|
|
|
|
|
} \ |
|
805
|
|
|
|
|
|
|
if (s_ <= s_obs + 1e-9) count_le++; \ |
|
806
|
|
|
|
|
|
|
if (s_ >= s_obs - 1e-9) count_ge++; \ |
|
807
|
|
|
|
|
|
|
total++; \ |
|
808
|
|
|
|
|
|
|
} while (0) |
|
809
|
|
|
|
|
|
|
|
|
810
|
6
|
100
|
|
|
|
|
TALLY_PERM(); /* initial permutation [1, 2, ..., n] */ |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
|
|
812
|
1
|
|
|
|
|
|
unsigned int k = 1; |
|
813
|
206
|
100
|
|
|
|
|
while (k < n) { |
|
814
|
205
|
100
|
|
|
|
|
if (c[k] < k) { |
|
815
|
|
|
|
|
|
|
int tmp; |
|
816
|
119
|
100
|
|
|
|
|
if (k % 2 == 0) { |
|
817
|
44
|
|
|
|
|
|
tmp = perm[0]; perm[0] = perm[k]; perm[k] = tmp; |
|
818
|
|
|
|
|
|
|
} else { |
|
819
|
75
|
|
|
|
|
|
tmp = perm[c[k]]; perm[c[k]] = perm[k]; perm[k] = tmp; |
|
820
|
|
|
|
|
|
|
} |
|
821
|
714
|
100
|
|
|
|
|
TALLY_PERM(); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
822
|
119
|
|
|
|
|
|
c[k]++; |
|
823
|
119
|
|
|
|
|
|
k = 1; |
|
824
|
|
|
|
|
|
|
} else { |
|
825
|
86
|
|
|
|
|
|
c[k] = 0; |
|
826
|
86
|
|
|
|
|
|
k++; |
|
827
|
|
|
|
|
|
|
} |
|
828
|
|
|
|
|
|
|
} |
|
829
|
|
|
|
|
|
|
#undef TALLY_PERM |
|
830
|
1
|
|
|
|
|
|
Safefree(perm); Safefree(c); |
|
831
|
|
|
|
|
|
|
/* p_le = P(S ≤ s_obs) ≡ P(rho ≥ rho_obs) — upper rho tail |
|
832
|
|
|
|
|
|
|
* p_ge = P(S ≥ s_obs) ≡ P(rho ≤ rho_obs) — lower rho tail */ |
|
833
|
1
|
|
|
|
|
|
double p_le = (double)count_le / (double)total; |
|
834
|
1
|
|
|
|
|
|
double p_ge = (double)count_ge / (double)total; |
|
835
|
|
|
|
|
|
|
|
|
836
|
1
|
50
|
|
|
|
|
if (strcmp(alt, "greater") == 0) return p_le; |
|
837
|
1
|
50
|
|
|
|
|
if (strcmp(alt, "less") == 0) return p_ge; |
|
838
|
|
|
|
|
|
|
/* two.sided: 2 × the smaller tail, clamped to 1 */ |
|
839
|
1
|
50
|
|
|
|
|
double p = 2.0 * (p_le < p_ge ? p_le : p_ge); |
|
840
|
1
|
50
|
|
|
|
|
return (p > 1.0) ? 1.0 : p; |
|
841
|
|
|
|
|
|
|
} |
|
842
|
|
|
|
|
|
|
/* ----------------------------------------------------------------------- |
|
843
|
|
|
|
|
|
|
* Exact Kendall p-value via Mahonian Numbers (Inversions distribution) |
|
844
|
|
|
|
|
|
|
* Matches R's behavior for N < 50 without ties. |
|
845
|
|
|
|
|
|
|
* ----------------------------------------------------------------------- */ |
|
846
|
2
|
|
|
|
|
|
static double kendall_exact_pvalue(size_t n, double s_obs, const char *restrict alt) { |
|
847
|
2
|
|
|
|
|
|
long max_inv = (long)n * (n - 1) / 2; |
|
848
|
2
|
|
|
|
|
|
double *restrict dp = (double*)safemalloc((max_inv + 1) * sizeof(double)); |
|
849
|
24
|
100
|
|
|
|
|
for (long i = 0; i <= max_inv; i++) dp[i] = 0.0; |
|
850
|
2
|
|
|
|
|
|
dp[0] = 1.0; |
|
851
|
|
|
|
|
|
|
/* Build the distribution of inversions via DP */ |
|
852
|
10
|
100
|
|
|
|
|
for (size_t i = 2; i <= n; i++) { |
|
853
|
8
|
|
|
|
|
|
double *restrict next_dp = (double*)safemalloc((max_inv + 1) * sizeof(double)); |
|
854
|
96
|
100
|
|
|
|
|
for (long k = 0; k <= max_inv; k++) next_dp[k] = 0.0; |
|
855
|
8
|
|
|
|
|
|
int current_max_inv = i * (i - 1) / 2; |
|
856
|
56
|
100
|
|
|
|
|
for (int k = 0; k <= current_max_inv; k++) { |
|
857
|
48
|
|
|
|
|
|
double sum = 0; |
|
858
|
206
|
100
|
|
|
|
|
for (int j = 0; j <= i - 1 && k - j >= 0; j++) { |
|
|
|
100
|
|
|
|
|
|
|
859
|
158
|
|
|
|
|
|
sum += dp[k - j]; |
|
860
|
|
|
|
|
|
|
} |
|
861
|
|
|
|
|
|
|
// Divide by 'i' directly to keep array as pure probabilities and prevent overflow |
|
862
|
48
|
|
|
|
|
|
next_dp[k] = sum / (double)i; |
|
863
|
|
|
|
|
|
|
} |
|
864
|
8
|
|
|
|
|
|
Safefree(dp); |
|
865
|
8
|
|
|
|
|
|
dp = next_dp; |
|
866
|
|
|
|
|
|
|
} |
|
867
|
|
|
|
|
|
|
// Convert S statistic to target number of inversions |
|
868
|
2
|
|
|
|
|
|
long i_obs = (long)round((max_inv - s_obs) / 2.0); |
|
869
|
2
|
50
|
|
|
|
|
if (i_obs < 0) i_obs = 0; |
|
870
|
2
|
50
|
|
|
|
|
if (i_obs > max_inv) i_obs = max_inv; |
|
871
|
2
|
|
|
|
|
|
double p_le = 0.0; /* P(S <= S_obs) */ |
|
872
|
20
|
100
|
|
|
|
|
for (long k = i_obs; k <= max_inv; k++) p_le += dp[k]; |
|
873
|
2
|
|
|
|
|
|
double p_ge = 0.0; /* P(S >= S_obs) */ |
|
874
|
8
|
100
|
|
|
|
|
for (long k = 0; k <= i_obs; k++) p_ge += dp[k]; |
|
875
|
2
|
|
|
|
|
|
Safefree(dp); |
|
876
|
2
|
50
|
|
|
|
|
if (strcmp(alt, "greater") == 0) return p_ge; |
|
877
|
2
|
100
|
|
|
|
|
if (strcmp(alt, "less") == 0) return p_le; |
|
878
|
|
|
|
|
|
|
// two.sided |
|
879
|
1
|
50
|
|
|
|
|
double p = 2.0 * (p_ge < p_le ? p_ge : p_le); |
|
880
|
1
|
50
|
|
|
|
|
return p > 1.0 ? 1.0 : p; |
|
881
|
|
|
|
|
|
|
} |
|
882
|
|
|
|
|
|
|
// F-distribution Cumulative Distribution Function P(F <= f) |
|
883
|
304
|
|
|
|
|
|
static double pf(double f, double df1, double df2) { |
|
884
|
304
|
50
|
|
|
|
|
if (f <= 0.0) return 0.0; |
|
885
|
304
|
|
|
|
|
|
double x = (df1 * f) / (df1 * f + df2); |
|
886
|
304
|
|
|
|
|
|
return incbeta(df1 / 2.0, df2 / 2.0, x); |
|
887
|
|
|
|
|
|
|
} |
|
888
|
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
/* Householder QR Decomposition for Sequential Sums of Squares */ |
|
890
|
|
|
|
|
|
|
/* Householder QR Decomposition for Sequential Sums of Squares */ |
|
891
|
7
|
|
|
|
|
|
static void apply_householder_aov(double** restrict X, double* restrict y, size_t n, size_t p, bool* restrict aliased, size_t* restrict rank_map) { |
|
892
|
7
|
|
|
|
|
|
size_t r = 0; // Rank/Row tracker |
|
893
|
27
|
100
|
|
|
|
|
for (size_t k = 0; k < p; k++) { |
|
894
|
20
|
|
|
|
|
|
aliased[k] = FALSE; |
|
895
|
20
|
50
|
|
|
|
|
if (r >= n) { |
|
896
|
0
|
|
|
|
|
|
aliased[k] = TRUE; |
|
897
|
0
|
|
|
|
|
|
continue; |
|
898
|
|
|
|
|
|
|
} |
|
899
|
|
|
|
|
|
|
|
|
900
|
20
|
|
|
|
|
|
double max_val = 0; |
|
901
|
188
|
100
|
|
|
|
|
for (size_t i = r; i < n; i++) { |
|
902
|
168
|
100
|
|
|
|
|
if (fabs(X[i][k]) > max_val) max_val = fabs(X[i][k]); |
|
903
|
|
|
|
|
|
|
} |
|
904
|
20
|
100
|
|
|
|
|
if (max_val < 1e-10) { |
|
905
|
1
|
|
|
|
|
|
aliased[k] = TRUE; |
|
906
|
1
|
|
|
|
|
|
continue; |
|
907
|
|
|
|
|
|
|
} // Collinear or zero column |
|
908
|
|
|
|
|
|
|
|
|
909
|
19
|
|
|
|
|
|
double norm = 0; |
|
910
|
184
|
100
|
|
|
|
|
for (size_t i = r; i < n; i++) { |
|
911
|
165
|
|
|
|
|
|
X[i][k] /= max_val; |
|
912
|
165
|
|
|
|
|
|
norm += X[i][k] * X[i][k]; |
|
913
|
|
|
|
|
|
|
} |
|
914
|
19
|
|
|
|
|
|
norm = sqrt(norm); |
|
915
|
19
|
100
|
|
|
|
|
double s = (X[r][k] > 0) ? -norm : norm; |
|
916
|
19
|
|
|
|
|
|
double u1 = X[r][k] - s; |
|
917
|
19
|
|
|
|
|
|
X[r][k] = s * max_val; |
|
918
|
|
|
|
|
|
|
|
|
919
|
39
|
100
|
|
|
|
|
for (size_t j = k + 1; j < p; j++) { |
|
920
|
20
|
|
|
|
|
|
double dot = u1 * X[r][j]; |
|
921
|
202
|
100
|
|
|
|
|
for (size_t i = r + 1; i < n; i++) dot += X[i][j] * X[i][k]; |
|
922
|
20
|
|
|
|
|
|
double tau = dot / (s * u1); |
|
923
|
20
|
|
|
|
|
|
X[r][j] += tau * u1; |
|
924
|
202
|
100
|
|
|
|
|
for (size_t i = r + 1; i < n; i++) X[i][j] += tau * X[i][k]; |
|
925
|
|
|
|
|
|
|
} |
|
926
|
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
// Transform the response vector y |
|
928
|
19
|
|
|
|
|
|
double dot_y = u1 * y[r]; |
|
929
|
165
|
100
|
|
|
|
|
for (size_t i = r + 1; i < n; i++) dot_y += y[i] * X[i][k]; |
|
930
|
19
|
|
|
|
|
|
double tau_y = dot_y / (s * u1); |
|
931
|
19
|
|
|
|
|
|
y[r] += tau_y * u1; |
|
932
|
165
|
100
|
|
|
|
|
for (size_t i = r + 1; i < n; i++) y[i] += tau_y * X[i][k]; |
|
933
|
|
|
|
|
|
|
|
|
934
|
19
|
|
|
|
|
|
rank_map[k] = r; // Map original column index to orthogonal row index |
|
935
|
19
|
|
|
|
|
|
r++; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
7
|
|
|
|
|
|
} |
|
938
|
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
// --- write_table Helpers --- |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
// Sorts string arrays alphabetically |
|
942
|
56
|
|
|
|
|
|
static int cmp_string_wt(const void *a, const void *b) { |
|
943
|
56
|
|
|
|
|
|
return strcmp(*(const char**)a, *(const char**)b); |
|
944
|
|
|
|
|
|
|
} |
|
945
|
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
// Emulates Perl's /\D/ check |
|
947
|
13
|
|
|
|
|
|
static bool contains_nondigit(pTHX_ SV *restrict sv) { |
|
948
|
13
|
50
|
|
|
|
|
if (!sv || !SvOK(sv)) return 0; |
|
|
|
50
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
STRLEN len; |
|
950
|
13
|
|
|
|
|
|
char *restrict s = SvPVbyte(sv, len); |
|
951
|
25
|
100
|
|
|
|
|
for (size_t i = 0; i < len; i++) { |
|
952
|
13
|
100
|
|
|
|
|
if (!isdigit(s[i])) return 1; |
|
953
|
|
|
|
|
|
|
} |
|
954
|
12
|
|
|
|
|
|
return 0; |
|
955
|
|
|
|
|
|
|
} |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
// Writes a properly quoted string dynamically |
|
958
|
371
|
|
|
|
|
|
static void print_str_quoted(PerlIO *fh, const char *str, const char *sep) { |
|
959
|
371
|
50
|
|
|
|
|
if (!str) str = ""; |
|
960
|
371
|
|
|
|
|
|
bool needs_quotes = 0; |
|
961
|
371
|
100
|
|
|
|
|
if (strstr(str, sep) != NULL || strchr(str, '"') != NULL || strchr(str, '\r') != NULL || strchr(str, '\n') != NULL) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
962
|
16
|
|
|
|
|
|
needs_quotes = 1; |
|
963
|
|
|
|
|
|
|
} |
|
964
|
|
|
|
|
|
|
|
|
965
|
371
|
100
|
|
|
|
|
if (needs_quotes) { |
|
966
|
16
|
|
|
|
|
|
PerlIO_putc(fh, '"'); |
|
967
|
156
|
100
|
|
|
|
|
for (const char *restrict p = str; *p; p++) { |
|
968
|
140
|
100
|
|
|
|
|
if (*p == '"') { |
|
969
|
9
|
|
|
|
|
|
PerlIO_putc(fh, '"'); |
|
970
|
9
|
|
|
|
|
|
PerlIO_putc(fh, '"'); |
|
971
|
|
|
|
|
|
|
} else { |
|
972
|
131
|
|
|
|
|
|
PerlIO_putc(fh, *p); |
|
973
|
|
|
|
|
|
|
} |
|
974
|
|
|
|
|
|
|
} |
|
975
|
16
|
|
|
|
|
|
PerlIO_putc(fh, '"'); |
|
976
|
|
|
|
|
|
|
} else { |
|
977
|
355
|
|
|
|
|
|
PerlIO_puts(fh, str); |
|
978
|
|
|
|
|
|
|
} |
|
979
|
371
|
|
|
|
|
|
} |
|
980
|
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
// Writes an array of strings joined by sep |
|
982
|
112
|
|
|
|
|
|
static void print_string_row(pTHX_ PerlIO *fh, const char **row, size_t len, const char *sep) { |
|
983
|
112
|
|
|
|
|
|
size_t sep_len = strlen(sep); |
|
984
|
483
|
100
|
|
|
|
|
for (size_t i = 0; i < len; i++) { |
|
985
|
371
|
100
|
|
|
|
|
if (i > 0) PerlIO_write(fh, sep, sep_len); |
|
986
|
371
|
100
|
|
|
|
|
if (row[i]) { |
|
987
|
364
|
|
|
|
|
|
print_str_quoted(fh, row[i], sep); |
|
988
|
|
|
|
|
|
|
} else { |
|
989
|
7
|
|
|
|
|
|
print_str_quoted(fh, "", sep); |
|
990
|
|
|
|
|
|
|
} |
|
991
|
|
|
|
|
|
|
} |
|
992
|
112
|
|
|
|
|
|
PerlIO_putc(fh, '\n'); |
|
993
|
112
|
|
|
|
|
|
} |
|
994
|
|
|
|
|
|
|
// Calculates the Regularized Upper Incomplete Gamma Function Q(a, x) |
|
995
|
|
|
|
|
|
|
// This perfectly replicates R's pchisq(..., lower.tail=FALSE) |
|
996
|
11
|
|
|
|
|
|
double igamc(double a, double x) { |
|
997
|
11
|
50
|
|
|
|
|
if (x < 0.0 || a <= 0.0) return 1.0; |
|
|
|
50
|
|
|
|
|
|
|
998
|
11
|
50
|
|
|
|
|
if (x == 0.0) return 1.0; |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
// Series expansion for x < a + 1 |
|
1001
|
11
|
100
|
|
|
|
|
if (x < a + 1.0) { |
|
1002
|
4
|
|
|
|
|
|
double sum = 1.0 / a; |
|
1003
|
4
|
|
|
|
|
|
double term = 1.0 / a; |
|
1004
|
4
|
|
|
|
|
|
double n = 1.0; |
|
1005
|
62
|
100
|
|
|
|
|
while (fabs(term) > 1e-15) { |
|
1006
|
58
|
|
|
|
|
|
term *= x / (a + n); |
|
1007
|
58
|
|
|
|
|
|
sum += term; |
|
1008
|
58
|
|
|
|
|
|
n += 1.0; |
|
1009
|
|
|
|
|
|
|
} |
|
1010
|
4
|
|
|
|
|
|
return 1.0 - (sum * exp(-x + a * log(x) - lgamma(a))); |
|
1011
|
|
|
|
|
|
|
} |
|
1012
|
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
// Continued fraction for x >= a + 1 |
|
1014
|
7
|
|
|
|
|
|
double b = x + 1.0 - a; |
|
1015
|
7
|
|
|
|
|
|
double c = 1.0 / 1e-30; |
|
1016
|
7
|
|
|
|
|
|
double d = 1.0 / b; |
|
1017
|
7
|
|
|
|
|
|
double h = d, i = 1.0; |
|
1018
|
105
|
50
|
|
|
|
|
while (i < 10000) { // Safety bound |
|
1019
|
105
|
|
|
|
|
|
double an = -i * (i - a); |
|
1020
|
105
|
|
|
|
|
|
b += 2.0; |
|
1021
|
105
|
|
|
|
|
|
d = an * d + b; |
|
1022
|
105
|
50
|
|
|
|
|
if (fabs(d) < 1e-30) d = 1e-30; |
|
1023
|
105
|
|
|
|
|
|
c = b + an / c; |
|
1024
|
105
|
50
|
|
|
|
|
if (fabs(c) < 1e-30) c = 1e-30; |
|
1025
|
105
|
|
|
|
|
|
d = 1.0 / d; |
|
1026
|
105
|
|
|
|
|
|
double del = d * c; |
|
1027
|
105
|
|
|
|
|
|
h *= del; |
|
1028
|
105
|
100
|
|
|
|
|
if (fabs(del - 1.0) < 1e-15) break; |
|
1029
|
98
|
|
|
|
|
|
i += 1.0; |
|
1030
|
|
|
|
|
|
|
} |
|
1031
|
7
|
|
|
|
|
|
return h * exp(-x + a * log(x) - lgamma(a)); |
|
1032
|
|
|
|
|
|
|
} |
|
1033
|
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
// Chi-Squared p-value is simply the Incomplete Gamma of (df/2, stat/2) |
|
1035
|
11
|
|
|
|
|
|
double get_p_value(double stat, int df) { |
|
1036
|
11
|
50
|
|
|
|
|
if (df <= 0) return 1.0; |
|
1037
|
11
|
50
|
|
|
|
|
if (stat <= 0.0) return 1.0; |
|
1038
|
11
|
|
|
|
|
|
return igamc((double)df / 2.0, stat / 2.0); |
|
1039
|
|
|
|
|
|
|
} |
|
1040
|
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
#ifndef M_SQRT1_2 |
|
1042
|
|
|
|
|
|
|
#define M_SQRT1_2 0.70710678118654752440 |
|
1043
|
|
|
|
|
|
|
#endif |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
// Robust Binomial Coefficient using long double |
|
1046
|
2
|
|
|
|
|
|
static long double choose_comb(int n, int k) { |
|
1047
|
2
|
50
|
|
|
|
|
if (k < 0 || k > n) return 0.0L; |
|
|
|
50
|
|
|
|
|
|
|
1048
|
2
|
50
|
|
|
|
|
if (k > n / 2) k = n - k; |
|
1049
|
2
|
|
|
|
|
|
long double res = 1.0L; |
|
1050
|
8
|
100
|
|
|
|
|
for (int i = 1; i <= k; i++) { |
|
1051
|
6
|
|
|
|
|
|
res = res * (long double)(n - i + 1) / (long double)i; |
|
1052
|
|
|
|
|
|
|
} |
|
1053
|
2
|
|
|
|
|
|
return res; |
|
1054
|
|
|
|
|
|
|
} |
|
1055
|
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
/* Exact CDF for Mann-Whitney U: P(U <= q) |
|
1057
|
|
|
|
|
|
|
Mathematically identical to R's cwilcox generating function */ |
|
1058
|
4
|
|
|
|
|
|
static double exact_pwilcox(double q, int m, int n) { |
|
1059
|
4
|
|
|
|
|
|
int k = (int)floor(q + 1e-7); // R uses 1e-7 fuzz |
|
1060
|
4
|
|
|
|
|
|
int max_u = m * n; |
|
1061
|
4
|
100
|
|
|
|
|
if (k < 0) return 0.0; |
|
1062
|
2
|
50
|
|
|
|
|
if (k >= max_u) return 1.0; |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
2
|
|
|
|
|
|
long double *restrict w = (long double *)safecalloc(max_u + 1, sizeof(long double)); |
|
1065
|
2
|
|
|
|
|
|
w[0] = 1.0L; |
|
1066
|
|
|
|
|
|
|
|
|
1067
|
8
|
100
|
|
|
|
|
for (int j = 1; j <= n; j++) { |
|
1068
|
54
|
100
|
|
|
|
|
for (int i = j; i <= max_u; i++) w[i] += w[i - j]; |
|
1069
|
36
|
100
|
|
|
|
|
for (int i = max_u; i >= j + m; i--) w[i] -= w[i - j - m]; |
|
1070
|
|
|
|
|
|
|
} |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
2
|
|
|
|
|
|
long double cum_p = 0.0L; |
|
1073
|
4
|
100
|
|
|
|
|
for (int i = 0; i <= k; i++) cum_p += w[i]; |
|
1074
|
|
|
|
|
|
|
|
|
1075
|
2
|
|
|
|
|
|
long double total = choose_comb(m + n, n); |
|
1076
|
2
|
|
|
|
|
|
double result = (double)(cum_p / total); |
|
1077
|
|
|
|
|
|
|
|
|
1078
|
2
|
|
|
|
|
|
Safefree(w); |
|
1079
|
2
|
|
|
|
|
|
return result; |
|
1080
|
|
|
|
|
|
|
} |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
/* Exact CDF for Wilcoxon Signed Rank: P(V <= q) |
|
1083
|
|
|
|
|
|
|
Mathematically identical to R's csignrank subset-sum DP */ |
|
1084
|
6
|
|
|
|
|
|
static double exact_psignrank(double q, int n) { |
|
1085
|
6
|
|
|
|
|
|
int k = (int)floor(q + 1e-7); |
|
1086
|
6
|
|
|
|
|
|
int max_v = n * (n + 1) / 2; |
|
1087
|
6
|
50
|
|
|
|
|
if (k < 0) return 0.0; |
|
1088
|
6
|
100
|
|
|
|
|
if (k >= max_v) return 1.0; |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
5
|
|
|
|
|
|
long double *restrict w = (long double *)safecalloc(max_v + 1, sizeof(long double)); |
|
1091
|
5
|
|
|
|
|
|
w[0] = 1.0L; |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
46
|
100
|
|
|
|
|
for (int i = 1; i <= n; i++) { |
|
1094
|
1582
|
100
|
|
|
|
|
for (int j = max_v; j >= i; j--) w[j] += w[j - i]; |
|
1095
|
|
|
|
|
|
|
} |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
5
|
|
|
|
|
|
long double cum_p = 0.0L; |
|
1098
|
182
|
100
|
|
|
|
|
for (int i = 0; i <= k; i++) cum_p += w[i]; |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
5
|
|
|
|
|
|
long double total = powl(2.0L, (long double)n); |
|
1101
|
5
|
|
|
|
|
|
double result = (double)(cum_p / total); |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
5
|
|
|
|
|
|
Safefree(w); |
|
1104
|
5
|
|
|
|
|
|
return result; |
|
1105
|
|
|
|
|
|
|
} |
|
1106
|
|
|
|
|
|
|
|
|
1107
|
301
|
|
|
|
|
|
static int cmp_rank_info(const void *a, const void *b) { |
|
1108
|
301
|
|
|
|
|
|
double da = ((const RankInfo*)a)->val; |
|
1109
|
301
|
|
|
|
|
|
double db = ((const RankInfo*)b)->val; |
|
1110
|
301
|
|
|
|
|
|
return (da > db) - (da < db); |
|
1111
|
|
|
|
|
|
|
} |
|
1112
|
|
|
|
|
|
|
|
|
1113
|
11
|
|
|
|
|
|
static double rank_and_count_ties(RankInfo *restrict ri, size_t n, bool *restrict has_ties) { |
|
1114
|
11
|
50
|
|
|
|
|
if (n == 0) return 0.0; |
|
1115
|
11
|
|
|
|
|
|
qsort(ri, n, sizeof(RankInfo), cmp_rank_info); |
|
1116
|
11
|
|
|
|
|
|
size_t i = 0; |
|
1117
|
11
|
|
|
|
|
|
double tie_adj = 0.0; |
|
1118
|
11
|
|
|
|
|
|
*has_ties = 0; |
|
1119
|
124
|
100
|
|
|
|
|
while (i < n) { |
|
1120
|
113
|
|
|
|
|
|
size_t j = i + 1; |
|
1121
|
121
|
100
|
|
|
|
|
while (j < n && ri[j].val == ri[i].val) j++; |
|
|
|
100
|
|
|
|
|
|
|
1122
|
113
|
|
|
|
|
|
double r = (double)(i + 1 + j) / 2.0; |
|
1123
|
234
|
100
|
|
|
|
|
for (size_t k = i; k < j; k++) ri[k].rank = r; |
|
1124
|
113
|
|
|
|
|
|
size_t t = j - i; |
|
1125
|
113
|
100
|
|
|
|
|
if (t > 1) { *has_ties = 1; tie_adj += ((double)t * t * t - t); } |
|
1126
|
113
|
|
|
|
|
|
i = j; |
|
1127
|
|
|
|
|
|
|
} |
|
1128
|
11
|
|
|
|
|
|
return tie_adj; |
|
1129
|
|
|
|
|
|
|
} |
|
1130
|
|
|
|
|
|
|
/* --- KS-TEST C HELPER SECTION --- */ |
|
1131
|
|
|
|
|
|
|
#ifndef M_PI_2 |
|
1132
|
|
|
|
|
|
|
#define M_PI_2 1.57079632679489661923 |
|
1133
|
|
|
|
|
|
|
#endif |
|
1134
|
|
|
|
|
|
|
#ifndef M_PI_4 |
|
1135
|
|
|
|
|
|
|
#define M_PI_4 0.78539816339744830962 |
|
1136
|
|
|
|
|
|
|
#endif |
|
1137
|
|
|
|
|
|
|
#ifndef M_1_SQRT_2PI |
|
1138
|
|
|
|
|
|
|
#define M_1_SQRT_2PI 0.39894228040143267794 |
|
1139
|
|
|
|
|
|
|
#endif |
|
1140
|
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
// Scalar integer power used by K2x |
|
1142
|
39
|
|
|
|
|
|
static double r_pow_di(double x, int n) { |
|
1143
|
39
|
50
|
|
|
|
|
if (n == 0) return 1.0; |
|
1144
|
39
|
50
|
|
|
|
|
if (n < 0) return 1.0 / r_pow_di(x, -n); |
|
1145
|
39
|
|
|
|
|
|
double val = 1.0; |
|
1146
|
438
|
100
|
|
|
|
|
for (int i = 0; i < n; i++) val *= x; |
|
1147
|
39
|
|
|
|
|
|
return val; |
|
1148
|
|
|
|
|
|
|
} |
|
1149
|
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
// Two-sample two-sided asymptotic distribution |
|
1151
|
0
|
|
|
|
|
|
static double K2l(double x, int lower, double tol) { |
|
1152
|
|
|
|
|
|
|
double s, z, p; |
|
1153
|
|
|
|
|
|
|
int k; |
|
1154
|
0
|
0
|
|
|
|
|
if(x <= 0.) { |
|
1155
|
0
|
0
|
|
|
|
|
if(lower) p = 0.; |
|
1156
|
0
|
|
|
|
|
|
else p = 1.; |
|
1157
|
0
|
0
|
|
|
|
|
} else if(x < 1.) { |
|
1158
|
0
|
|
|
|
|
|
int k_max = (int) sqrt(2.0 - log(tol)); |
|
1159
|
0
|
|
|
|
|
|
double w = log(x); |
|
1160
|
0
|
|
|
|
|
|
z = - (M_PI_2 * M_PI_4) / (x * x); |
|
1161
|
0
|
|
|
|
|
|
s = 0; |
|
1162
|
0
|
0
|
|
|
|
|
for(k = 1; k < k_max; k += 2) { |
|
1163
|
0
|
|
|
|
|
|
s += exp(k * k * z - w); |
|
1164
|
|
|
|
|
|
|
} |
|
1165
|
0
|
|
|
|
|
|
p = s / M_1_SQRT_2PI; |
|
1166
|
0
|
0
|
|
|
|
|
if(!lower) p = 1.0 - p; |
|
1167
|
|
|
|
|
|
|
} else { |
|
1168
|
|
|
|
|
|
|
double new_val, old_val; |
|
1169
|
0
|
|
|
|
|
|
z = -2.0 * x * x; |
|
1170
|
0
|
|
|
|
|
|
s = -1.0; |
|
1171
|
0
|
0
|
|
|
|
|
if(lower) { |
|
1172
|
0
|
|
|
|
|
|
k = 1; old_val = 0.0; new_val = 1.0; |
|
1173
|
|
|
|
|
|
|
} else { |
|
1174
|
0
|
|
|
|
|
|
k = 2; old_val = 0.0; new_val = 2.0 * exp(z); |
|
1175
|
|
|
|
|
|
|
} |
|
1176
|
0
|
0
|
|
|
|
|
while(fabs(old_val - new_val) > tol) { |
|
1177
|
0
|
|
|
|
|
|
old_val = new_val; |
|
1178
|
0
|
|
|
|
|
|
new_val += 2.0 * s * exp(z * k * k); |
|
1179
|
0
|
|
|
|
|
|
s *= -1.0; |
|
1180
|
0
|
|
|
|
|
|
k++; |
|
1181
|
|
|
|
|
|
|
} |
|
1182
|
0
|
|
|
|
|
|
p = new_val; |
|
1183
|
|
|
|
|
|
|
} |
|
1184
|
0
|
|
|
|
|
|
return p; |
|
1185
|
|
|
|
|
|
|
} |
|
1186
|
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
// Auxiliary routines used by K2x() for matrix operations |
|
1188
|
7
|
|
|
|
|
|
static void m_multiply(double *A, double *B, double *C, unsigned int m) { |
|
1189
|
140
|
100
|
|
|
|
|
for(unsigned int i = 0; i < m; i++) { |
|
1190
|
2660
|
100
|
|
|
|
|
for(unsigned int j = 0; j < m; j++) { |
|
1191
|
2527
|
|
|
|
|
|
double s = 0.; |
|
1192
|
50540
|
100
|
|
|
|
|
for(unsigned int k = 0; k < m; k++) s += A[i * m + k] * B[k * m + j]; |
|
1193
|
2527
|
|
|
|
|
|
C[i * m + j] = s; |
|
1194
|
|
|
|
|
|
|
} |
|
1195
|
|
|
|
|
|
|
} |
|
1196
|
7
|
|
|
|
|
|
} |
|
1197
|
|
|
|
|
|
|
|
|
1198
|
6
|
|
|
|
|
|
static void m_power(double *A, int eA, double *V, int *eV, int m, int n) { |
|
1199
|
6
|
100
|
|
|
|
|
if(n == 1) { |
|
1200
|
362
|
100
|
|
|
|
|
for(int i = 0; i < m * m; i++) V[i] = A[i]; |
|
1201
|
1
|
|
|
|
|
|
*eV = eA; |
|
1202
|
1
|
|
|
|
|
|
return; |
|
1203
|
|
|
|
|
|
|
} |
|
1204
|
5
|
|
|
|
|
|
m_power(A, eA, V, eV, m, n / 2); |
|
1205
|
5
|
|
|
|
|
|
double *restrict B = (double*) safecalloc(m * m, sizeof(double)); |
|
1206
|
5
|
|
|
|
|
|
m_multiply(V, V, B, m); |
|
1207
|
5
|
|
|
|
|
|
int eB = 2 * (*eV); |
|
1208
|
5
|
100
|
|
|
|
|
if((n % 2) == 0) { |
|
1209
|
1086
|
100
|
|
|
|
|
for(int i = 0; i < m * m; i++) V[i] = B[i]; |
|
1210
|
3
|
|
|
|
|
|
*eV = eB; |
|
1211
|
|
|
|
|
|
|
} else { |
|
1212
|
2
|
|
|
|
|
|
m_multiply(A, B, V, m); |
|
1213
|
2
|
|
|
|
|
|
*eV = eA + eB; |
|
1214
|
|
|
|
|
|
|
} |
|
1215
|
5
|
50
|
|
|
|
|
if(V[(m / 2) * m + (m / 2)] > 1e140) { |
|
1216
|
0
|
0
|
|
|
|
|
for(int i = 0; i < m * m; i++) V[i] = V[i] * 1e-140; |
|
1217
|
0
|
|
|
|
|
|
*eV += 140; |
|
1218
|
|
|
|
|
|
|
} |
|
1219
|
5
|
|
|
|
|
|
Safefree(B); |
|
1220
|
|
|
|
|
|
|
} |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
// One-sample two-sided exact distribution |
|
1223
|
1
|
|
|
|
|
|
static double K2x(int n, double d) { |
|
1224
|
1
|
|
|
|
|
|
int k = (int) (n * d) + 1; |
|
1225
|
1
|
|
|
|
|
|
int m = 2 * k - 1; |
|
1226
|
1
|
|
|
|
|
|
double h = k - n * d; |
|
1227
|
1
|
|
|
|
|
|
double *restrict H = (double*) safecalloc(m * m, sizeof(double)); |
|
1228
|
1
|
|
|
|
|
|
double *restrict Q = (double*) safecalloc(m * m, sizeof(double)); |
|
1229
|
|
|
|
|
|
|
|
|
1230
|
20
|
100
|
|
|
|
|
for(int i = 0; i < m; i++) { |
|
1231
|
380
|
100
|
|
|
|
|
for(int j = 0; j < m; j++) { |
|
1232
|
361
|
100
|
|
|
|
|
if(i - j + 1 < 0) H[i * m + j] = 0; |
|
1233
|
208
|
|
|
|
|
|
else H[i * m + j] = 1; |
|
1234
|
|
|
|
|
|
|
} |
|
1235
|
|
|
|
|
|
|
} |
|
1236
|
20
|
100
|
|
|
|
|
for(int i = 0; i < m; i++) { |
|
1237
|
19
|
|
|
|
|
|
H[i * m] -= r_pow_di(h, i + 1); |
|
1238
|
19
|
|
|
|
|
|
H[(m - 1) * m + i] -= r_pow_di(h, (m - i)); |
|
1239
|
|
|
|
|
|
|
} |
|
1240
|
1
|
50
|
|
|
|
|
H[(m - 1) * m] += ((2 * h - 1 > 0) ? r_pow_di(2 * h - 1, m) : 0); |
|
1241
|
|
|
|
|
|
|
|
|
1242
|
20
|
100
|
|
|
|
|
for(int i = 0; i < m; i++) { |
|
1243
|
380
|
100
|
|
|
|
|
for(int j = 0; j < m; j++) { |
|
1244
|
361
|
100
|
|
|
|
|
if(i - j + 1 > 0) { |
|
1245
|
1520
|
100
|
|
|
|
|
for(int g = 1; g <= i - j + 1; g++) H[i * m + j] /= g; |
|
1246
|
|
|
|
|
|
|
} |
|
1247
|
|
|
|
|
|
|
} |
|
1248
|
|
|
|
|
|
|
} |
|
1249
|
|
|
|
|
|
|
|
|
1250
|
1
|
|
|
|
|
|
int eH = 0, eQ; |
|
1251
|
1
|
|
|
|
|
|
m_power(H, eH, Q, &eQ, m, n); |
|
1252
|
1
|
|
|
|
|
|
double s = Q[(k - 1) * m + k - 1]; |
|
1253
|
|
|
|
|
|
|
|
|
1254
|
51
|
100
|
|
|
|
|
for(int i = 1; i <= n; i++) { |
|
1255
|
50
|
|
|
|
|
|
s = s * (double)i / (double)n; |
|
1256
|
50
|
50
|
|
|
|
|
if(s < 1e-140) { |
|
1257
|
0
|
|
|
|
|
|
s *= 1e140; |
|
1258
|
0
|
|
|
|
|
|
eQ -= 140; |
|
1259
|
|
|
|
|
|
|
} |
|
1260
|
|
|
|
|
|
|
} |
|
1261
|
1
|
|
|
|
|
|
s *= pow(10.0, eQ); |
|
1262
|
1
|
|
|
|
|
|
Safefree(H); |
|
1263
|
1
|
|
|
|
|
|
Safefree(Q); |
|
1264
|
1
|
|
|
|
|
|
return s; |
|
1265
|
|
|
|
|
|
|
} |
|
1266
|
|
|
|
|
|
|
// Calculate D (two-sided), D+ (greater), and D- (less) simultaneously |
|
1267
|
9
|
|
|
|
|
|
static void calc_2sample_stats(double *x, size_t nx, double *y, size_t ny, |
|
1268
|
|
|
|
|
|
|
double *d, double *d_plus, double *d_minus) { |
|
1269
|
9
|
|
|
|
|
|
qsort(x, nx, sizeof(double), compare_doubles); |
|
1270
|
9
|
|
|
|
|
|
qsort(y, ny, sizeof(double), compare_doubles); |
|
1271
|
9
|
|
|
|
|
|
double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0; |
|
1272
|
9
|
|
|
|
|
|
size_t i = 0, j = 0; |
|
1273
|
309
|
100
|
|
|
|
|
while(i < nx || j < ny) { |
|
|
|
100
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
double val; |
|
1275
|
300
|
100
|
|
|
|
|
if (i < nx && j < ny) val = (x[i] < y[j]) ? x[i] : y[j]; |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1276
|
69
|
100
|
|
|
|
|
else if (i < nx) val = x[i]; |
|
1277
|
15
|
|
|
|
|
|
else val = y[j]; |
|
1278
|
480
|
100
|
|
|
|
|
while(i < nx && x[i] <= val) i++; |
|
|
|
100
|
|
|
|
|
|
|
1279
|
420
|
100
|
|
|
|
|
while(j < ny && y[j] <= val) j++; |
|
|
|
100
|
|
|
|
|
|
|
1280
|
300
|
|
|
|
|
|
double cdf1 = (double)i / nx; |
|
1281
|
300
|
|
|
|
|
|
double cdf2 = (double)j / ny; |
|
1282
|
300
|
|
|
|
|
|
double diff = cdf1 - cdf2; |
|
1283
|
300
|
100
|
|
|
|
|
if (diff > max_d_plus) max_d_plus = diff; |
|
1284
|
300
|
100
|
|
|
|
|
if (-diff > max_d_minus) max_d_minus = -diff; |
|
1285
|
300
|
100
|
|
|
|
|
if (fabs(diff) > max_d) max_d = fabs(diff); |
|
1286
|
|
|
|
|
|
|
} |
|
1287
|
9
|
|
|
|
|
|
*d = max_d; |
|
1288
|
9
|
|
|
|
|
|
*d_plus = max_d_plus; |
|
1289
|
9
|
|
|
|
|
|
*d_minus = max_d_minus; |
|
1290
|
9
|
|
|
|
|
|
} |
|
1291
|
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
// Branch the DP boundary check based on the 'alternative' |
|
1293
|
4950
|
|
|
|
|
|
static int psmirnov_exact_test(double q, double r, double s, int two_sided) { |
|
1294
|
4950
|
100
|
|
|
|
|
if (two_sided) return (fabs(r - s) >= q); |
|
1295
|
3160
|
|
|
|
|
|
return ((r - s) >= q); // Used for both D+ and D- via symmetry |
|
1296
|
|
|
|
|
|
|
} |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
// Evaluate the exact 2-sample probability |
|
1299
|
9
|
|
|
|
|
|
static double psmirnov_exact_uniq_upper(double q, int m, int n, int two_sided) { |
|
1300
|
9
|
|
|
|
|
|
double md = (double) m, nd = (double) n; |
|
1301
|
9
|
|
|
|
|
|
double *restrict u = (double *) safecalloc(n + 1, sizeof(double)); |
|
1302
|
9
|
|
|
|
|
|
u[0] = 0.; |
|
1303
|
|
|
|
|
|
|
|
|
1304
|
129
|
100
|
|
|
|
|
for(unsigned int j = 1; j <= n; j++) { |
|
1305
|
120
|
100
|
|
|
|
|
if(psmirnov_exact_test(q, 0., j / nd, two_sided)) u[j] = 1.; |
|
1306
|
96
|
|
|
|
|
|
else u[j] = u[j - 1]; |
|
1307
|
|
|
|
|
|
|
} |
|
1308
|
189
|
100
|
|
|
|
|
for(unsigned int i = 1; i <= m; i++) { |
|
1309
|
180
|
100
|
|
|
|
|
if(psmirnov_exact_test(q, i / md, 0., two_sided)) u[0] = 1.; |
|
1310
|
4830
|
100
|
|
|
|
|
for(int j = 1; j <= n; j++) { |
|
1311
|
4650
|
100
|
|
|
|
|
if(psmirnov_exact_test(q, i / md, j / nd, two_sided)) u[j] = 1.; |
|
1312
|
|
|
|
|
|
|
else { |
|
1313
|
3484
|
|
|
|
|
|
double v = (double)(i) / (double)(i + j); |
|
1314
|
3484
|
|
|
|
|
|
double w = (double)(j) / (double)(i + j); |
|
1315
|
3484
|
|
|
|
|
|
u[j] = v * u[j] + w * u[j - 1]; |
|
1316
|
|
|
|
|
|
|
} |
|
1317
|
|
|
|
|
|
|
} |
|
1318
|
|
|
|
|
|
|
} |
|
1319
|
9
|
|
|
|
|
|
double res = u[n]; |
|
1320
|
9
|
|
|
|
|
|
Safefree(u); |
|
1321
|
9
|
|
|
|
|
|
return res; |
|
1322
|
|
|
|
|
|
|
} |
|
1323
|
|
|
|
|
|
|
|
|
1324
|
229
|
|
|
|
|
|
static double p_body(double n, double delta, double sd, double sig_level, int tsample, int tside, bool strict) { |
|
1325
|
229
|
|
|
|
|
|
double nu = (n - 1.0) * (double)tsample; |
|
1326
|
229
|
50
|
|
|
|
|
if (nu < 1e-7) nu = 1e-7; |
|
1327
|
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
// Ensure sig_level/tside is not truncated |
|
1329
|
229
|
|
|
|
|
|
double p_tail = sig_level / (double)tside; |
|
1330
|
229
|
|
|
|
|
|
double qu = qt_tail(nu, p_tail); // qt(p, df, lower.tail=FALSE) |
|
1331
|
|
|
|
|
|
|
|
|
1332
|
229
|
|
|
|
|
|
double ncp = sqrt(n / (double)tsample) * (delta / sd); |
|
1333
|
|
|
|
|
|
|
|
|
1334
|
229
|
50
|
|
|
|
|
if (strict && tside == 2) { |
|
|
|
0
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
// Use R-style tail calls: 1 - P(T < qu) + P(T < -qu) |
|
1336
|
0
|
|
|
|
|
|
return (1.0 - exact_pnt(qu, nu, ncp)) + exact_pnt(-qu, nu, ncp); |
|
1337
|
|
|
|
|
|
|
} else { |
|
1338
|
|
|
|
|
|
|
// Default: 1 - P(T < qu) |
|
1339
|
|
|
|
|
|
|
// Ensure exact_pnt is using a convergence tolerance of at least 1e-15 |
|
1340
|
229
|
|
|
|
|
|
return 1.0 - exact_pnt(qu, nu, ncp); |
|
1341
|
|
|
|
|
|
|
} |
|
1342
|
|
|
|
|
|
|
} |
|
1343
|
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
// Bisection algorithm to find the inverse F-distribution (Quantile function) |
|
1345
|
|
|
|
|
|
|
// Equivalent to R's qf(p, df1, df2) |
|
1346
|
6
|
|
|
|
|
|
static double qf_bisection(double p, double df1, double df2) { |
|
1347
|
6
|
50
|
|
|
|
|
if (p <= 0.0) return 0.0; |
|
1348
|
6
|
50
|
|
|
|
|
if (p >= 1.0) return INFINITY; |
|
1349
|
6
|
|
|
|
|
|
double low = 0.0, high = 1.0; |
|
1350
|
|
|
|
|
|
|
// Find upper bound |
|
1351
|
20
|
100
|
|
|
|
|
while (pf(high, df1, df2) < p) { |
|
1352
|
14
|
|
|
|
|
|
low = high; |
|
1353
|
14
|
|
|
|
|
|
high *= 2.0; |
|
1354
|
14
|
50
|
|
|
|
|
if (high > 1e100) break; /* Fallback limit */ |
|
1355
|
|
|
|
|
|
|
} |
|
1356
|
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
// Bisect to find the root |
|
1358
|
251
|
50
|
|
|
|
|
for (unsigned short int i = 0; i < 150; i++) { |
|
1359
|
251
|
|
|
|
|
|
double mid = low + (high - low) / 2.0; |
|
1360
|
251
|
|
|
|
|
|
double p_mid = pf(mid, df1, df2); |
|
1361
|
|
|
|
|
|
|
|
|
1362
|
251
|
100
|
|
|
|
|
if (p_mid < p) { |
|
1363
|
122
|
|
|
|
|
|
low = mid; |
|
1364
|
|
|
|
|
|
|
} else { |
|
1365
|
129
|
|
|
|
|
|
high = mid; |
|
1366
|
|
|
|
|
|
|
} |
|
1367
|
251
|
100
|
|
|
|
|
if (high - low < 1e-12) break; |
|
1368
|
|
|
|
|
|
|
} |
|
1369
|
6
|
|
|
|
|
|
return (low + high) / 2.0; |
|
1370
|
|
|
|
|
|
|
} |
|
1371
|
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
typedef struct { |
|
1373
|
|
|
|
|
|
|
double statistic; |
|
1374
|
|
|
|
|
|
|
double num_df; |
|
1375
|
|
|
|
|
|
|
double denom_df; |
|
1376
|
|
|
|
|
|
|
double p_value; |
|
1377
|
|
|
|
|
|
|
double ss_between; /* between-group sum of squares */ |
|
1378
|
|
|
|
|
|
|
double ss_within; /* within-group sum of squares */ |
|
1379
|
|
|
|
|
|
|
double ms_between; /* ss_between / num_df */ |
|
1380
|
|
|
|
|
|
|
double ms_within; /* ss_within / denom_df */ |
|
1381
|
|
|
|
|
|
|
int k; /* number of groups */ |
|
1382
|
|
|
|
|
|
|
IV n; /* total observations */ |
|
1383
|
|
|
|
|
|
|
bool var_equal; /* 0 = Welch, 1 = classic */ |
|
1384
|
|
|
|
|
|
|
} OneWayResult; |
|
1385
|
|
|
|
|
|
|
|
|
1386
|
|
|
|
|
|
|
static OneWayResult |
|
1387
|
3
|
|
|
|
|
|
c_oneway_test(const double *restrict data, const size_t *restrict sizes, |
|
1388
|
|
|
|
|
|
|
size_t k, bool var_equal) |
|
1389
|
|
|
|
|
|
|
{ |
|
1390
|
|
|
|
|
|
|
OneWayResult res; |
|
1391
|
3
|
|
|
|
|
|
res.var_equal = var_equal; |
|
1392
|
3
|
|
|
|
|
|
res.k = (int)k; |
|
1393
|
|
|
|
|
|
|
|
|
1394
|
3
|
|
|
|
|
|
double *restrict n_i = (double *)safemalloc(k * sizeof(double)); |
|
1395
|
3
|
|
|
|
|
|
double *restrict m_i = (double *)safemalloc(k * sizeof(double)); |
|
1396
|
3
|
|
|
|
|
|
double *restrict v_i = (double *)safemalloc(k * sizeof(double)); |
|
1397
|
3
|
|
|
|
|
|
size_t offset = 0; |
|
1398
|
3
|
|
|
|
|
|
IV total_n = 0; |
|
1399
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
1400
|
6
|
|
|
|
|
|
size_t ng = sizes[g]; |
|
1401
|
6
|
|
|
|
|
|
n_i[g] = (double)ng; |
|
1402
|
6
|
|
|
|
|
|
total_n += (IV)ng; |
|
1403
|
6
|
|
|
|
|
|
double sum = 0.0; |
|
1404
|
36
|
100
|
|
|
|
|
for (size_t i = 0; i < ng; i++) sum += data[offset + i]; |
|
1405
|
6
|
|
|
|
|
|
double mean = sum / (double)ng; |
|
1406
|
6
|
|
|
|
|
|
m_i[g] = mean; |
|
1407
|
|
|
|
|
|
|
|
|
1408
|
6
|
|
|
|
|
|
double ss = 0.0; |
|
1409
|
36
|
100
|
|
|
|
|
for (size_t i = 0; i < ng; i++) { |
|
1410
|
30
|
|
|
|
|
|
double d = data[offset + i] - mean; |
|
1411
|
30
|
|
|
|
|
|
ss += d * d; |
|
1412
|
|
|
|
|
|
|
} |
|
1413
|
6
|
|
|
|
|
|
v_i[g] = ss / (double)(ng - 1); /* ng >= 2 guaranteed by caller */ |
|
1414
|
6
|
|
|
|
|
|
offset += ng; |
|
1415
|
|
|
|
|
|
|
} |
|
1416
|
3
|
|
|
|
|
|
res.n = total_n; |
|
1417
|
|
|
|
|
|
|
// grand mean (simple average over all obs; used only by classic branch)/ |
|
1418
|
3
|
|
|
|
|
|
double grand_mean = 0.0; |
|
1419
|
33
|
100
|
|
|
|
|
for (IV i = 0; i < (IV)total_n; i++) grand_mean += data[i]; |
|
1420
|
3
|
|
|
|
|
|
grand_mean /= (double)total_n; |
|
1421
|
|
|
|
|
|
|
|
|
1422
|
3
|
|
|
|
|
|
double df1 = (double)(k - 1); |
|
1423
|
|
|
|
|
|
|
|
|
1424
|
3
|
50
|
|
|
|
|
if (var_equal) {/* ── Classic one-way ANOVA |
|
1425
|
|
|
|
|
|
|
* F = [Σ n_i·(m_i − ȳ)² / (k−1)] / [Σ (n_i−1)·v_i / (n−k)] */ |
|
1426
|
0
|
|
|
|
|
|
double ssbg = 0.0, sswg = 0.0; |
|
1427
|
0
|
0
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
1428
|
0
|
|
|
|
|
|
double dm = m_i[g] - grand_mean; |
|
1429
|
0
|
|
|
|
|
|
ssbg += n_i[g] * dm * dm; |
|
1430
|
0
|
|
|
|
|
|
sswg += (n_i[g] - 1.0) * v_i[g]; |
|
1431
|
|
|
|
|
|
|
} |
|
1432
|
0
|
|
|
|
|
|
double df2 = (double)(total_n - (IV)k); |
|
1433
|
0
|
|
|
|
|
|
res.statistic = (ssbg / df1) / (sswg / df2); |
|
1434
|
0
|
|
|
|
|
|
res.num_df = df1; |
|
1435
|
0
|
|
|
|
|
|
res.denom_df = df2; |
|
1436
|
0
|
|
|
|
|
|
res.ss_between = ssbg; |
|
1437
|
0
|
|
|
|
|
|
res.ss_within = sswg; |
|
1438
|
0
|
|
|
|
|
|
res.ms_between = ssbg / df1; |
|
1439
|
0
|
|
|
|
|
|
res.ms_within = sswg / df2; |
|
1440
|
|
|
|
|
|
|
} else {// ── Welch one-way (heteroscedastic) |
|
1441
|
3
|
|
|
|
|
|
double *restrict w_i = (double *)safemalloc(k * sizeof(double)); |
|
1442
|
3
|
|
|
|
|
|
double sum_w = 0.0; |
|
1443
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { w_i[g] = n_i[g] / v_i[g]; sum_w += w_i[g]; } |
|
1444
|
3
|
|
|
|
|
|
double wgrand = 0.0; |
|
1445
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) wgrand += w_i[g] * m_i[g]; |
|
1446
|
3
|
|
|
|
|
|
wgrand /= sum_w; |
|
1447
|
3
|
|
|
|
|
|
double tmp = 0.0; |
|
1448
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
1449
|
6
|
|
|
|
|
|
double t = 1.0 - w_i[g] / sum_w; |
|
1450
|
6
|
|
|
|
|
|
tmp += (t * t) / (n_i[g] - 1.0); |
|
1451
|
|
|
|
|
|
|
} |
|
1452
|
3
|
|
|
|
|
|
tmp /= ((double)k * (double)k - 1.0); /* k² − 1 */ |
|
1453
|
3
|
|
|
|
|
|
double num = 0.0; |
|
1454
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
1455
|
6
|
|
|
|
|
|
double dm = m_i[g] - wgrand; |
|
1456
|
6
|
|
|
|
|
|
num += w_i[g] * dm * dm; |
|
1457
|
|
|
|
|
|
|
} |
|
1458
|
3
|
|
|
|
|
|
res.statistic = num / (df1 * (1.0 + 2.0 * (double)(k - 2) * tmp)); |
|
1459
|
3
|
|
|
|
|
|
res.num_df = df1; |
|
1460
|
3
|
50
|
|
|
|
|
res.denom_df = (tmp > 0.0) ? (1.0 / (3.0 * tmp)) : 1e300; |
|
1461
|
|
|
|
|
|
|
/* unweighted SS for the output table */ |
|
1462
|
3
|
|
|
|
|
|
double ssbg = 0.0, sswg = 0.0; |
|
1463
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
1464
|
6
|
|
|
|
|
|
double dm = m_i[g] - grand_mean; |
|
1465
|
6
|
|
|
|
|
|
ssbg += n_i[g] * dm * dm; |
|
1466
|
6
|
|
|
|
|
|
sswg += (n_i[g] - 1.0) * v_i[g]; |
|
1467
|
|
|
|
|
|
|
} |
|
1468
|
3
|
|
|
|
|
|
res.ss_between = ssbg; |
|
1469
|
3
|
|
|
|
|
|
res.ss_within = sswg; |
|
1470
|
3
|
50
|
|
|
|
|
res.ms_between = (df1 > 0.0) ? ssbg / df1 : 0.0; |
|
1471
|
3
|
50
|
|
|
|
|
res.ms_within = (res.denom_df > 0.0) ? sswg / res.denom_df : 0.0; |
|
1472
|
3
|
|
|
|
|
|
Safefree(w_i); |
|
1473
|
|
|
|
|
|
|
} |
|
1474
|
|
|
|
|
|
|
// upper-tail p-value P(F ≥ statistic) |
|
1475
|
3
|
|
|
|
|
|
res.p_value = 1 - pf(res.statistic, res.num_df, res.denom_df); |
|
1476
|
3
|
|
|
|
|
|
Safefree(n_i); Safefree(m_i); Safefree(v_i); |
|
1477
|
3
|
|
|
|
|
|
return res; |
|
1478
|
|
|
|
|
|
|
} |
|
1479
|
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
/* ── parse_formula |
|
1481
|
|
|
|
|
|
|
* |
|
1482
|
|
|
|
|
|
|
* Splits "response ~ factor" into two NUL-terminated, heap-allocated |
|
1483
|
|
|
|
|
|
|
* strings. Leading/trailing whitespace is stripped from each side. |
|
1484
|
|
|
|
|
|
|
* Returns 1 on success, 0 on failure (malformed / missing '~'). |
|
1485
|
|
|
|
|
|
|
* Caller must Safefree() both *lhs and *rhs on success. */ |
|
1486
|
|
|
|
|
|
|
static int |
|
1487
|
4
|
|
|
|
|
|
parse_formula(const char *formula, char **lhs, char **rhs) |
|
1488
|
|
|
|
|
|
|
{ |
|
1489
|
4
|
|
|
|
|
|
const char *restrict tilde = strchr(formula, '~'); |
|
1490
|
4
|
100
|
|
|
|
|
if (!tilde) return 0; |
|
1491
|
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
// left-hand side: trim trailing whitespace |
|
1493
|
3
|
|
|
|
|
|
const char *l_start = formula; |
|
1494
|
3
|
|
|
|
|
|
const char *l_end = tilde - 1; |
|
1495
|
6
|
50
|
|
|
|
|
while (l_end >= l_start && isspace((unsigned char)*l_end)) l_end--; |
|
|
|
100
|
|
|
|
|
|
|
1496
|
3
|
50
|
|
|
|
|
if (l_end < l_start) return 0; /* empty LHS */ |
|
1497
|
|
|
|
|
|
|
|
|
1498
|
|
|
|
|
|
|
// right-hand side: trim leading whitespace */ |
|
1499
|
3
|
|
|
|
|
|
const char *restrict r_start = tilde + 1; |
|
1500
|
6
|
50
|
|
|
|
|
while (*r_start && isspace((unsigned char)*r_start)) r_start++; |
|
|
|
100
|
|
|
|
|
|
|
1501
|
3
|
|
|
|
|
|
const char *restrict r_end = r_start + strlen(r_start) - 1; |
|
1502
|
3
|
50
|
|
|
|
|
while (r_end >= r_start && isspace((unsigned char)*r_end)) r_end--; |
|
|
|
50
|
|
|
|
|
|
|
1503
|
3
|
50
|
|
|
|
|
if (r_end < r_start) return 0; /* empty RHS */ |
|
1504
|
|
|
|
|
|
|
|
|
1505
|
3
|
|
|
|
|
|
size_t llen = (size_t)(l_end - l_start + 1); |
|
1506
|
3
|
|
|
|
|
|
size_t rlen = (size_t)(r_end - r_start + 1); |
|
1507
|
|
|
|
|
|
|
|
|
1508
|
3
|
|
|
|
|
|
*lhs = (char *)safemalloc(llen + 1); |
|
1509
|
3
|
|
|
|
|
|
*rhs = (char *)safemalloc(rlen + 1); |
|
1510
|
3
|
|
|
|
|
|
memcpy(*lhs, l_start, llen); (*lhs)[llen] = '\0'; |
|
1511
|
3
|
|
|
|
|
|
memcpy(*rhs, r_start, rlen); (*rhs)[rlen] = '\0'; |
|
1512
|
3
|
|
|
|
|
|
return 1; |
|
1513
|
|
|
|
|
|
|
} |
|
1514
|
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
/* ── build_groups_from_formula ─────────────── |
|
1516
|
|
|
|
|
|
|
* |
|
1517
|
|
|
|
|
|
|
* Takes parallel response[] and label[] arrays (each length n) and |
|
1518
|
|
|
|
|
|
|
* partitions them into groups, filling: |
|
1519
|
|
|
|
|
|
|
* out_flat[] – observations sorted into contiguous group blocks |
|
1520
|
|
|
|
|
|
|
* out_sizes[] – number of observations per group (caller allocates n |
|
1521
|
|
|
|
|
|
|
* slots for both; actual group count returned via *out_k) |
|
1522
|
|
|
|
|
|
|
* out_names – if non-NULL, receives a heap-allocated char** of k |
|
1523
|
|
|
|
|
|
|
* group-name strings (caller must free each and the array) |
|
1524
|
|
|
|
|
|
|
* |
|
1525
|
|
|
|
|
|
|
* Group identity is the string representation of each label element |
|
1526
|
|
|
|
|
|
|
* (SvPV_nolen), so integer 0 and string "0" are the same group. |
|
1527
|
|
|
|
|
|
|
* Groups are ordered by first appearance in label[], matching R's |
|
1528
|
|
|
|
|
|
|
* factor level ordering from stack(). |
|
1529
|
|
|
|
|
|
|
* |
|
1530
|
|
|
|
|
|
|
* Returns 1 on success; 0 if any validation error (sets errbuf). |
|
1531
|
|
|
|
|
|
|
*/ |
|
1532
|
|
|
|
|
|
|
#define OWT_MAX_GROUPS 1024 /* sane ceiling; ANOVA with >1024 groups is absurd */ |
|
1533
|
|
|
|
|
|
|
|
|
1534
|
2
|
|
|
|
|
|
static int build_groups_from_formula(pTHX_ |
|
1535
|
|
|
|
|
|
|
AV *restrict response_av, |
|
1536
|
|
|
|
|
|
|
AV *restrict label_av, |
|
1537
|
|
|
|
|
|
|
double *restrict out_flat, |
|
1538
|
|
|
|
|
|
|
size_t *restrict out_sizes, |
|
1539
|
|
|
|
|
|
|
size_t *restrict out_k, |
|
1540
|
|
|
|
|
|
|
char ***restrict out_names, |
|
1541
|
|
|
|
|
|
|
char *restrict errbuf, |
|
1542
|
|
|
|
|
|
|
size_t errbuf_len) |
|
1543
|
|
|
|
|
|
|
{ |
|
1544
|
2
|
|
|
|
|
|
IV n = av_len(response_av) + 1; |
|
1545
|
2
|
|
|
|
|
|
IV nl = av_len(label_av) + 1; |
|
1546
|
|
|
|
|
|
|
|
|
1547
|
2
|
100
|
|
|
|
|
if (n != nl) { |
|
1548
|
1
|
|
|
|
|
|
snprintf(errbuf, errbuf_len, |
|
1549
|
|
|
|
|
|
|
"formula: response length (%"IVdf") != factor length (%"IVdf")", |
|
1550
|
|
|
|
|
|
|
n, nl); |
|
1551
|
1
|
|
|
|
|
|
return 0; |
|
1552
|
|
|
|
|
|
|
} |
|
1553
|
1
|
50
|
|
|
|
|
if (n < 2) { |
|
1554
|
0
|
|
|
|
|
|
snprintf(errbuf, errbuf_len, "formula: need at least 2 observations"); |
|
1555
|
0
|
|
|
|
|
|
return 0; |
|
1556
|
|
|
|
|
|
|
} |
|
1557
|
|
|
|
|
|
|
|
|
1558
|
|
|
|
|
|
|
/* ── discover unique group labels in order of first appearance ─── */ |
|
1559
|
|
|
|
|
|
|
/* We store pointers into a heap-allocated label string table. */ |
|
1560
|
1
|
|
|
|
|
|
char **restrict group_names = (char **)safemalloc(OWT_MAX_GROUPS * sizeof(char *)); |
|
1561
|
1
|
|
|
|
|
|
size_t ngroups = 0; |
|
1562
|
1
|
|
|
|
|
|
IV *restrict obs_group = (IV *)safemalloc((size_t)n * sizeof(IV)); |
|
1563
|
|
|
|
|
|
|
/* maps obs index → group index */ |
|
1564
|
|
|
|
|
|
|
|
|
1565
|
7
|
100
|
|
|
|
|
for (IV i = 0; i < n; i++) { |
|
1566
|
6
|
|
|
|
|
|
SV **restrict lsv = av_fetch(label_av, i, 0); |
|
1567
|
6
|
50
|
|
|
|
|
const char *restrict label = (lsv && *lsv) ? SvPV_nolen(*lsv) : ""; |
|
|
|
50
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
/* linear scan for existing group (k is small, O(n·k) is fine) */ |
|
1569
|
6
|
|
|
|
|
|
IV gidx = -1; |
|
1570
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < ngroups; g++) { |
|
1571
|
7
|
100
|
|
|
|
|
if (strEQ(group_names[g], label)) { gidx = (IV)g; break; } |
|
1572
|
|
|
|
|
|
|
} |
|
1573
|
6
|
100
|
|
|
|
|
if (gidx < 0) { |
|
1574
|
2
|
50
|
|
|
|
|
if (ngroups >= OWT_MAX_GROUPS) { |
|
1575
|
0
|
|
|
|
|
|
snprintf(errbuf, errbuf_len, |
|
1576
|
|
|
|
|
|
|
"formula: too many distinct groups (max %d)", OWT_MAX_GROUPS); |
|
1577
|
0
|
|
|
|
|
|
Safefree(group_names); |
|
1578
|
0
|
|
|
|
|
|
Safefree(obs_group); |
|
1579
|
0
|
|
|
|
|
|
return 0; |
|
1580
|
|
|
|
|
|
|
} |
|
1581
|
|
|
|
|
|
|
/* new group: copy the label string */ |
|
1582
|
2
|
|
|
|
|
|
size_t lablen = strlen(label); |
|
1583
|
2
|
|
|
|
|
|
group_names[ngroups] = (char *)safemalloc(lablen + 1); |
|
1584
|
2
|
|
|
|
|
|
memcpy(group_names[ngroups], label, lablen + 1); |
|
1585
|
2
|
|
|
|
|
|
gidx = (IV)ngroups++; |
|
1586
|
|
|
|
|
|
|
} |
|
1587
|
6
|
|
|
|
|
|
obs_group[i] = gidx; |
|
1588
|
|
|
|
|
|
|
} |
|
1589
|
|
|
|
|
|
|
|
|
1590
|
1
|
50
|
|
|
|
|
if (ngroups < 2) { |
|
1591
|
0
|
|
|
|
|
|
snprintf(errbuf, errbuf_len, |
|
1592
|
|
|
|
|
|
|
"formula: need at least 2 distinct groups, found %zu", ngroups); |
|
1593
|
0
|
0
|
|
|
|
|
for (size_t g = 0; g < ngroups; g++) Safefree(group_names[g]); |
|
1594
|
0
|
|
|
|
|
|
Safefree(group_names); Safefree(obs_group); |
|
1595
|
0
|
|
|
|
|
|
return 0; |
|
1596
|
|
|
|
|
|
|
} |
|
1597
|
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
/* count per-group sizes */ |
|
1599
|
1
|
|
|
|
|
|
memset(out_sizes, 0, ngroups * sizeof(size_t)); |
|
1600
|
7
|
100
|
|
|
|
|
for (unsigned i = 0; i < n; i++) out_sizes[obs_group[i]]++; |
|
1601
|
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
/* validate: every group needs >= 2 observations */ |
|
1603
|
3
|
100
|
|
|
|
|
for (size_t g = 0; g < ngroups; g++) { |
|
1604
|
2
|
50
|
|
|
|
|
if (out_sizes[g] < 2) { |
|
1605
|
0
|
|
|
|
|
|
snprintf(errbuf, errbuf_len, |
|
1606
|
|
|
|
|
|
|
"formula: group '%s' has only %zu observation(s); need >= 2", |
|
1607
|
0
|
|
|
|
|
|
group_names[g], out_sizes[g]); |
|
1608
|
0
|
0
|
|
|
|
|
for (size_t gg = 0; gg < ngroups; gg++) Safefree(group_names[gg]); |
|
1609
|
0
|
|
|
|
|
|
Safefree(group_names); Safefree(obs_group); |
|
1610
|
0
|
|
|
|
|
|
return 0; |
|
1611
|
|
|
|
|
|
|
} |
|
1612
|
|
|
|
|
|
|
} |
|
1613
|
|
|
|
|
|
|
/* ── fill flat output array in group order ─────────────────────── * |
|
1614
|
|
|
|
|
|
|
* We compute a running write-offset per group, then scatter*/ |
|
1615
|
1
|
|
|
|
|
|
size_t *restrict write_pos = (size_t *)safemalloc(ngroups * sizeof(size_t)); |
|
1616
|
1
|
|
|
|
|
|
write_pos[0] = 0; |
|
1617
|
2
|
100
|
|
|
|
|
for (size_t g = 1; g < ngroups; g++) |
|
1618
|
1
|
|
|
|
|
|
write_pos[g] = write_pos[g - 1] + out_sizes[g - 1]; |
|
1619
|
7
|
100
|
|
|
|
|
for (IV i = 0; i < n; i++) { |
|
1620
|
6
|
|
|
|
|
|
SV **restrict rsv = av_fetch(response_av, i, 0); |
|
1621
|
6
|
50
|
|
|
|
|
double val = (rsv && *rsv) ? SvNV(*rsv) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
1622
|
6
|
|
|
|
|
|
size_t g = (size_t)obs_group[i]; |
|
1623
|
6
|
|
|
|
|
|
out_flat[write_pos[g]++] = val; |
|
1624
|
|
|
|
|
|
|
} |
|
1625
|
|
|
|
|
|
|
|
|
1626
|
1
|
|
|
|
|
|
*out_k = ngroups; |
|
1627
|
|
|
|
|
|
|
|
|
1628
|
|
|
|
|
|
|
/* ── clean up or hand off group names */ |
|
1629
|
1
|
|
|
|
|
|
Safefree(write_pos); Safefree(obs_group); |
|
1630
|
1
|
50
|
|
|
|
|
if (out_names) { |
|
1631
|
1
|
|
|
|
|
|
*out_names = group_names; /* caller takes ownership */ |
|
1632
|
|
|
|
|
|
|
} else { |
|
1633
|
0
|
0
|
|
|
|
|
for (size_t g = 0; g < ngroups; g++) Safefree(group_names[g]); |
|
1634
|
0
|
|
|
|
|
|
Safefree(group_names); |
|
1635
|
|
|
|
|
|
|
} |
|
1636
|
1
|
|
|
|
|
|
return 1; |
|
1637
|
|
|
|
|
|
|
} |
|
1638
|
|
|
|
|
|
|
#undef OWT_MAX_GROUPS |
|
1639
|
|
|
|
|
|
|
// --- Math Macros --- |
|
1640
|
|
|
|
|
|
|
#ifndef M_LN_SQRT_2PI |
|
1641
|
|
|
|
|
|
|
#define M_LN_SQRT_2PI 0.91893853320467274178 |
|
1642
|
|
|
|
|
|
|
#endif |
|
1643
|
|
|
|
|
|
|
#ifndef M_LN2 |
|
1644
|
|
|
|
|
|
|
#define M_LN2 0.69314718055994530941 |
|
1645
|
|
|
|
|
|
|
#endif |
|
1646
|
|
|
|
|
|
|
#ifndef M_1_SQRT_2PI |
|
1647
|
|
|
|
|
|
|
#define M_1_SQRT_2PI 0.39894228040143267794 |
|
1648
|
|
|
|
|
|
|
#endif |
|
1649
|
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
/* c_dnorm: Normal distribution PDF |
|
1651
|
|
|
|
|
|
|
* |
|
1652
|
|
|
|
|
|
|
* Mathematically identical to R's dnorm4. |
|
1653
|
|
|
|
|
|
|
* Includes Morten Welinder's precision improvements for extreme tails. |
|
1654
|
|
|
|
|
|
|
* ----------------------------------------------------------------------- */ |
|
1655
|
25
|
|
|
|
|
|
static double c_dnorm(double x, double mu, double sigma, int give_log) { |
|
1656
|
|
|
|
|
|
|
// Propagate NaNs |
|
1657
|
25
|
50
|
|
|
|
|
if (isnan(x) || isnan(mu) || isnan(sigma)) return x + mu + sigma; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1658
|
25
|
50
|
|
|
|
|
if (sigma < 0.0) { |
|
1659
|
0
|
|
|
|
|
|
warn("dnorm: standard deviation must be non-negative"); |
|
1660
|
0
|
|
|
|
|
|
return NAN; |
|
1661
|
|
|
|
|
|
|
} |
|
1662
|
25
|
50
|
|
|
|
|
if (isinf(sigma)) return 0.0; |
|
1663
|
25
|
50
|
|
|
|
|
if ((isnan(x) || isinf(x)) && mu == x) return NAN; // x-mu is NaN |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
// Dirac delta behavior for zero variance |
|
1665
|
25
|
50
|
|
|
|
|
if (sigma == 0.0) return (x == mu) ? INFINITY : 0.0; |
|
|
|
0
|
|
|
|
|
|
|
1666
|
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
// Standardize x |
|
1668
|
25
|
|
|
|
|
|
x = (x - mu) / sigma; |
|
1669
|
25
|
50
|
|
|
|
|
if (isnan(x) || isinf(x)) return 0.0; |
|
|
|
50
|
|
|
|
|
|
|
1670
|
25
|
|
|
|
|
|
x = fabs(x); |
|
1671
|
|
|
|
|
|
|
// Catch massive limits early to prevent math overflow |
|
1672
|
25
|
50
|
|
|
|
|
if (x >= 2.0 * sqrt(DBL_MAX)) return 0.0; |
|
1673
|
25
|
100
|
|
|
|
|
if (give_log) { |
|
1674
|
1
|
|
|
|
|
|
return -(M_LN_SQRT_2PI + 0.5 * x * x + log(sigma)); |
|
1675
|
|
|
|
|
|
|
} |
|
1676
|
|
|
|
|
|
|
// Naive formula for standard bodies |
|
1677
|
24
|
100
|
|
|
|
|
if (x < 5.0) { |
|
1678
|
22
|
|
|
|
|
|
return M_1_SQRT_2PI * exp(-0.5 * x * x) / sigma; |
|
1679
|
|
|
|
|
|
|
} |
|
1680
|
|
|
|
|
|
|
// Underflow boundary check using IEEE float characteristics |
|
1681
|
2
|
50
|
|
|
|
|
if (x > sqrt(-2.0 * M_LN2 * (DBL_MIN_EXP + 1.0 - DBL_MANT_DIG))) { |
|
1682
|
0
|
|
|
|
|
|
return 0.0; |
|
1683
|
|
|
|
|
|
|
} |
|
1684
|
|
|
|
|
|
|
/* Splitting x to dodge floating point inaccuracies in x^2 for large x. |
|
1685
|
|
|
|
|
|
|
* x = x1 + x2, where |x2| <= 2^-16 |
|
1686
|
|
|
|
|
|
|
* trunc() safely substitutes R_forceint() */ |
|
1687
|
2
|
|
|
|
|
|
double x1 = ldexp(trunc(ldexp(x, 16)), -16); |
|
1688
|
2
|
|
|
|
|
|
double x2 = x - x1; |
|
1689
|
2
|
|
|
|
|
|
return (M_1_SQRT_2PI / sigma) * (exp(-0.5 * x1 * x1) * exp((-0.5 * x2 - x1) * x2)); |
|
1690
|
|
|
|
|
|
|
} |
|
1691
|
|
|
|
|
|
|
/*Helper for prcomp: Jacobi Eigenvalue Algorithm for Symmetric Matrices |
|
1692
|
|
|
|
|
|
|
* Used to compute the eigendecomposition of the X^T X covariance matrix.*/ |
|
1693
|
7
|
|
|
|
|
|
static void jacobi_eigen(NV *restrict A, size_t n, NV *restrict d, NV *restrict v) { |
|
1694
|
21
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
1695
|
42
|
100
|
|
|
|
|
for (size_t j = 0; j < n; j++) v[i * n + j] = (i == j) ? 1.0 : 0.0; |
|
|
|
100
|
|
|
|
|
|
|
1696
|
14
|
|
|
|
|
|
d[i] = A[i * n + i]; |
|
1697
|
|
|
|
|
|
|
} |
|
1698
|
7
|
|
|
|
|
|
NV *restrict b = (NV*)safemalloc(n * sizeof(NV)); |
|
1699
|
7
|
|
|
|
|
|
NV *restrict z = (NV*)safemalloc(n * sizeof(NV)); |
|
1700
|
21
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { b[i] = d[i]; z[i] = 0.0; } |
|
1701
|
14
|
50
|
|
|
|
|
for (int iter = 1; iter <= 50; iter++) { |
|
1702
|
14
|
|
|
|
|
|
NV sm = 0.0; |
|
1703
|
28
|
100
|
|
|
|
|
for (size_t i = 0; i < n - 1; i++) { |
|
1704
|
28
|
100
|
|
|
|
|
for (size_t j = i + 1; j < n; j++) sm += fabs(A[i * n + j]); |
|
1705
|
|
|
|
|
|
|
} |
|
1706
|
14
|
100
|
|
|
|
|
if (sm == 0.0) break; |
|
1707
|
7
|
50
|
|
|
|
|
NV tresh = (iter < 4) ? 0.2 * sm / (n * n) : 0.0; |
|
1708
|
14
|
100
|
|
|
|
|
for (size_t i = 0; i < n - 1; i++) { |
|
1709
|
14
|
100
|
|
|
|
|
for (size_t j = i + 1; j < n; j++) { |
|
1710
|
7
|
|
|
|
|
|
NV g = 100.0 * fabs(A[i * n + j]); |
|
1711
|
7
|
50
|
|
|
|
|
if (iter > 4 && fabs(d[i]) + g == fabs(d[i]) && fabs(d[j]) + g == fabs(d[j])) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1712
|
0
|
|
|
|
|
|
A[i * n + j] = 0.0; |
|
1713
|
7
|
50
|
|
|
|
|
} else if (fabs(A[i * n + j]) > tresh) { |
|
1714
|
7
|
|
|
|
|
|
NV h = d[j] - d[i]; |
|
1715
|
|
|
|
|
|
|
NV t; |
|
1716
|
7
|
50
|
|
|
|
|
if (fabs(h) + g == fabs(h)) { |
|
1717
|
0
|
|
|
|
|
|
t = A[i * n + j] / h; |
|
1718
|
|
|
|
|
|
|
} else { |
|
1719
|
7
|
|
|
|
|
|
NV theta = 0.5 * h / A[i * n + j]; |
|
1720
|
7
|
|
|
|
|
|
t = 1.0 / (fabs(theta) + sqrt(1.0 + theta * theta)); |
|
1721
|
7
|
100
|
|
|
|
|
if (theta < 0.0) t = -t; |
|
1722
|
|
|
|
|
|
|
} |
|
1723
|
7
|
|
|
|
|
|
NV c = 1.0 / sqrt(1.0 + t * t); |
|
1724
|
7
|
|
|
|
|
|
NV s = t * c; |
|
1725
|
7
|
|
|
|
|
|
NV tau = s / (1.0 + c); |
|
1726
|
7
|
|
|
|
|
|
NV h_t = t * A[i * n + j]; |
|
1727
|
7
|
|
|
|
|
|
z[i] -= h_t; |
|
1728
|
7
|
|
|
|
|
|
z[j] += h_t; |
|
1729
|
7
|
|
|
|
|
|
d[i] -= h_t; |
|
1730
|
7
|
|
|
|
|
|
d[j] += h_t; |
|
1731
|
7
|
|
|
|
|
|
A[i * n + j] = 0.0; |
|
1732
|
7
|
50
|
|
|
|
|
for (size_t k = 0; k < i; k++) { |
|
1733
|
0
|
|
|
|
|
|
g = A[k * n + i]; NV h_val = A[k * n + j]; |
|
1734
|
0
|
|
|
|
|
|
A[k * n + i] = g - s * (h_val + g * tau); |
|
1735
|
0
|
|
|
|
|
|
A[k * n + j] = h_val + s * (g - h_val * tau); |
|
1736
|
|
|
|
|
|
|
} |
|
1737
|
7
|
50
|
|
|
|
|
for (size_t k = i + 1; k < j; k++) { |
|
1738
|
0
|
|
|
|
|
|
g = A[i * n + k]; NV h_val = A[k * n + j]; |
|
1739
|
0
|
|
|
|
|
|
A[i * n + k] = g - s * (h_val + g * tau); |
|
1740
|
0
|
|
|
|
|
|
A[k * n + j] = h_val + s * (g - h_val * tau); |
|
1741
|
|
|
|
|
|
|
} |
|
1742
|
7
|
50
|
|
|
|
|
for (size_t k = j + 1; k < n; k++) { |
|
1743
|
0
|
|
|
|
|
|
g = A[i * n + k]; NV h_val = A[j * n + k]; |
|
1744
|
0
|
|
|
|
|
|
A[i * n + k] = g - s * (h_val + g * tau); |
|
1745
|
0
|
|
|
|
|
|
A[j * n + k] = h_val + s * (g - h_val * tau); |
|
1746
|
|
|
|
|
|
|
} |
|
1747
|
21
|
100
|
|
|
|
|
for (size_t k = 0; k < n; k++) { |
|
1748
|
14
|
|
|
|
|
|
g = v[k * n + i]; NV h_val = v[k * n + j]; |
|
1749
|
14
|
|
|
|
|
|
v[k * n + i] = g - s * (h_val + g * tau); |
|
1750
|
14
|
|
|
|
|
|
v[k * n + j] = h_val + s * (g - h_val * tau); |
|
1751
|
|
|
|
|
|
|
} |
|
1752
|
|
|
|
|
|
|
} |
|
1753
|
|
|
|
|
|
|
} |
|
1754
|
|
|
|
|
|
|
} |
|
1755
|
21
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
1756
|
14
|
|
|
|
|
|
b[i] += z[i]; |
|
1757
|
14
|
|
|
|
|
|
d[i] = b[i]; |
|
1758
|
14
|
|
|
|
|
|
z[i] = 0.0; |
|
1759
|
|
|
|
|
|
|
} |
|
1760
|
|
|
|
|
|
|
} |
|
1761
|
7
|
|
|
|
|
|
Safefree(b); Safefree(z); |
|
1762
|
|
|
|
|
|
|
// Sort eigenvalues and corresponding eigenvectors in descending order |
|
1763
|
14
|
100
|
|
|
|
|
for (size_t i = 0; i < n - 1; i++) { |
|
1764
|
7
|
|
|
|
|
|
size_t max_k = i; |
|
1765
|
7
|
|
|
|
|
|
NV max_val = d[i]; |
|
1766
|
14
|
100
|
|
|
|
|
for (size_t j = i + 1; j < n; j++) { |
|
1767
|
7
|
100
|
|
|
|
|
if (d[j] > max_val) { |
|
1768
|
6
|
|
|
|
|
|
max_val = d[j]; |
|
1769
|
6
|
|
|
|
|
|
max_k = j; |
|
1770
|
|
|
|
|
|
|
} |
|
1771
|
|
|
|
|
|
|
} |
|
1772
|
7
|
100
|
|
|
|
|
if (max_k != i) { |
|
1773
|
6
|
|
|
|
|
|
d[max_k] = d[i]; |
|
1774
|
6
|
|
|
|
|
|
d[i] = max_val; |
|
1775
|
18
|
100
|
|
|
|
|
for (size_t k = 0; k < n; k++) { |
|
1776
|
12
|
|
|
|
|
|
NV tmp = v[k * n + i]; |
|
1777
|
12
|
|
|
|
|
|
v[k * n + i] = v[k * n + max_k]; |
|
1778
|
12
|
|
|
|
|
|
v[k * n + max_k] = tmp; |
|
1779
|
|
|
|
|
|
|
} |
|
1780
|
|
|
|
|
|
|
} |
|
1781
|
|
|
|
|
|
|
} |
|
1782
|
7
|
|
|
|
|
|
} |
|
1783
|
|
|
|
|
|
|
|
|
1784
|
|
|
|
|
|
|
// --- pull a numeric value out of an SV* slot |
|
1785
|
456
|
|
|
|
|
|
static int c2c_num(pTHX_ SV **restrict ep, NV *restrict out) { |
|
1786
|
456
|
50
|
|
|
|
|
if (ep && *ep && SvOK(*ep) && looks_like_number(*ep)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1787
|
427
|
|
|
|
|
|
*out = SvNV(*ep); |
|
1788
|
427
|
|
|
|
|
|
return 1; |
|
1789
|
|
|
|
|
|
|
} |
|
1790
|
29
|
|
|
|
|
|
return 0; |
|
1791
|
|
|
|
|
|
|
} |
|
1792
|
|
|
|
|
|
|
|
|
1793
|
5
|
|
|
|
|
|
static SV* c2c_call(pTHX_ SV *restrict cv, SV *restrict rv1, SV *restrict rv2) { |
|
1794
|
5
|
|
|
|
|
|
dSP; |
|
1795
|
5
|
|
|
|
|
|
ENTER; |
|
1796
|
5
|
|
|
|
|
|
SAVETMPS; |
|
1797
|
5
|
50
|
|
|
|
|
PUSHMARK(SP); |
|
1798
|
5
|
50
|
|
|
|
|
EXTEND(SP, 2); |
|
1799
|
5
|
|
|
|
|
|
PUSHs(rv1); |
|
1800
|
5
|
|
|
|
|
|
PUSHs(rv2); |
|
1801
|
5
|
|
|
|
|
|
PUTBACK; |
|
1802
|
5
|
|
|
|
|
|
unsigned int count = call_sv(cv, G_SCALAR); |
|
1803
|
4
|
|
|
|
|
|
SPAGAIN; |
|
1804
|
4
|
50
|
|
|
|
|
SV *restrict ret = (count > 0) ? newSVsv(POPs) : newSV(0); |
|
1805
|
4
|
|
|
|
|
|
PUTBACK; |
|
1806
|
4
|
50
|
|
|
|
|
FREETMPS; |
|
1807
|
4
|
|
|
|
|
|
LEAVE; |
|
1808
|
4
|
|
|
|
|
|
return ret; |
|
1809
|
|
|
|
|
|
|
} |
|
1810
|
|
|
|
|
|
|
// Mark col_names[idx] whose name equals (wname,wl) as an outer column; returns |
|
1811
|
|
|
|
|
|
|
// 1 if a matching column was found, 0 otherwise. |
|
1812
|
7
|
|
|
|
|
|
static int c2c_mark(SV **col_names, STRLEN *name_len, size_t ncols, const char *wname, STRLEN wl, char *is_outer) { |
|
1813
|
16
|
100
|
|
|
|
|
for (size_t cc = 0; cc < ncols; cc++) { |
|
1814
|
14
|
100
|
|
|
|
|
if (name_len[cc] == wl && memEQ(SvPVX(col_names[cc]), wname, wl)) { is_outer[cc] = 1; return 1; } |
|
|
|
100
|
|
|
|
|
|
|
1815
|
|
|
|
|
|
|
} |
|
1816
|
2
|
|
|
|
|
|
return 0; |
|
1817
|
|
|
|
|
|
|
} |
|
1818
|
|
|
|
|
|
|
// |
|
1819
|
|
|
|
|
|
|
// filter() helpers — place this block in the C section, ABOVE the MODULE line |
|
1820
|
|
|
|
|
|
|
// |
|
1821
|
|
|
|
|
|
|
// Resolve the cell SV for a column in the "current row". |
|
1822
|
|
|
|
|
|
|
// AoH: current row is row_hv -> hv_fetch(row_hv, col) |
|
1823
|
|
|
|
|
|
|
// HoA: current row is index idx -> hv_fetch(data_hv,col) -> AV -> av_fetch(idx) |
|
1824
|
|
|
|
|
|
|
typedef struct { |
|
1825
|
|
|
|
|
|
|
int is_aoh; |
|
1826
|
|
|
|
|
|
|
HV *restrict row_hv; |
|
1827
|
|
|
|
|
|
|
HV *restrict data_hv; |
|
1828
|
|
|
|
|
|
|
SSize_t idx; |
|
1829
|
|
|
|
|
|
|
} filt_ctx; |
|
1830
|
85
|
|
|
|
|
|
static SV* filt_cell(pTHX_ filt_ctx *restrict ctx, const char *restrict col, STRLEN clen) { |
|
1831
|
85
|
100
|
|
|
|
|
if (ctx->is_aoh) { |
|
1832
|
70
|
|
|
|
|
|
SV **restrict p = hv_fetch(ctx->row_hv, col, clen, 0); |
|
1833
|
70
|
100
|
|
|
|
|
return (p && *p) ? *p : NULL; |
|
|
|
50
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
|
} |
|
1835
|
15
|
|
|
|
|
|
SV **restrict cp = hv_fetch(ctx->data_hv, col, clen, 0); |
|
1836
|
15
|
50
|
|
|
|
|
if (!cp || !*cp || !SvROK(*cp) || SvTYPE(SvRV(*cp)) != SVt_PVAV) return NULL; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1837
|
15
|
|
|
|
|
|
SV **restrict vp = av_fetch((AV*)SvRV(*cp), ctx->idx, 0); |
|
1838
|
15
|
50
|
|
|
|
|
return (vp && *vp) ? *vp : NULL; |
|
|
|
50
|
|
|
|
|
|
|
1839
|
|
|
|
|
|
|
} |
|
1840
|
|
|
|
|
|
|
// Recursively interpret a Stats::LikeR::Pred tree against the current row. |
|
1841
|
101
|
|
|
|
|
|
static bool filt_eval(pTHX_ SV *restrict pred, filt_ctx *restrict ctx) { |
|
1842
|
101
|
50
|
|
|
|
|
if (!pred || !SvROK(pred) || SvTYPE(SvRV(pred)) != SVt_PVHV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1843
|
0
|
|
|
|
|
|
croak("filter: malformed predicate (expected an object built with col())"); |
|
1844
|
101
|
|
|
|
|
|
HV *restrict h = (HV*)SvRV(pred); |
|
1845
|
101
|
|
|
|
|
|
SV **restrict opp = hv_fetchs(h, "op", 0); |
|
1846
|
101
|
50
|
|
|
|
|
if (!opp || !*opp) croak("filter: predicate node missing 'op'"); |
|
|
|
50
|
|
|
|
|
|
|
1847
|
101
|
|
|
|
|
|
const char *restrict op = SvPV_nolen(*opp); |
|
1848
|
101
|
100
|
|
|
|
|
if (strEQ(op, "and") || strEQ(op, "or")) { |
|
|
|
100
|
|
|
|
|
|
|
1849
|
12
|
|
|
|
|
|
SV **restrict lp = hv_fetchs(h, "l", 0); |
|
1850
|
12
|
|
|
|
|
|
SV **restrict rp = hv_fetchs(h, "r", 0); |
|
1851
|
12
|
50
|
|
|
|
|
bool L = filt_eval(aTHX_ (lp ? *lp : NULL), ctx); |
|
1852
|
12
|
100
|
|
|
|
|
if (op[0] == 'a') return L ? filt_eval(aTHX_ (rp ? *rp : NULL), ctx) : 0; // and |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1853
|
4
|
100
|
|
|
|
|
return L ? 1 : filt_eval(aTHX_ (rp ? *rp : NULL), ctx); // or |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
} |
|
1855
|
89
|
100
|
|
|
|
|
if (strEQ(op, "not")) { |
|
1856
|
4
|
|
|
|
|
|
SV **restrict lp = hv_fetchs(h, "l", 0); |
|
1857
|
4
|
50
|
|
|
|
|
return !filt_eval(aTHX_ (lp ? *lp : NULL), ctx); |
|
1858
|
|
|
|
|
|
|
} |
|
1859
|
85
|
|
|
|
|
|
SV **restrict cp = hv_fetchs(h, "col", 0); |
|
1860
|
85
|
|
|
|
|
|
SV **restrict vp = hv_fetchs(h, "val", 0); |
|
1861
|
85
|
50
|
|
|
|
|
if (!cp || !*cp) croak("filter: comparison node missing 'col'"); |
|
|
|
50
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
STRLEN clen; |
|
1863
|
85
|
|
|
|
|
|
const char *restrict col = SvPV(*cp, clen); |
|
1864
|
85
|
|
|
|
|
|
SV *restrict cell = filt_cell(aTHX_ ctx, col, clen); |
|
1865
|
85
|
100
|
|
|
|
|
if (!cell || !SvOK(cell)) return 0; // missing / undef cell never matches |
|
|
|
100
|
|
|
|
|
|
|
1866
|
83
|
50
|
|
|
|
|
SV *restrict val = (vp && *vp) ? *vp : &PL_sv_undef; |
|
|
|
50
|
|
|
|
|
|
|
1867
|
83
|
100
|
|
|
|
|
if (strEQ(op, ">")) return SvNV(cell) > SvNV(val); |
|
1868
|
45
|
100
|
|
|
|
|
if (strEQ(op, "<")) return SvNV(cell) < SvNV(val); |
|
1869
|
38
|
100
|
|
|
|
|
if (strEQ(op, ">=")) return SvNV(cell) >= SvNV(val); |
|
1870
|
34
|
100
|
|
|
|
|
if (strEQ(op, "<=")) return SvNV(cell) <= SvNV(val); |
|
1871
|
30
|
100
|
|
|
|
|
if (strEQ(op, "==")) return SvNV(cell) == SvNV(val); |
|
1872
|
19
|
100
|
|
|
|
|
if (strEQ(op, "!=")) return SvNV(cell) != SvNV(val); |
|
1873
|
|
|
|
|
|
|
{ |
|
1874
|
|
|
|
|
|
|
STRLEN al, bl; |
|
1875
|
15
|
|
|
|
|
|
const char *restrict a = SvPV(cell, al); |
|
1876
|
15
|
|
|
|
|
|
const char *restrict b = SvPV(val, bl); |
|
1877
|
15
|
|
|
|
|
|
STRLEN m = al < bl ? al : bl; |
|
1878
|
15
|
50
|
|
|
|
|
int c = m ? memcmp(a, b, m) : 0; |
|
1879
|
15
|
100
|
|
|
|
|
if (c == 0) c = (al > bl) - (al < bl); |
|
1880
|
23
|
100
|
|
|
|
|
if (strEQ(op, "eq")) return c == 0; |
|
1881
|
8
|
100
|
|
|
|
|
if (strEQ(op, "ne")) return c != 0; |
|
1882
|
4
|
50
|
|
|
|
|
if (strEQ(op, "lt")) return c < 0; |
|
1883
|
4
|
50
|
|
|
|
|
if (strEQ(op, "gt")) return c > 0; |
|
1884
|
0
|
0
|
|
|
|
|
if (strEQ(op, "le")) return c <= 0; |
|
1885
|
0
|
0
|
|
|
|
|
if (strEQ(op, "ge")) return c >= 0; |
|
1886
|
|
|
|
|
|
|
} |
|
1887
|
0
|
|
|
|
|
|
croak("filter: unknown operator '%s' in predicate", op); |
|
1888
|
|
|
|
|
|
|
return 0; // not reached |
|
1889
|
|
|
|
|
|
|
} |
|
1890
|
|
|
|
|
|
|
// Call a coderef predicate with $_ (and $_[0]) set to the row hashref. |
|
1891
|
12
|
|
|
|
|
|
static bool filt_call(pTHX_ SV *restrict cv, SV *restrict row) { |
|
1892
|
12
|
|
|
|
|
|
dSP; |
|
1893
|
|
|
|
|
|
|
bool keep; |
|
1894
|
|
|
|
|
|
|
int n; |
|
1895
|
12
|
|
|
|
|
|
ENTER; SAVETMPS; |
|
1896
|
12
|
|
|
|
|
|
SAVE_DEFSV; |
|
1897
|
12
|
|
|
|
|
|
DEFSV_set(row); |
|
1898
|
12
|
50
|
|
|
|
|
PUSHMARK(SP); |
|
1899
|
12
|
50
|
|
|
|
|
EXTEND(SP, 1); |
|
1900
|
12
|
|
|
|
|
|
PUSHs(row); |
|
1901
|
12
|
|
|
|
|
|
PUTBACK; |
|
1902
|
12
|
|
|
|
|
|
n = call_sv(cv, G_SCALAR); |
|
1903
|
12
|
|
|
|
|
|
SPAGAIN; |
|
1904
|
12
|
50
|
|
|
|
|
keep = (n > 0) ? (bool)SvTRUE(TOPs) : 0; |
|
|
|
100
|
|
|
|
|
|
|
1905
|
12
|
50
|
|
|
|
|
if (n > 0) (void)POPs; |
|
1906
|
12
|
|
|
|
|
|
PUTBACK; |
|
1907
|
12
|
50
|
|
|
|
|
FREETMPS; LEAVE; |
|
1908
|
12
|
|
|
|
|
|
return keep; |
|
1909
|
|
|
|
|
|
|
} |
|
1910
|
|
|
|
|
|
|
|
|
1911
|
12
|
|
|
|
|
|
static int h2h_keycmp(const void *pa, const void *pb) { |
|
1912
|
|
|
|
|
|
|
dTHX; |
|
1913
|
12
|
|
|
|
|
|
SV *restrict const *a = (SV * const *)pa; |
|
1914
|
12
|
|
|
|
|
|
SV *restrict const *b = (SV * const *)pb; |
|
1915
|
12
|
|
|
|
|
|
return sv_cmp(*a, *b); |
|
1916
|
|
|
|
|
|
|
} |
|
1917
|
2918
|
|
|
|
|
|
int compare_NVs(const void *restrict a, const void *restrict b) { |
|
1918
|
2918
|
|
|
|
|
|
NV arg1 = *(const NV *)a; |
|
1919
|
2918
|
|
|
|
|
|
NV arg2 = *(const NV *)b; |
|
1920
|
2918
|
100
|
|
|
|
|
if (arg1 < arg2) return -1; |
|
1921
|
887
|
50
|
|
|
|
|
if (arg1 > arg2) return 1; |
|
1922
|
0
|
|
|
|
|
|
return 0; |
|
1923
|
|
|
|
|
|
|
} |
|
1924
|
|
|
|
|
|
|
// Call a column predicate as $cv->($col_values, $col_name) and return its truth. |
|
1925
|
|
|
|
|
|
|
// $col_values is an array ref of the column's DEFINED cells; $col_name is the |
|
1926
|
|
|
|
|
|
|
// column key. Used so a block like sub { sd($_[0]) == 0 } can pick columns out. |
|
1927
|
39
|
|
|
|
|
|
static bool cf_pred(pTHX_ SV *cv_sv, AV *a_av, AV *b_av, SV *name_sv) { |
|
1928
|
39
|
|
|
|
|
|
dSP; |
|
1929
|
39
|
|
|
|
|
|
bool truth = FALSE; |
|
1930
|
|
|
|
|
|
|
int count; |
|
1931
|
39
|
|
|
|
|
|
ENTER; |
|
1932
|
39
|
|
|
|
|
|
SAVETMPS; |
|
1933
|
39
|
50
|
|
|
|
|
PUSHMARK(SP); |
|
1934
|
39
|
50
|
|
|
|
|
XPUSHs(sv_2mortal(newRV_inc((SV*)a_av))); |
|
1935
|
39
|
100
|
|
|
|
|
if (b_av) XPUSHs(sv_2mortal(newRV_inc((SV*)b_av))); |
|
|
|
50
|
|
|
|
|
|
|
1936
|
39
|
50
|
|
|
|
|
XPUSHs(sv_2mortal(newSVsv(name_sv))); |
|
1937
|
39
|
|
|
|
|
|
PUTBACK; |
|
1938
|
39
|
|
|
|
|
|
count = call_sv(cv_sv, G_SCALAR); |
|
1939
|
39
|
|
|
|
|
|
SPAGAIN; |
|
1940
|
39
|
50
|
|
|
|
|
if (count > 0) { |
|
1941
|
39
|
|
|
|
|
|
SV *restrict ret = POPs; // POPs has a side effect: pop exactly once, |
|
1942
|
39
|
|
|
|
|
|
truth = cBOOL(SvTRUE(ret)); // because SvTRUE() may evaluate its arg twice. |
|
1943
|
|
|
|
|
|
|
} |
|
1944
|
39
|
|
|
|
|
|
PUTBACK; |
|
1945
|
39
|
50
|
|
|
|
|
FREETMPS; |
|
1946
|
39
|
|
|
|
|
|
LEAVE; |
|
1947
|
39
|
|
|
|
|
|
return truth; |
|
1948
|
|
|
|
|
|
|
} |
|
1949
|
|
|
|
|
|
|
|
|
1950
|
|
|
|
|
|
|
// --- XS SECTION --- |
|
1951
|
|
|
|
|
|
|
MODULE = Stats::LikeR PACKAGE = Stats::LikeR |
|
1952
|
|
|
|
|
|
|
|
|
1953
|
|
|
|
|
|
|
SV *cfilter(data, ...) |
|
1954
|
|
|
|
|
|
|
SV *data |
|
1955
|
|
|
|
|
|
|
CODE: |
|
1956
|
|
|
|
|
|
|
{ |
|
1957
|
|
|
|
|
|
|
// 0. options. Exactly one of keep/remove is required; it is either an |
|
1958
|
|
|
|
|
|
|
// array ref of column names or a value predicate (CODE ref / function |
|
1959
|
|
|
|
|
|
|
// name). For a predicate, undef handling is: |
|
1960
|
|
|
|
|
|
|
// na => 'keep' (default) - the predicate sees every cell, incl undef |
|
1961
|
|
|
|
|
|
|
// na => 'omit' - single-column funcs (sd) get defined cells |
|
1962
|
|
|
|
|
|
|
// against => 'col' - two-column funcs (cor): the predicate gets |
|
1963
|
|
|
|
|
|
|
// ($col, $ref) over rows defined in BOTH. |
|
1964
|
32
|
|
|
|
|
|
SV *restrict keep_sv = NULL, *restrict remove_sv = NULL; |
|
1965
|
32
|
|
|
|
|
|
SV *restrict na_sv = NULL, *restrict against_sv = NULL; |
|
1966
|
32
|
50
|
|
|
|
|
if ((items - 1) & 1) croak("cfilter: trailing options must be name => value pairs"); |
|
1967
|
78
|
100
|
|
|
|
|
for (int oi = 1; oi < items; oi += 2) { |
|
1968
|
|
|
|
|
|
|
STRLEN ol; |
|
1969
|
47
|
|
|
|
|
|
const char *restrict oname = SvPV(ST(oi), ol); |
|
1970
|
47
|
|
|
|
|
|
SV *restrict oval = ST(oi + 1); |
|
1971
|
47
|
100
|
|
|
|
|
if (ol == 4 && memEQ(oname, "keep", 4)) keep_sv = oval; |
|
|
|
50
|
|
|
|
|
|
|
1972
|
18
|
100
|
|
|
|
|
else if (ol == 6 && memEQ(oname, "remove", 6)) remove_sv = oval; |
|
|
|
50
|
|
|
|
|
|
|
1973
|
16
|
100
|
|
|
|
|
else if (ol == 2 && memEQ(oname, "na", 2)) na_sv = oval; |
|
|
|
50
|
|
|
|
|
|
|
1974
|
7
|
100
|
|
|
|
|
else if (ol == 7 && memEQ(oname, "against", 7)) against_sv = oval; |
|
|
|
50
|
|
|
|
|
|
|
1975
|
1
|
|
|
|
|
|
else croak("cfilter: unknown option '%s'", oname); |
|
1976
|
|
|
|
|
|
|
} |
|
1977
|
31
|
100
|
|
|
|
|
if (keep_sv && remove_sv) croak("cfilter: give either keep or remove, not both"); |
|
|
|
100
|
|
|
|
|
|
|
1978
|
30
|
100
|
|
|
|
|
if (!keep_sv && !remove_sv) croak("cfilter: need a keep or remove argument"); |
|
|
|
100
|
|
|
|
|
|
|
1979
|
29
|
|
|
|
|
|
bool removing = (remove_sv != NULL); |
|
1980
|
29
|
100
|
|
|
|
|
SV *restrict sel = removing ? remove_sv : keep_sv; |
|
1981
|
|
|
|
|
|
|
// classify the selector: array ref of names, or a value predicate. |
|
1982
|
|
|
|
|
|
|
bool by_name; |
|
1983
|
29
|
|
|
|
|
|
SV *restrict cv_sv = NULL; |
|
1984
|
29
|
100
|
|
|
|
|
if (SvROK(sel) && SvTYPE(SvRV(sel)) == SVt_PVAV) by_name = TRUE; |
|
|
|
100
|
|
|
|
|
|
|
1985
|
18
|
100
|
|
|
|
|
else if ((SvROK(sel) && SvTYPE(SvRV(sel)) == SVt_PVCV) || (SvOK(sel) && !SvROK(sel))) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1986
|
17
|
|
|
|
|
|
by_name = FALSE; |
|
1987
|
17
|
100
|
|
|
|
|
if (SvROK(sel)) cv_sv = SvRV(sel); |
|
1988
|
|
|
|
|
|
|
else { |
|
1989
|
|
|
|
|
|
|
STRLEN nl; |
|
1990
|
1
|
|
|
|
|
|
const char *restrict name = SvPV(sel, nl); |
|
1991
|
1
|
50
|
|
|
|
|
SV *restrict fq = strstr(name, "::") ? newSVpvn(name, nl) : newSVpvf("Stats::LikeR::%s", name); |
|
1992
|
1
|
|
|
|
|
|
CV *restrict cv = get_cv(SvPV_nolen(fq), 0); |
|
1993
|
1
|
|
|
|
|
|
SvREFCNT_dec(fq); |
|
1994
|
1
|
50
|
|
|
|
|
if (!cv) croak("cfilter: unknown function '%s'", name); |
|
1995
|
0
|
|
|
|
|
|
cv_sv = (SV*)cv; |
|
1996
|
|
|
|
|
|
|
} |
|
1997
|
|
|
|
|
|
|
} |
|
1998
|
1
|
|
|
|
|
|
else croak("cfilter: keep/remove must be an array ref of column names or a code ref / function name"); |
|
1999
|
|
|
|
|
|
|
// decode the undef policy (predicate only). |
|
2000
|
27
|
|
|
|
|
|
bool na_omit = FALSE; |
|
2001
|
27
|
100
|
|
|
|
|
if (na_sv && SvOK(na_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
STRLEN nl; |
|
2003
|
9
|
|
|
|
|
|
const char *restrict nv = SvPV(na_sv, nl); |
|
2004
|
9
|
100
|
|
|
|
|
if (nl == 4 && memEQ(nv, "omit", 4)) na_omit = TRUE; |
|
|
|
50
|
|
|
|
|
|
|
2005
|
1
|
50
|
|
|
|
|
else if (nl == 4 && memEQ(nv, "keep", 4)) na_omit = FALSE; |
|
|
|
0
|
|
|
|
|
|
|
2006
|
1
|
|
|
|
|
|
else croak("cfilter: na must be 'keep' or 'omit'"); |
|
2007
|
|
|
|
|
|
|
} |
|
2008
|
26
|
100
|
|
|
|
|
if (by_name && (na_sv || against_sv)) croak("cfilter: na/against only apply to a predicate selector"); |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2009
|
25
|
100
|
|
|
|
|
if (against_sv && na_sv) croak("cfilter: give na or against, not both"); |
|
|
|
100
|
|
|
|
|
|
|
2010
|
|
|
|
|
|
|
// 1. detect the data shape. |
|
2011
|
24
|
100
|
|
|
|
|
if (!SvROK(data)) croak("cfilter: data must be a reference"); |
|
2012
|
23
|
|
|
|
|
|
SV *restrict rv = SvRV(data); |
|
2013
|
|
|
|
|
|
|
short int kind; // 0 = array-of-hashes, 1 = hash-of-arrays, 2 = hash-of-hashes |
|
2014
|
23
|
100
|
|
|
|
|
if (SvTYPE(rv) == SVt_PVAV) kind = 0; |
|
2015
|
20
|
50
|
|
|
|
|
else if (SvTYPE(rv) == SVt_PVHV) { |
|
2016
|
20
|
|
|
|
|
|
HV *restrict h = (HV*)rv; |
|
2017
|
20
|
|
|
|
|
|
hv_iterinit(h); |
|
2018
|
20
|
|
|
|
|
|
HE *restrict fe = hv_iternext(h); |
|
2019
|
20
|
50
|
|
|
|
|
if (!fe) kind = 2; |
|
2020
|
|
|
|
|
|
|
else { |
|
2021
|
20
|
|
|
|
|
|
SV *restrict fv = hv_iterval(h, fe); |
|
2022
|
20
|
50
|
|
|
|
|
if (SvROK(fv) && SvTYPE(SvRV(fv)) == SVt_PVAV) kind = 1; |
|
|
|
100
|
|
|
|
|
|
|
2023
|
2
|
50
|
|
|
|
|
else if (SvROK(fv) && SvTYPE(SvRV(fv)) == SVt_PVHV) kind = 2; |
|
|
|
50
|
|
|
|
|
|
|
2024
|
0
|
|
|
|
|
|
else croak("cfilter: hash values must be array refs (HoA) or hash refs (HoH)"); |
|
2025
|
|
|
|
|
|
|
} |
|
2026
|
|
|
|
|
|
|
} |
|
2027
|
0
|
|
|
|
|
|
else croak("cfilter: data must be an array ref or hash ref"); |
|
2028
|
|
|
|
|
|
|
// 2. the column universe, and (predicate only) a row-aligned cell table |
|
2029
|
|
|
|
|
|
|
// `cellmap`: colname -> AV of length nrows, undef in the gaps. The |
|
2030
|
|
|
|
|
|
|
// alignment lets `against` pair two columns by row. |
|
2031
|
23
|
|
|
|
|
|
HV *restrict universe = newHV(); |
|
2032
|
23
|
|
|
|
|
|
AV *restrict colnames = newAV(); |
|
2033
|
23
|
100
|
|
|
|
|
HV *restrict cellmap = by_name ? NULL : newHV(); |
|
2034
|
23
|
|
|
|
|
|
SSize_t nrows = 0; |
|
2035
|
23
|
100
|
|
|
|
|
if (kind == 1) { |
|
2036
|
18
|
|
|
|
|
|
HV *restrict h = (HV*)rv; |
|
2037
|
|
|
|
|
|
|
HE *restrict e; |
|
2038
|
18
|
|
|
|
|
|
hv_iterinit(h); |
|
2039
|
72
|
100
|
|
|
|
|
while ((e = hv_iternext(h))) { |
|
2040
|
54
|
|
|
|
|
|
SV *restrict val = hv_iterval(h, e); |
|
2041
|
54
|
50
|
|
|
|
|
if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV) croak("cfilter: every value must be an array ref (hash of arrays)"); |
|
|
|
50
|
|
|
|
|
|
|
2042
|
54
|
|
|
|
|
|
SSize_t len = av_len((AV*)SvRV(val)) + 1; |
|
2043
|
54
|
100
|
|
|
|
|
if (len > nrows) nrows = len; |
|
2044
|
|
|
|
|
|
|
} |
|
2045
|
18
|
|
|
|
|
|
hv_iterinit(h); |
|
2046
|
90
|
100
|
|
|
|
|
while ((e = hv_iternext(h))) { |
|
2047
|
54
|
|
|
|
|
|
SV *restrict ck = hv_iterkeysv(e); |
|
2048
|
54
|
|
|
|
|
|
(void)hv_store_ent(universe, ck, newSViv(1), 0); |
|
2049
|
54
|
|
|
|
|
|
av_push(colnames, newSVsv(ck)); |
|
2050
|
54
|
100
|
|
|
|
|
if (!by_name) { |
|
2051
|
36
|
|
|
|
|
|
AV *restrict src = (AV*)SvRV(hv_iterval(h, e)), *restrict col = newAV(); |
|
2052
|
36
|
50
|
|
|
|
|
if (nrows > 0) av_extend(col, nrows - 1); |
|
2053
|
216
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) { |
|
2054
|
180
|
50
|
|
|
|
|
SV **restrict ep = (r <= av_len(src)) ? av_fetch(src, r, 0) : NULL; |
|
2055
|
180
|
50
|
|
|
|
|
av_push(col, (ep && *ep && SvOK(*ep)) ? newSVsv(*ep) : newSV(0)); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2056
|
|
|
|
|
|
|
} |
|
2057
|
36
|
|
|
|
|
|
(void)hv_store_ent(cellmap, ck, newRV_noinc((SV*)col), 0); |
|
2058
|
|
|
|
|
|
|
} |
|
2059
|
|
|
|
|
|
|
} |
|
2060
|
|
|
|
|
|
|
} else { |
|
2061
|
|
|
|
|
|
|
// row-major: collect the rows in a stable order, then build per column. |
|
2062
|
5
|
|
|
|
|
|
AV *restrict rows = newAV(); |
|
2063
|
5
|
100
|
|
|
|
|
if (kind == 0) { |
|
2064
|
3
|
|
|
|
|
|
AV *restrict a = (AV*)rv; |
|
2065
|
3
|
|
|
|
|
|
SSize_t n = av_len(a) + 1; |
|
2066
|
12
|
100
|
|
|
|
|
for (SSize_t r = 0; r < n; r++) { |
|
2067
|
9
|
|
|
|
|
|
SV **restrict ep = av_fetch(a, r, 0); |
|
2068
|
9
|
50
|
|
|
|
|
if (!ep || !*ep || !SvROK(*ep) || SvTYPE(SvRV(*ep)) != SVt_PVHV) croak("cfilter: array elements must be hash refs (array of hashes)"); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2069
|
9
|
|
|
|
|
|
av_push(rows, newRV_inc(SvRV(*ep))); |
|
2070
|
|
|
|
|
|
|
} |
|
2071
|
|
|
|
|
|
|
} else { |
|
2072
|
2
|
|
|
|
|
|
HV *restrict h = (HV*)rv; |
|
2073
|
|
|
|
|
|
|
HE *restrict e; |
|
2074
|
2
|
|
|
|
|
|
hv_iterinit(h); |
|
2075
|
9
|
100
|
|
|
|
|
while ((e = hv_iternext(h))) { |
|
2076
|
7
|
|
|
|
|
|
SV *restrict val = hv_iterval(h, e); |
|
2077
|
7
|
50
|
|
|
|
|
if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVHV) croak("cfilter: every value must be a hash ref (hash of hashes)"); |
|
|
|
50
|
|
|
|
|
|
|
2078
|
7
|
|
|
|
|
|
av_push(rows, newRV_inc(SvRV(val))); |
|
2079
|
|
|
|
|
|
|
} |
|
2080
|
|
|
|
|
|
|
} |
|
2081
|
5
|
|
|
|
|
|
nrows = av_len(rows) + 1; |
|
2082
|
|
|
|
|
|
|
// union of columns, in first-seen order. |
|
2083
|
|
|
|
|
|
|
{ |
|
2084
|
5
|
|
|
|
|
|
HV *restrict seen = newHV(); |
|
2085
|
21
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) { |
|
2086
|
16
|
|
|
|
|
|
HV *restrict row = (HV*)SvRV(*av_fetch(rows, r, 0)); |
|
2087
|
|
|
|
|
|
|
HE *restrict ie; |
|
2088
|
16
|
|
|
|
|
|
hv_iterinit(row); |
|
2089
|
72
|
100
|
|
|
|
|
while ((ie = hv_iternext(row))) { |
|
2090
|
40
|
|
|
|
|
|
SV *restrict ck = hv_iterkeysv(ie); |
|
2091
|
40
|
100
|
|
|
|
|
if (!hv_exists_ent(seen, ck, 0)) { |
|
2092
|
14
|
|
|
|
|
|
(void)hv_store_ent(seen, ck, newSViv(1), 0); |
|
2093
|
14
|
|
|
|
|
|
(void)hv_store_ent(universe, ck, newSViv(1), 0); |
|
2094
|
14
|
|
|
|
|
|
av_push(colnames, newSVsv(ck)); |
|
2095
|
|
|
|
|
|
|
} |
|
2096
|
|
|
|
|
|
|
} |
|
2097
|
|
|
|
|
|
|
} |
|
2098
|
5
|
|
|
|
|
|
SvREFCNT_dec((SV*)seen); |
|
2099
|
|
|
|
|
|
|
} |
|
2100
|
5
|
100
|
|
|
|
|
if (!by_name) { |
|
2101
|
2
|
|
|
|
|
|
SSize_t nc = av_len(colnames) + 1; |
|
2102
|
8
|
100
|
|
|
|
|
for (SSize_t c = 0; c < nc; c++) { |
|
2103
|
6
|
|
|
|
|
|
SV *restrict ck = *av_fetch(colnames, c, 0); |
|
2104
|
6
|
|
|
|
|
|
AV *restrict col = newAV(); |
|
2105
|
6
|
50
|
|
|
|
|
if (nrows > 0) av_extend(col, nrows - 1); |
|
2106
|
36
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) { |
|
2107
|
30
|
|
|
|
|
|
HV *restrict row = (HV*)SvRV(*av_fetch(rows, r, 0)); |
|
2108
|
30
|
|
|
|
|
|
HE *restrict che = hv_fetch_ent(row, ck, 0, 0); |
|
2109
|
30
|
100
|
|
|
|
|
SV *restrict cell = che ? HeVAL(che) : NULL; |
|
2110
|
30
|
100
|
|
|
|
|
av_push(col, (cell && SvOK(cell)) ? newSVsv(cell) : newSV(0)); |
|
|
|
50
|
|
|
|
|
|
|
2111
|
|
|
|
|
|
|
} |
|
2112
|
6
|
|
|
|
|
|
(void)hv_store_ent(cellmap, ck, newRV_noinc((SV*)col), 0); |
|
2113
|
|
|
|
|
|
|
} |
|
2114
|
|
|
|
|
|
|
} |
|
2115
|
5
|
|
|
|
|
|
SvREFCNT_dec((SV*)rows); |
|
2116
|
|
|
|
|
|
|
} |
|
2117
|
|
|
|
|
|
|
// 2b. resolve the `against` reference column into its cell array. |
|
2118
|
23
|
|
|
|
|
|
AV *restrict against_av = NULL; |
|
2119
|
23
|
100
|
|
|
|
|
if (against_sv) { |
|
2120
|
5
|
50
|
|
|
|
|
if (!SvOK(against_sv) || SvROK(against_sv)) croak("cfilter: against must be a column name (string)"); |
|
|
|
50
|
|
|
|
|
|
|
2121
|
5
|
100
|
|
|
|
|
if (!hv_exists_ent(universe, against_sv, 0)) croak("cfilter: against column '%s' not found in data", SvPV_nolen(against_sv)); |
|
2122
|
4
|
|
|
|
|
|
against_av = (AV*)SvRV(HeVAL(hv_fetch_ent(cellmap, against_sv, 0, 0))); |
|
2123
|
|
|
|
|
|
|
} |
|
2124
|
|
|
|
|
|
|
// 3. decide which columns to keep. |
|
2125
|
22
|
|
|
|
|
|
HV *restrict keepset = newHV(); |
|
2126
|
22
|
100
|
|
|
|
|
if (by_name) { |
|
2127
|
9
|
|
|
|
|
|
AV *restrict names = (AV*)SvRV(sel); |
|
2128
|
9
|
|
|
|
|
|
HV *restrict listed = newHV(); |
|
2129
|
9
|
|
|
|
|
|
SSize_t n = av_len(names) + 1; |
|
2130
|
21
|
100
|
|
|
|
|
for (SSize_t i = 0; i < n; i++) { |
|
2131
|
13
|
|
|
|
|
|
SV **restrict ep = av_fetch(names, i, 0); |
|
2132
|
13
|
50
|
|
|
|
|
if (!ep || !*ep || !SvOK(*ep)) croak("cfilter: column list contains an undefined entry"); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2133
|
13
|
100
|
|
|
|
|
if (!hv_exists_ent(universe, *ep, 0)) croak("cfilter: column '%s' not found in data", SvPV_nolen(*ep)); |
|
2134
|
12
|
|
|
|
|
|
(void)hv_store_ent(listed, *ep, newSViv(1), 0); |
|
2135
|
|
|
|
|
|
|
} |
|
2136
|
8
|
|
|
|
|
|
SSize_t nc = av_len(colnames) + 1; |
|
2137
|
31
|
100
|
|
|
|
|
for (SSize_t c = 0; c < nc; c++) { |
|
2138
|
23
|
|
|
|
|
|
SV *restrict ck = *av_fetch(colnames, c, 0); |
|
2139
|
23
|
|
|
|
|
|
bool in_list = cBOOL(hv_exists_ent(listed, ck, 0)); |
|
2140
|
23
|
100
|
|
|
|
|
if (removing ? !in_list : in_list) (void)hv_store_ent(keepset, ck, newSViv(1), 0); |
|
|
|
100
|
|
|
|
|
|
|
2141
|
|
|
|
|
|
|
} |
|
2142
|
8
|
|
|
|
|
|
SvREFCNT_dec((SV*)listed); |
|
2143
|
|
|
|
|
|
|
} else { |
|
2144
|
|
|
|
|
|
|
// predicate over the flat colnames list (never a live hash iterator |
|
2145
|
|
|
|
|
|
|
// across call_sv). Apply the undef policy per column. |
|
2146
|
13
|
|
|
|
|
|
SSize_t nc = av_len(colnames) + 1; |
|
2147
|
52
|
100
|
|
|
|
|
for (SSize_t c = 0; c < nc; c++) { |
|
2148
|
39
|
|
|
|
|
|
SV *restrict ck = *av_fetch(colnames, c, 0); |
|
2149
|
39
|
|
|
|
|
|
AV *restrict cells = (AV*)SvRV(HeVAL(hv_fetch_ent(cellmap, ck, 0, 0))); |
|
2150
|
|
|
|
|
|
|
bool pass; |
|
2151
|
39
|
100
|
|
|
|
|
if (against_av) { |
|
2152
|
|
|
|
|
|
|
// two columns, pairwise complete: rows defined in BOTH. |
|
2153
|
12
|
|
|
|
|
|
AV *restrict a1 = newAV(), *restrict a2 = newAV(); |
|
2154
|
72
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) { |
|
2155
|
60
|
|
|
|
|
|
SV **restrict p1 = av_fetch(cells, r, 0); |
|
2156
|
60
|
|
|
|
|
|
SV **restrict p2 = av_fetch(against_av, r, 0); |
|
2157
|
60
|
50
|
|
|
|
|
if (p1 && *p1 && SvOK(*p1) && p2 && *p2 && SvOK(*p2)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2158
|
57
|
|
|
|
|
|
av_push(a1, newSVsv(*p1)); |
|
2159
|
57
|
|
|
|
|
|
av_push(a2, newSVsv(*p2)); |
|
2160
|
|
|
|
|
|
|
} |
|
2161
|
|
|
|
|
|
|
} |
|
2162
|
12
|
|
|
|
|
|
pass = cf_pred(aTHX_ cv_sv, a1, a2, ck); |
|
2163
|
12
|
|
|
|
|
|
SvREFCNT_dec((SV*)a1); |
|
2164
|
12
|
|
|
|
|
|
SvREFCNT_dec((SV*)a2); |
|
2165
|
27
|
100
|
|
|
|
|
} else if (na_omit) { |
|
2166
|
|
|
|
|
|
|
// one column, defined cells only. |
|
2167
|
18
|
|
|
|
|
|
AV *restrict a1 = newAV(); |
|
2168
|
108
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) { |
|
2169
|
90
|
|
|
|
|
|
SV **restrict p = av_fetch(cells, r, 0); |
|
2170
|
90
|
50
|
|
|
|
|
if (p && *p && SvOK(*p)) av_push(a1, newSVsv(*p)); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2171
|
|
|
|
|
|
|
} |
|
2172
|
18
|
|
|
|
|
|
pass = cf_pred(aTHX_ cv_sv, a1, NULL, ck); |
|
2173
|
18
|
|
|
|
|
|
SvREFCNT_dec((SV*)a1); |
|
2174
|
|
|
|
|
|
|
} else { |
|
2175
|
|
|
|
|
|
|
// one column, every cell including undef. |
|
2176
|
9
|
|
|
|
|
|
pass = cf_pred(aTHX_ cv_sv, cells, NULL, ck); |
|
2177
|
|
|
|
|
|
|
} |
|
2178
|
39
|
50
|
|
|
|
|
if (removing ? !pass : pass) (void)hv_store_ent(keepset, ck, newSViv(1), 0); |
|
|
|
100
|
|
|
|
|
|
|
2179
|
|
|
|
|
|
|
} |
|
2180
|
|
|
|
|
|
|
} |
|
2181
|
|
|
|
|
|
|
// 4. rebuild the data in its original shape with only the kept columns. |
|
2182
|
|
|
|
|
|
|
SV *restrict out; |
|
2183
|
21
|
100
|
|
|
|
|
if (kind == 1) { |
|
2184
|
16
|
|
|
|
|
|
HV *restrict outh = newHV(), *restrict h = (HV*)rv; |
|
2185
|
|
|
|
|
|
|
HE *restrict e; |
|
2186
|
16
|
|
|
|
|
|
hv_iterinit(h); |
|
2187
|
64
|
100
|
|
|
|
|
while ((e = hv_iternext(h))) { |
|
2188
|
48
|
|
|
|
|
|
SV *restrict ck = hv_iterkeysv(e); |
|
2189
|
48
|
100
|
|
|
|
|
if (!hv_exists_ent(keepset, ck, 0)) continue; |
|
2190
|
33
|
|
|
|
|
|
AV *restrict src = (AV*)SvRV(hv_iterval(h, e)), *restrict dst = newAV(); |
|
2191
|
33
|
|
|
|
|
|
SSize_t n = av_len(src) + 1; |
|
2192
|
33
|
50
|
|
|
|
|
if (n > 0) av_extend(dst, n - 1); |
|
2193
|
190
|
100
|
|
|
|
|
for (SSize_t i = 0; i < n; i++) { |
|
2194
|
157
|
|
|
|
|
|
SV **restrict ep = av_fetch(src, i, 0); |
|
2195
|
157
|
50
|
|
|
|
|
av_push(dst, (ep && *ep) ? newSVsv(*ep) : newSV(0)); |
|
|
|
50
|
|
|
|
|
|
|
2196
|
|
|
|
|
|
|
} |
|
2197
|
33
|
|
|
|
|
|
(void)hv_store_ent(outh, ck, newRV_noinc((SV*)dst), 0); |
|
2198
|
|
|
|
|
|
|
} |
|
2199
|
16
|
|
|
|
|
|
out = (SV*)outh; |
|
2200
|
5
|
100
|
|
|
|
|
} else if (kind == 2) { |
|
2201
|
2
|
|
|
|
|
|
HV *restrict outh = newHV(), *restrict h = (HV*)rv; |
|
2202
|
|
|
|
|
|
|
HE *restrict e; |
|
2203
|
2
|
|
|
|
|
|
hv_iterinit(h); |
|
2204
|
9
|
100
|
|
|
|
|
while ((e = hv_iternext(h))) { |
|
2205
|
7
|
|
|
|
|
|
SV *restrict rk = hv_iterkeysv(e); |
|
2206
|
7
|
|
|
|
|
|
HV *restrict row = (HV*)SvRV(hv_iterval(h, e)), *restrict nr = newHV(); |
|
2207
|
|
|
|
|
|
|
HE *restrict ie; |
|
2208
|
7
|
|
|
|
|
|
hv_iterinit(row); |
|
2209
|
23
|
100
|
|
|
|
|
while ((ie = hv_iternext(row))) { |
|
2210
|
16
|
|
|
|
|
|
SV *restrict ck = hv_iterkeysv(ie); |
|
2211
|
16
|
100
|
|
|
|
|
if (!hv_exists_ent(keepset, ck, 0)) continue; |
|
2212
|
5
|
|
|
|
|
|
(void)hv_store_ent(nr, ck, newSVsv(HeVAL(ie)), 0); |
|
2213
|
|
|
|
|
|
|
} |
|
2214
|
7
|
|
|
|
|
|
(void)hv_store_ent(outh, rk, newRV_noinc((SV*)nr), 0); |
|
2215
|
|
|
|
|
|
|
} |
|
2216
|
2
|
|
|
|
|
|
out = (SV*)outh; |
|
2217
|
|
|
|
|
|
|
} else { |
|
2218
|
3
|
|
|
|
|
|
AV *restrict outa = newAV(), *restrict a = (AV*)rv; |
|
2219
|
3
|
|
|
|
|
|
SSize_t n = av_len(a) + 1; |
|
2220
|
12
|
100
|
|
|
|
|
for (SSize_t r = 0; r < n; r++) { |
|
2221
|
9
|
|
|
|
|
|
HV *restrict row = (HV*)SvRV(*av_fetch(a, r, 0)), *restrict nr = newHV(); |
|
2222
|
|
|
|
|
|
|
HE *restrict ie; |
|
2223
|
9
|
|
|
|
|
|
hv_iterinit(row); |
|
2224
|
33
|
100
|
|
|
|
|
while ((ie = hv_iternext(row))) { |
|
2225
|
24
|
|
|
|
|
|
SV *restrict ck = hv_iterkeysv(ie); |
|
2226
|
24
|
100
|
|
|
|
|
if (!hv_exists_ent(keepset, ck, 0)) continue; |
|
2227
|
9
|
|
|
|
|
|
(void)hv_store_ent(nr, ck, newSVsv(HeVAL(ie)), 0); |
|
2228
|
|
|
|
|
|
|
} |
|
2229
|
9
|
|
|
|
|
|
av_push(outa, newRV_noinc((SV*)nr)); |
|
2230
|
|
|
|
|
|
|
} |
|
2231
|
3
|
|
|
|
|
|
out = (SV*)outa; |
|
2232
|
|
|
|
|
|
|
} |
|
2233
|
|
|
|
|
|
|
// 5. tidy up the scratch tables (the result keeps its own copies). |
|
2234
|
21
|
|
|
|
|
|
SvREFCNT_dec((SV*)universe); |
|
2235
|
21
|
|
|
|
|
|
SvREFCNT_dec((SV*)colnames); |
|
2236
|
21
|
|
|
|
|
|
SvREFCNT_dec((SV*)keepset); |
|
2237
|
21
|
100
|
|
|
|
|
if (cellmap) SvREFCNT_dec((SV*)cellmap); |
|
2238
|
21
|
|
|
|
|
|
RETVAL = newRV_noinc(out); |
|
2239
|
|
|
|
|
|
|
} |
|
2240
|
|
|
|
|
|
|
OUTPUT: |
|
2241
|
|
|
|
|
|
|
RETVAL |
|
2242
|
|
|
|
|
|
|
|
|
2243
|
|
|
|
|
|
|
SV *hoh2hoa(data, ...) |
|
2244
|
|
|
|
|
|
|
SV *data |
|
2245
|
|
|
|
|
|
|
CODE: |
|
2246
|
|
|
|
|
|
|
{ |
|
2247
|
|
|
|
|
|
|
// 0. parse trailing name => value options (done before any allocation so |
|
2248
|
|
|
|
|
|
|
// option/usage errors can't leak). undef.val sets the fill for a |
|
2249
|
|
|
|
|
|
|
// missing key or an undef cell (default: undef). row.names, if given, |
|
2250
|
|
|
|
|
|
|
// adds a column of that name holding the sorted row labels. |
|
2251
|
20
|
|
|
|
|
|
SV *restrict fill = NULL; // NULL => fill gaps with undef |
|
2252
|
20
|
|
|
|
|
|
SV *restrict rn_sv = NULL; // NULL => do not emit a row-names column |
|
2253
|
20
|
100
|
|
|
|
|
if ((items - 1) & 1) croak("hoh2hoa: trailing options must be name => value pairs"); |
|
2254
|
27
|
100
|
|
|
|
|
for (int oi = 1; oi < items; oi += 2) { |
|
2255
|
|
|
|
|
|
|
STRLEN ol; |
|
2256
|
10
|
|
|
|
|
|
const char *restrict oname = SvPV(ST(oi), ol); |
|
2257
|
10
|
|
|
|
|
|
SV *restrict oval = ST(oi + 1); |
|
2258
|
10
|
100
|
|
|
|
|
if (ol == 9 && memEQ(oname, "undef.val", 9)) fill = SvOK(oval) ? oval : NULL; |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2259
|
5
|
100
|
|
|
|
|
else if (ol == 9 && memEQ(oname, "row.names", 9)) { |
|
|
|
50
|
|
|
|
|
|
|
2260
|
4
|
50
|
|
|
|
|
if (SvOK(oval) && !SvROK(oval)) rn_sv = oval; |
|
|
|
100
|
|
|
|
|
|
|
2261
|
1
|
|
|
|
|
|
else croak("hoh2hoa: row.names must be a column name (string)"); |
|
2262
|
|
|
|
|
|
|
} |
|
2263
|
1
|
|
|
|
|
|
else croak("hoh2hoa: unknown option '%s'", oname); |
|
2264
|
|
|
|
|
|
|
} |
|
2265
|
|
|
|
|
|
|
// 1. the input must be a hash ref (a hash of hashes). |
|
2266
|
17
|
100
|
|
|
|
|
if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVHV) croak("hoh2hoa: data must be a hash ref (hash of hashes)"); |
|
|
|
100
|
|
|
|
|
|
|
2267
|
15
|
|
|
|
|
|
HV *restrict in_hv = (HV*)SvRV(data); |
|
2268
|
|
|
|
|
|
|
// 2. these cross the section boundaries (gather -> build -> cleanup). |
|
2269
|
15
|
|
|
|
|
|
HV *restrict out_hv = newHV(); // the result: column name -> array ref |
|
2270
|
15
|
|
|
|
|
|
AV *restrict rows_av = newAV(); // outer keys, sorted into the row order |
|
2271
|
15
|
|
|
|
|
|
AV *restrict cols_av = newAV(); // union of inner keys (column names) |
|
2272
|
15
|
|
|
|
|
|
HV *restrict seen = newHV(); // membership test while taking the union |
|
2273
|
|
|
|
|
|
|
// 3. collect the outer keys (row labels) and sort for a stable row order. |
|
2274
|
|
|
|
|
|
|
{ |
|
2275
|
|
|
|
|
|
|
HE *restrict e; |
|
2276
|
15
|
|
|
|
|
|
hv_iterinit(in_hv); |
|
2277
|
39
|
100
|
|
|
|
|
while ((e = hv_iternext(in_hv))) { |
|
2278
|
25
|
|
|
|
|
|
SV *restrict rv = hv_iterval(in_hv, e); |
|
2279
|
25
|
50
|
|
|
|
|
if (!SvROK(rv) || SvTYPE(SvRV(rv)) != SVt_PVHV) croak("hoh2hoa: every value must be a hash ref (hash of hashes)"); |
|
|
|
100
|
|
|
|
|
|
|
2280
|
24
|
|
|
|
|
|
av_push(rows_av, newSVsv(hv_iterkeysv(e))); |
|
2281
|
|
|
|
|
|
|
} |
|
2282
|
|
|
|
|
|
|
} |
|
2283
|
14
|
|
|
|
|
|
SSize_t nrows = av_len(rows_av) + 1; |
|
2284
|
14
|
100
|
|
|
|
|
if (nrows > 1) qsort(AvARRAY(rows_av), (size_t)nrows, sizeof(SV*), h2h_keycmp); |
|
2285
|
|
|
|
|
|
|
// 4. discover the union of inner keys. Each new column gets an empty array |
|
2286
|
|
|
|
|
|
|
// in the result straight away so step 5 can just push into it. |
|
2287
|
|
|
|
|
|
|
{ |
|
2288
|
|
|
|
|
|
|
HE *restrict e; |
|
2289
|
14
|
|
|
|
|
|
hv_iterinit(in_hv); |
|
2290
|
38
|
100
|
|
|
|
|
while ((e = hv_iternext(in_hv))) { |
|
2291
|
24
|
|
|
|
|
|
HV *restrict row = (HV*)SvRV(hv_iterval(in_hv, e)); |
|
2292
|
|
|
|
|
|
|
HE *restrict ie; |
|
2293
|
24
|
|
|
|
|
|
hv_iterinit(row); |
|
2294
|
88
|
100
|
|
|
|
|
while ((ie = hv_iternext(row))) { |
|
2295
|
40
|
|
|
|
|
|
SV *restrict ck = hv_iterkeysv(ie); |
|
2296
|
40
|
100
|
|
|
|
|
if (!hv_exists_ent(seen, ck, 0)) { |
|
2297
|
26
|
|
|
|
|
|
(void)hv_store_ent(seen, ck, &PL_sv_yes, 0); |
|
2298
|
26
|
|
|
|
|
|
av_push(cols_av, newSVsv(ck)); |
|
2299
|
26
|
|
|
|
|
|
(void)hv_store_ent(out_hv, ck, newRV_noinc((SV*)newAV()), 0); |
|
2300
|
|
|
|
|
|
|
} |
|
2301
|
|
|
|
|
|
|
} |
|
2302
|
|
|
|
|
|
|
} |
|
2303
|
|
|
|
|
|
|
} |
|
2304
|
14
|
|
|
|
|
|
SSize_t ncols = av_len(cols_av) + 1; |
|
2305
|
|
|
|
|
|
|
// 5. walk the rows in sorted order; for every column push the cell (a copy) |
|
2306
|
|
|
|
|
|
|
// or the fill value, so each column ends up exactly nrows long. |
|
2307
|
38
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) { |
|
2308
|
24
|
|
|
|
|
|
SV *restrict rk = *av_fetch(rows_av, r, 0); |
|
2309
|
24
|
|
|
|
|
|
HE *restrict rhe = hv_fetch_ent(in_hv, rk, 0, 0); |
|
2310
|
24
|
|
|
|
|
|
HV *restrict row = (HV*)SvRV(HeVAL(rhe)); |
|
2311
|
75
|
100
|
|
|
|
|
for (SSize_t c = 0; c < ncols; c++) { |
|
2312
|
51
|
|
|
|
|
|
SV *restrict ck = *av_fetch(cols_av, c, 0); |
|
2313
|
51
|
|
|
|
|
|
HE *restrict che = hv_fetch_ent(row, ck, 0, 0); |
|
2314
|
51
|
100
|
|
|
|
|
SV *restrict src = che ? HeVAL(che) : NULL; |
|
2315
|
51
|
100
|
|
|
|
|
SV *restrict cell = (src && SvOK(src)) ? newSVsv(src) : (fill ? newSVsv(fill) : newSV(0)); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2316
|
51
|
|
|
|
|
|
HE *restrict colhe = hv_fetch_ent(out_hv, ck, 0, 0); |
|
2317
|
51
|
|
|
|
|
|
av_push((AV*)SvRV(HeVAL(colhe)), cell); |
|
2318
|
|
|
|
|
|
|
} |
|
2319
|
|
|
|
|
|
|
} |
|
2320
|
|
|
|
|
|
|
// 6. optional row-names column: the sorted labels under the requested name. |
|
2321
|
14
|
100
|
|
|
|
|
if (rn_sv) { |
|
2322
|
3
|
100
|
|
|
|
|
if (hv_exists_ent(out_hv, rn_sv, 0)) croak("hoh2hoa: row.names column '%s' collides with an existing column", SvPV_nolen(rn_sv)); |
|
2323
|
2
|
|
|
|
|
|
AV *restrict rn_av = newAV(); |
|
2324
|
4
|
100
|
|
|
|
|
for (SSize_t r = 0; r < nrows; r++) av_push(rn_av, newSVsv(*av_fetch(rows_av, r, 0))); |
|
2325
|
2
|
|
|
|
|
|
(void)hv_store_ent(out_hv, rn_sv, newRV_noinc((SV*)rn_av), 0); |
|
2326
|
|
|
|
|
|
|
} |
|
2327
|
|
|
|
|
|
|
// 7. tidy up the scratch structures (the result keeps its own copies). |
|
2328
|
13
|
|
|
|
|
|
SvREFCNT_dec((SV*)rows_av); |
|
2329
|
13
|
|
|
|
|
|
SvREFCNT_dec((SV*)cols_av); |
|
2330
|
13
|
|
|
|
|
|
SvREFCNT_dec((SV*)seen); |
|
2331
|
13
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)out_hv); |
|
2332
|
|
|
|
|
|
|
} |
|
2333
|
|
|
|
|
|
|
OUTPUT: |
|
2334
|
|
|
|
|
|
|
RETVAL |
|
2335
|
|
|
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
void filter(df, pred) |
|
2337
|
|
|
|
|
|
|
SV *df |
|
2338
|
|
|
|
|
|
|
SV *pred |
|
2339
|
|
|
|
|
|
|
PPCODE: |
|
2340
|
|
|
|
|
|
|
{ |
|
2341
|
27
|
50
|
|
|
|
|
if (!df || !SvROK(df)) |
|
|
|
100
|
|
|
|
|
|
|
2342
|
1
|
|
|
|
|
|
croak("filter: first argument must be a HASH or ARRAY reference (a data frame)"); |
|
2343
|
26
|
50
|
|
|
|
|
bool is_code = (pred && SvROK(pred) && SvTYPE(SvRV(pred)) == SVt_PVCV); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2344
|
26
|
100
|
|
|
|
|
if (!is_code && (!pred || !SvROK(pred) || SvTYPE(SvRV(pred)) != SVt_PVHV)) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2345
|
1
|
|
|
|
|
|
croak("filter: second argument must be a CODE ref or a predicate built with col()"); |
|
2346
|
25
|
|
|
|
|
|
SV *restrict ref = SvRV(df); |
|
2347
|
|
|
|
|
|
|
SV *restrict result; |
|
2348
|
25
|
100
|
|
|
|
|
if (SvTYPE(ref) == SVt_PVAV) { |
|
2349
|
|
|
|
|
|
|
// ----- Array of Hashes: keep matching row hashrefs (shared, not copied) ----- |
|
2350
|
20
|
|
|
|
|
|
AV *restrict in = (AV*)ref; |
|
2351
|
20
|
|
|
|
|
|
AV *restrict out = newAV(); |
|
2352
|
20
|
|
|
|
|
|
SSize_t n = av_len(in) + 1, i; |
|
2353
|
20
|
|
|
|
|
|
filt_ctx ctx; ctx.is_aoh = 1; ctx.data_hv = NULL; ctx.idx = 0; |
|
2354
|
92
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
2355
|
73
|
|
|
|
|
|
SV **restrict rp = av_fetch(in, i, 0); |
|
2356
|
73
|
50
|
|
|
|
|
if (!rp || !*rp || !SvROK(*rp) || SvTYPE(SvRV(*rp)) != SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2357
|
1
|
|
|
|
|
|
SvREFCNT_dec((SV*)out); |
|
2358
|
1
|
|
|
|
|
|
croak("filter: array data frame must hold HASH references; element %ld is not one", (long)i); |
|
2359
|
|
|
|
|
|
|
} |
|
2360
|
|
|
|
|
|
|
bool keep; |
|
2361
|
72
|
100
|
|
|
|
|
if (is_code) keep = filt_call(aTHX_ pred, *rp); |
|
2362
|
64
|
|
|
|
|
|
else { ctx.row_hv = (HV*)SvRV(*rp); keep = filt_eval(aTHX_ pred, &ctx); } |
|
2363
|
72
|
100
|
|
|
|
|
if (keep) av_push(out, SvREFCNT_inc_simple_NN(*rp)); |
|
2364
|
|
|
|
|
|
|
} |
|
2365
|
19
|
|
|
|
|
|
result = newRV_noinc((SV*)out); |
|
2366
|
5
|
50
|
|
|
|
|
} else if (SvTYPE(ref) == SVt_PVHV) { |
|
2367
|
|
|
|
|
|
|
// ----- Hash of Arrays: keep matching row indices across every column ----- |
|
2368
|
5
|
|
|
|
|
|
HV *restrict in = (HV*)ref; |
|
2369
|
5
|
|
|
|
|
|
I32 ncols = hv_iterinit(in); |
|
2370
|
5
|
50
|
|
|
|
|
if (ncols <= 0) { |
|
2371
|
0
|
|
|
|
|
|
result = newRV_noinc((SV*)newHV()); |
|
2372
|
|
|
|
|
|
|
} else { |
|
2373
|
5
|
|
|
|
|
|
char **restrict names = (char**)safemalloc(ncols * sizeof(char*)); |
|
2374
|
5
|
|
|
|
|
|
STRLEN *restrict nlens = (STRLEN*)safemalloc(ncols * sizeof(STRLEN)); |
|
2375
|
5
|
|
|
|
|
|
AV **restrict inav = (AV**)safemalloc(ncols * sizeof(AV*)); |
|
2376
|
5
|
|
|
|
|
|
AV **restrict outav = (AV**)safemalloc(ncols * sizeof(AV*)); |
|
2377
|
5
|
|
|
|
|
|
HV *restrict out = newHV(); |
|
2378
|
5
|
|
|
|
|
|
SSize_t maxrows = 0, i; |
|
2379
|
5
|
|
|
|
|
|
I32 c = 0, cc; |
|
2380
|
|
|
|
|
|
|
HE *restrict e; |
|
2381
|
17
|
100
|
|
|
|
|
while ((e = hv_iternext(in)) && c < ncols) { |
|
|
|
50
|
|
|
|
|
|
|
2382
|
|
|
|
|
|
|
STRLEN klen; |
|
2383
|
13
|
50
|
|
|
|
|
char *restrict k = HePV(e, klen); |
|
2384
|
13
|
|
|
|
|
|
SV *restrict v = HeVAL(e); |
|
2385
|
13
|
50
|
|
|
|
|
if (!v || !SvROK(v) || SvTYPE(SvRV(v)) != SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2386
|
1
|
|
|
|
|
|
safefree(names); safefree(nlens); safefree(inav); safefree(outav); |
|
2387
|
1
|
|
|
|
|
|
SvREFCNT_dec((SV*)out); |
|
2388
|
1
|
|
|
|
|
|
croak("filter: hash data frame must hold ARRAY references (a hash of arrays); column '%s' is not one", k); |
|
2389
|
|
|
|
|
|
|
} |
|
2390
|
12
|
|
|
|
|
|
AV *restrict a = (AV*)SvRV(v); |
|
2391
|
12
|
|
|
|
|
|
SSize_t len = av_len(a) + 1; |
|
2392
|
12
|
100
|
|
|
|
|
if (len > maxrows) maxrows = len; |
|
2393
|
12
|
|
|
|
|
|
names[c] = k; nlens[c] = klen; inav[c] = a; |
|
2394
|
12
|
|
|
|
|
|
outav[c] = newAV(); |
|
2395
|
12
|
|
|
|
|
|
hv_store(out, k, klen, newRV_noinc((SV*)outav[c]), 0); |
|
2396
|
12
|
|
|
|
|
|
c++; |
|
2397
|
|
|
|
|
|
|
} |
|
2398
|
4
|
|
|
|
|
|
filt_ctx ctx; ctx.is_aoh = 0; ctx.row_hv = NULL; ctx.data_hv = in; |
|
2399
|
20
|
100
|
|
|
|
|
for (i = 0; i < maxrows; i++) { |
|
2400
|
|
|
|
|
|
|
bool keep; |
|
2401
|
16
|
100
|
|
|
|
|
if (is_code) { |
|
2402
|
4
|
|
|
|
|
|
HV *restrict rowh = newHV(); |
|
2403
|
16
|
100
|
|
|
|
|
for (cc = 0; cc < ncols; cc++) { |
|
2404
|
12
|
|
|
|
|
|
SV **restrict vp = av_fetch(inav[cc], i, 0); |
|
2405
|
12
|
50
|
|
|
|
|
hv_store(rowh, names[cc], nlens[cc], newSVsv((vp && *vp) ? *vp : &PL_sv_undef), 0); |
|
|
|
50
|
|
|
|
|
|
|
2406
|
|
|
|
|
|
|
} |
|
2407
|
4
|
|
|
|
|
|
SV *restrict rowrv = newRV_noinc((SV*)rowh); |
|
2408
|
4
|
|
|
|
|
|
keep = filt_call(aTHX_ pred, rowrv); |
|
2409
|
4
|
|
|
|
|
|
SvREFCNT_dec(rowrv); |
|
2410
|
|
|
|
|
|
|
} else { |
|
2411
|
12
|
|
|
|
|
|
ctx.idx = i; |
|
2412
|
12
|
|
|
|
|
|
keep = filt_eval(aTHX_ pred, &ctx); |
|
2413
|
|
|
|
|
|
|
} |
|
2414
|
16
|
100
|
|
|
|
|
if (keep) { |
|
2415
|
28
|
100
|
|
|
|
|
for (cc = 0; cc < ncols; cc++) { |
|
2416
|
21
|
|
|
|
|
|
SV **restrict vp = av_fetch(inav[cc], i, 0); |
|
2417
|
21
|
50
|
|
|
|
|
av_push(outav[cc], newSVsv((vp && *vp) ? *vp : &PL_sv_undef)); |
|
|
|
50
|
|
|
|
|
|
|
2418
|
|
|
|
|
|
|
} |
|
2419
|
|
|
|
|
|
|
} |
|
2420
|
|
|
|
|
|
|
} |
|
2421
|
4
|
|
|
|
|
|
safefree(names); safefree(nlens); safefree(inav); safefree(outav); |
|
2422
|
4
|
|
|
|
|
|
result = newRV_noinc((SV*)out); |
|
2423
|
|
|
|
|
|
|
} |
|
2424
|
|
|
|
|
|
|
} else { |
|
2425
|
0
|
|
|
|
|
|
croak("filter: unsupported data frame; expected an array of hashes (AoH) or a hash of arrays (HoA)"); |
|
2426
|
|
|
|
|
|
|
} |
|
2427
|
23
|
|
|
|
|
|
ST(0) = sv_2mortal(result); |
|
2428
|
23
|
|
|
|
|
|
XSRETURN(1); |
|
2429
|
|
|
|
|
|
|
} |
|
2430
|
|
|
|
|
|
|
|
|
2431
|
|
|
|
|
|
|
SV *col2col(data, cmd, cols = &PL_sv_undef, ...) |
|
2432
|
|
|
|
|
|
|
SV *data |
|
2433
|
|
|
|
|
|
|
SV *cmd |
|
2434
|
|
|
|
|
|
|
SV *cols |
|
2435
|
|
|
|
|
|
|
CODE: |
|
2436
|
|
|
|
|
|
|
{ |
|
2437
|
|
|
|
|
|
|
// Only these cross the section boundaries (build -> loop -> cleanup); |
|
2438
|
|
|
|
|
|
|
// everything else is declared at its point of use just below. |
|
2439
|
51
|
|
|
|
|
|
SV *restrict cv_sv = NULL; |
|
2440
|
51
|
|
|
|
|
|
size_t ncols = 0, nrows = 0; |
|
2441
|
51
|
|
|
|
|
|
AV *restrict names_av = newAV(); |
|
2442
|
51
|
|
|
|
|
|
double **restrict col_val = NULL; |
|
2443
|
51
|
|
|
|
|
|
char **restrict col_def = NULL; |
|
2444
|
51
|
|
|
|
|
|
short int na_mode = 0; // 0 = pairwise, 1 = omit, 2 = keep; see section 0 |
|
2445
|
51
|
|
|
|
|
|
bool skip_errors = TRUE; // skip.errors (default true): trap a croaking block, store its message |
|
2446
|
|
|
|
|
|
|
// 0. options. They may be given either as trailing name => value pairs |
|
2447
|
|
|
|
|
|
|
// (after the positional cols), or - so no placeholder is needed when |
|
2448
|
|
|
|
|
|
|
// there is no column restriction - as a single hash ref in cols's |
|
2449
|
|
|
|
|
|
|
// place, e.g. col2col($data, 'cor', { 'skip.errors' => 1 }). |
|
2450
|
|
|
|
|
|
|
// `na` controls how undef is handled when one column is paired with |
|
2451
|
|
|
|
|
|
|
// another: |
|
2452
|
|
|
|
|
|
|
// 'pairwise' (default) - a row counts for the (a,b) pair only if |
|
2453
|
|
|
|
|
|
|
// BOTH columns are defined there, so the block gets two equal |
|
2454
|
|
|
|
|
|
|
// length, aligned columns. This is what paired stats (cor) want. |
|
2455
|
|
|
|
|
|
|
// 'omit' - each column independently drops its own undef values, |
|
2456
|
|
|
|
|
|
|
// so the two columns may differ in length. This is what unpaired |
|
2457
|
|
|
|
|
|
|
// tests (t_test, kruskal_test) want: a gap in one column must not |
|
2458
|
|
|
|
|
|
|
// throw away a good value in the other. |
|
2459
|
|
|
|
|
|
|
// 'keep' - every row passes through and undef reaches the block. |
|
2460
|
|
|
|
|
|
|
// rm.undef / rm.na (bool) remain as aliases: true => 'pairwise' (the |
|
2461
|
|
|
|
|
|
|
// old default), false => 'keep'. |
|
2462
|
|
|
|
|
|
|
// skip.errors (bool, default true): a block that croaks for a pair |
|
2463
|
|
|
|
|
|
|
// does not abort col2col; instead the first line of its error message |
|
2464
|
|
|
|
|
|
|
// is stored as that cell's value, so the result shows which |
|
2465
|
|
|
|
|
|
|
// (outer => inner) pair failed and why. Set it false to make a croak |
|
2466
|
|
|
|
|
|
|
// propagate and abort the whole call instead. |
|
2467
|
51
|
|
|
|
|
|
SV *restrict cols_eff = cols; |
|
2468
|
51
|
|
|
|
|
|
bool na_set = FALSE, rm_set = FALSE; |
|
2469
|
|
|
|
|
|
|
#define C2C_DECODE_OPT(ONAME, OL, OVAL) do { \ |
|
2470
|
|
|
|
|
|
|
if ((OL) == 2 && memEQ((ONAME), "na", 2)) { \ |
|
2471
|
|
|
|
|
|
|
STRLEN vl_; const char *restrict nv_ = SvPV((OVAL), vl_); \ |
|
2472
|
|
|
|
|
|
|
if (vl_ == 8 && memEQ(nv_, "pairwise", 8)) na_mode = 0; \ |
|
2473
|
|
|
|
|
|
|
else if (vl_ == 4 && memEQ(nv_, "omit", 4)) na_mode = 1; \ |
|
2474
|
|
|
|
|
|
|
else if (vl_ == 4 && memEQ(nv_, "keep", 4)) na_mode = 2; \ |
|
2475
|
|
|
|
|
|
|
else croak("col2col: na must be 'pairwise', 'omit' or 'keep'"); \ |
|
2476
|
|
|
|
|
|
|
na_set = TRUE; \ |
|
2477
|
|
|
|
|
|
|
} else if (((OL) == 8 && memEQ((ONAME), "rm.undef", 8)) || ((OL) == 5 && memEQ((ONAME), "rm.na", 5))) { \ |
|
2478
|
|
|
|
|
|
|
na_mode = cBOOL(SvTRUE((OVAL))) ? 0 : 2; rm_set = TRUE; \ |
|
2479
|
|
|
|
|
|
|
} else if ((OL) == 11 && memEQ((ONAME), "skip.errors", 11)) { \ |
|
2480
|
|
|
|
|
|
|
skip_errors = cBOOL(SvTRUE((OVAL))); \ |
|
2481
|
|
|
|
|
|
|
} else croak("col2col: unknown option '%s'", (ONAME)); \ |
|
2482
|
|
|
|
|
|
|
} while (0) |
|
2483
|
51
|
100
|
|
|
|
|
if (SvROK(cols) && SvTYPE(SvRV(cols)) == SVt_PVHV) { |
|
|
|
100
|
|
|
|
|
|
|
2484
|
|
|
|
|
|
|
// options supplied as a hash ref instead of cols: no column restriction |
|
2485
|
6
|
|
|
|
|
|
HV *restrict oh = (HV*)SvRV(cols); |
|
2486
|
|
|
|
|
|
|
HE *restrict he; |
|
2487
|
6
|
100
|
|
|
|
|
if (items > 3) croak("col2col: an options hash ref must be the last argument"); |
|
2488
|
5
|
|
|
|
|
|
hv_iterinit(oh); |
|
2489
|
8
|
100
|
|
|
|
|
while ((he = hv_iternext(oh))) { |
|
2490
|
|
|
|
|
|
|
STRLEN ol; |
|
2491
|
5
|
50
|
|
|
|
|
const char *restrict oname = HePV(he, ol); |
|
2492
|
5
|
|
|
|
|
|
SV *restrict oval = HeVAL(he); |
|
2493
|
5
|
100
|
|
|
|
|
C2C_DECODE_OPT(oname, ol, oval); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2494
|
|
|
|
|
|
|
} |
|
2495
|
3
|
|
|
|
|
|
cols_eff = &PL_sv_undef; |
|
2496
|
45
|
100
|
|
|
|
|
} else if (items > 3) { |
|
2497
|
18
|
100
|
|
|
|
|
if ((items - 3) & 1) croak("col2col: trailing options must be name => value pairs"); |
|
2498
|
33
|
100
|
|
|
|
|
for (int oi = 3; oi < items; oi += 2) { |
|
2499
|
|
|
|
|
|
|
STRLEN ol; |
|
2500
|
18
|
|
|
|
|
|
const char *restrict oname = SvPV(ST(oi), ol); |
|
2501
|
18
|
|
|
|
|
|
SV *restrict oval = ST(oi + 1); |
|
2502
|
18
|
100
|
|
|
|
|
C2C_DECODE_OPT(oname, ol, oval); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2503
|
|
|
|
|
|
|
} |
|
2504
|
|
|
|
|
|
|
} |
|
2505
|
45
|
100
|
|
|
|
|
if (na_set && rm_set) croak("col2col: give na or rm.undef, not both"); |
|
|
|
100
|
|
|
|
|
|
|
2506
|
|
|
|
|
|
|
#undef C2C_DECODE_OPT |
|
2507
|
|
|
|
|
|
|
// 1. resolve the command: a CODE block or a function name. Either way |
|
2508
|
|
|
|
|
|
|
// we end up with the CV to call as $cv->($col_a, $col_b). |
|
2509
|
44
|
100
|
|
|
|
|
if (SvROK(cmd) && SvTYPE(SvRV(cmd)) == SVt_PVCV) cv_sv = SvRV(cmd); |
|
|
|
100
|
|
|
|
|
|
|
2510
|
4
|
100
|
|
|
|
|
else if (SvOK(cmd) && !SvROK(cmd)) { |
|
|
|
100
|
|
|
|
|
|
|
2511
|
|
|
|
|
|
|
STRLEN nl; |
|
2512
|
2
|
|
|
|
|
|
const char *restrict name = SvPV(cmd, nl); |
|
2513
|
2
|
50
|
|
|
|
|
SV *restrict fq = strstr(name, "::") ? newSVpvn(name, nl) : newSVpvf("Stats::LikeR::%s", name); |
|
2514
|
2
|
|
|
|
|
|
CV *restrict cv = get_cv(SvPV_nolen(fq), 0); |
|
2515
|
2
|
|
|
|
|
|
SvREFCNT_dec(fq); |
|
2516
|
2
|
100
|
|
|
|
|
if (!cv) croak("col2col: unknown function '%s'", name); |
|
2517
|
1
|
|
|
|
|
|
cv_sv = (SV*)cv; |
|
2518
|
2
|
|
|
|
|
|
} else croak("col2col: command must be a CODE ref or a function name"); |
|
2519
|
|
|
|
|
|
|
// 2. detect the data shape and build per-column value/defined tables. |
|
2520
|
41
|
100
|
|
|
|
|
if (!SvROK(data)) croak("col2col: data must be a reference"); |
|
2521
|
|
|
|
|
|
|
{ |
|
2522
|
40
|
|
|
|
|
|
SV *restrict rv = SvRV(data); |
|
2523
|
|
|
|
|
|
|
short int kind; |
|
2524
|
40
|
100
|
|
|
|
|
if (SvTYPE(rv) == SVt_PVAV) kind = 1; |
|
2525
|
38
|
50
|
|
|
|
|
else if (SvTYPE(rv) == SVt_PVHV) { |
|
2526
|
38
|
|
|
|
|
|
HV *restrict h = (HV*)rv; |
|
2527
|
38
|
|
|
|
|
|
hv_iterinit(h); |
|
2528
|
38
|
|
|
|
|
|
HE *restrict e = hv_iternext(h); |
|
2529
|
38
|
50
|
|
|
|
|
if (!e) croak("col2col: empty data hash"); |
|
2530
|
38
|
|
|
|
|
|
SV *restrict first = hv_iterval(h, e); |
|
2531
|
38
|
50
|
|
|
|
|
if (SvROK(first) && SvTYPE(SvRV(first)) == SVt_PVAV) kind = 0; |
|
|
|
100
|
|
|
|
|
|
|
2532
|
1
|
50
|
|
|
|
|
else if (SvROK(first) && SvTYPE(SvRV(first)) == SVt_PVHV) kind = 2; |
|
|
|
50
|
|
|
|
|
|
|
2533
|
0
|
|
|
|
|
|
else croak("col2col: hash values must be array refs (HoA) or hash refs (HoH)"); |
|
2534
|
|
|
|
|
|
|
} |
|
2535
|
0
|
|
|
|
|
|
else croak("col2col: data must be an array ref or hash ref"); |
|
2536
|
40
|
100
|
|
|
|
|
if (kind == 0) { |
|
2537
|
|
|
|
|
|
|
// hash of arrays: names = keys, rows = longest column. |
|
2538
|
37
|
|
|
|
|
|
HV *restrict h = (HV*)rv; |
|
2539
|
37
|
|
|
|
|
|
AV **restrict src = NULL; |
|
2540
|
|
|
|
|
|
|
HE *restrict e; |
|
2541
|
37
|
|
|
|
|
|
hv_iterinit(h); |
|
2542
|
129
|
100
|
|
|
|
|
while ((e = hv_iternext(h))) { |
|
2543
|
92
|
|
|
|
|
|
SV *restrict val = hv_iterval(h, e); |
|
2544
|
92
|
50
|
|
|
|
|
if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV) continue; |
|
|
|
50
|
|
|
|
|
|
|
2545
|
92
|
|
|
|
|
|
av_push(names_av, newSVsv(hv_iterkeysv(e))); |
|
2546
|
92
|
|
|
|
|
|
AV *restrict a = (AV*)SvRV(val); |
|
2547
|
92
|
|
|
|
|
|
size_t len = (size_t)(av_len(a) + 1); |
|
2548
|
92
|
100
|
|
|
|
|
if (len > nrows) nrows = len; |
|
2549
|
92
|
50
|
|
|
|
|
Renew(src, av_len(names_av) + 1, AV*); |
|
2550
|
92
|
|
|
|
|
|
src[av_len(names_av)] = a; |
|
2551
|
|
|
|
|
|
|
} |
|
2552
|
37
|
|
|
|
|
|
ncols = (size_t)(av_len(names_av) + 1); |
|
2553
|
37
|
50
|
|
|
|
|
Newxz(col_val, ncols ? ncols : 1, NV*); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2554
|
37
|
50
|
|
|
|
|
Newxz(col_def, ncols ? ncols : 1, char*); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2555
|
129
|
100
|
|
|
|
|
for (size_t cc = 0; cc < ncols; cc++) { |
|
2556
|
92
|
50
|
|
|
|
|
Newxz(col_val[cc], nrows ? nrows : 1, NV); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2557
|
92
|
50
|
|
|
|
|
Newxz(col_def[cc], nrows ? nrows : 1, char); |
|
2558
|
92
|
|
|
|
|
|
AV *restrict a = src[cc]; |
|
2559
|
518
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) { |
|
2560
|
|
|
|
|
|
|
NV v; |
|
2561
|
426
|
100
|
|
|
|
|
if (c2c_num(aTHX_ av_fetch(a, (SSize_t)r, 0), &v)) { col_val[cc][r] = v; col_def[cc][r] = 1; } |
|
2562
|
|
|
|
|
|
|
} |
|
2563
|
|
|
|
|
|
|
} |
|
2564
|
37
|
|
|
|
|
|
Safefree(src); |
|
2565
|
|
|
|
|
|
|
} else { |
|
2566
|
|
|
|
|
|
|
// row-major (array of hashes / hash of hashes): union of keys. |
|
2567
|
3
|
|
|
|
|
|
HV **restrict row_hv = NULL; |
|
2568
|
3
|
100
|
|
|
|
|
if (kind == 1) { |
|
2569
|
2
|
|
|
|
|
|
AV *restrict a = (AV*)rv; |
|
2570
|
2
|
|
|
|
|
|
nrows = (size_t)(av_len(a) + 1); |
|
2571
|
2
|
50
|
|
|
|
|
Newxz(row_hv, nrows ? nrows : 1, HV*); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2572
|
10
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) { |
|
2573
|
8
|
|
|
|
|
|
SV **restrict ep = av_fetch(a, (SSize_t)r, 0); |
|
2574
|
8
|
50
|
|
|
|
|
if (ep && *ep && SvROK(*ep) && SvTYPE(SvRV(*ep)) == SVt_PVHV) row_hv[r] = (HV*)SvRV(*ep); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2575
|
|
|
|
|
|
|
} |
|
2576
|
|
|
|
|
|
|
} else { |
|
2577
|
1
|
|
|
|
|
|
HV *restrict h = (HV*)rv; |
|
2578
|
|
|
|
|
|
|
HE *restrict e; |
|
2579
|
1
|
|
|
|
|
|
size_t r = 0; |
|
2580
|
1
|
50
|
|
|
|
|
nrows = (size_t)HvKEYS(h); |
|
2581
|
1
|
50
|
|
|
|
|
Newxz(row_hv, nrows ? nrows : 1, HV*); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2582
|
1
|
|
|
|
|
|
hv_iterinit(h); |
|
2583
|
6
|
100
|
|
|
|
|
while ((e = hv_iternext(h)) && r < nrows) { |
|
|
|
50
|
|
|
|
|
|
|
2584
|
5
|
|
|
|
|
|
SV *restrict val = hv_iterval(h, e); |
|
2585
|
5
|
50
|
|
|
|
|
if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) row_hv[r] = (HV*)SvRV(val); |
|
|
|
50
|
|
|
|
|
|
|
2586
|
5
|
|
|
|
|
|
r++; |
|
2587
|
|
|
|
|
|
|
} |
|
2588
|
|
|
|
|
|
|
} |
|
2589
|
|
|
|
|
|
|
{ |
|
2590
|
3
|
|
|
|
|
|
HV *restrict seen = newHV(); |
|
2591
|
16
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) { |
|
2592
|
13
|
100
|
|
|
|
|
if (!row_hv[r]) continue; |
|
2593
|
|
|
|
|
|
|
HE *restrict e; |
|
2594
|
10
|
|
|
|
|
|
hv_iterinit(row_hv[r]); |
|
2595
|
40
|
100
|
|
|
|
|
while ((e = hv_iternext(row_hv[r]))) { |
|
2596
|
|
|
|
|
|
|
STRLEN kl; |
|
2597
|
30
|
50
|
|
|
|
|
char *restrict k = HePV(e, kl); |
|
2598
|
30
|
100
|
|
|
|
|
if (!hv_exists(seen, k, kl)) { (void)hv_store(seen, k, kl, &PL_sv_yes, 0); av_push(names_av, newSVsv(hv_iterkeysv(e))); } |
|
2599
|
|
|
|
|
|
|
} |
|
2600
|
|
|
|
|
|
|
} |
|
2601
|
3
|
|
|
|
|
|
SvREFCNT_dec((SV*)seen); |
|
2602
|
|
|
|
|
|
|
} |
|
2603
|
3
|
|
|
|
|
|
ncols = (size_t)(av_len(names_av) + 1); |
|
2604
|
3
|
100
|
|
|
|
|
Newxz(col_val, ncols ? ncols : 1, double*); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2605
|
3
|
100
|
|
|
|
|
Newxz(col_def, ncols ? ncols : 1, char*); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
2606
|
9
|
100
|
|
|
|
|
for (size_t cc = 0; cc < ncols; cc++) { |
|
2607
|
|
|
|
|
|
|
STRLEN kl; |
|
2608
|
6
|
|
|
|
|
|
char *restrict k = SvPV(*av_fetch(names_av, (SSize_t)cc, 0), kl); |
|
2609
|
6
|
50
|
|
|
|
|
Newxz(col_val[cc], nrows ? nrows : 1, double); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2610
|
6
|
50
|
|
|
|
|
Newxz(col_def[cc], nrows ? nrows : 1, char); |
|
2611
|
36
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) { |
|
2612
|
|
|
|
|
|
|
double v; |
|
2613
|
30
|
50
|
|
|
|
|
if (!row_hv[r]) continue; |
|
2614
|
30
|
50
|
|
|
|
|
if (c2c_num(aTHX_ hv_fetch(row_hv[r], k, kl, 0), &v)) { col_val[cc][r] = v; col_def[cc][r] = 1; } |
|
2615
|
|
|
|
|
|
|
} |
|
2616
|
|
|
|
|
|
|
} |
|
2617
|
3
|
|
|
|
|
|
Safefree(row_hv); |
|
2618
|
|
|
|
|
|
|
} |
|
2619
|
|
|
|
|
|
|
} |
|
2620
|
40
|
100
|
|
|
|
|
if (ncols == 0) croak("col2col: no usable columns found"); |
|
2621
|
|
|
|
|
|
|
// 3. flatten the column names for fast hv_store keys in the loop. |
|
2622
|
|
|
|
|
|
|
SV **restrict col_names; |
|
2623
|
|
|
|
|
|
|
STRLEN *restrict name_len; |
|
2624
|
39
|
50
|
|
|
|
|
Newx(col_names, ncols, SV*); |
|
2625
|
39
|
50
|
|
|
|
|
Newx(name_len, ncols, STRLEN); |
|
2626
|
137
|
100
|
|
|
|
|
for (size_t cc = 0; cc < ncols; cc++) { |
|
2627
|
98
|
|
|
|
|
|
col_names[cc] = *av_fetch(names_av, (SSize_t)cc, 0); |
|
2628
|
98
|
|
|
|
|
|
(void)SvPV(col_names[cc], name_len[cc]); |
|
2629
|
|
|
|
|
|
|
} |
|
2630
|
|
|
|
|
|
|
// 3b. decide which columns may be col_a (the outer/"from" side). With no |
|
2631
|
|
|
|
|
|
|
// restriction every column qualifies; a name or list narrows it. |
|
2632
|
|
|
|
|
|
|
char *restrict is_outer; |
|
2633
|
39
|
|
|
|
|
|
Newxz(is_outer, ncols, char); |
|
2634
|
39
|
100
|
|
|
|
|
if (!SvOK(cols_eff)) { |
|
2635
|
118
|
100
|
|
|
|
|
for (size_t cc = 0; cc < ncols; cc++) is_outer[cc] = 1; |
|
2636
|
|
|
|
|
|
|
} |
|
2637
|
6
|
100
|
|
|
|
|
else if (SvROK(cols_eff) && SvTYPE(SvRV(cols_eff)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
2638
|
2
|
|
|
|
|
|
AV *restrict want = (AV*)SvRV(cols_eff); |
|
2639
|
2
|
|
|
|
|
|
SSize_t n = av_len(want) + 1; |
|
2640
|
5
|
100
|
|
|
|
|
for (SSize_t i = 0; i < n; i++) { |
|
2641
|
4
|
|
|
|
|
|
SV **restrict ep = av_fetch(want, i, 0); |
|
2642
|
|
|
|
|
|
|
STRLEN wl; |
|
2643
|
|
|
|
|
|
|
const char *restrict wname; |
|
2644
|
4
|
50
|
|
|
|
|
if (!ep || !*ep || !SvOK(*ep)) croak("col2col: column list contains an undefined entry"); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2645
|
4
|
|
|
|
|
|
wname = SvPV(*ep, wl); |
|
2646
|
4
|
100
|
|
|
|
|
if (!c2c_mark(col_names, name_len, ncols, wname, wl, is_outer)) croak("col2col: column '%s' not found in data", wname); |
|
2647
|
|
|
|
|
|
|
} |
|
2648
|
3
|
50
|
|
|
|
|
} else if (!SvROK(cols_eff)) { |
|
2649
|
|
|
|
|
|
|
STRLEN wl; |
|
2650
|
3
|
|
|
|
|
|
const char *restrict wname = SvPV(cols_eff, wl); |
|
2651
|
3
|
100
|
|
|
|
|
if (!c2c_mark(col_names, name_len, ncols, wname, wl, is_outer)) croak("col2col: column '%s' not found in data", wname); |
|
2652
|
0
|
|
|
|
|
|
} else croak("col2col: cols must be a column name or an array ref of names"); |
|
2653
|
|
|
|
|
|
|
// 4. each selected column vs every other column. The two columns reach |
|
2654
|
|
|
|
|
|
|
// the block as @_ = ($col_a, $col_b); how undef is handled depends on |
|
2655
|
|
|
|
|
|
|
// na (section 0): 'pairwise' drops a row missing in either side (equal |
|
2656
|
|
|
|
|
|
|
// aligned lengths, for cor); 'omit' drops each column's own undef |
|
2657
|
|
|
|
|
|
|
// independently (lengths may differ, for t_test / kruskal_test); |
|
2658
|
|
|
|
|
|
|
// 'keep' passes every row through with undef in the gaps. |
|
2659
|
37
|
|
|
|
|
|
HV *restrict out_hv = newHV(); |
|
2660
|
127
|
100
|
|
|
|
|
for (size_t a = 0; a < ncols; a++) { |
|
2661
|
|
|
|
|
|
|
HV *restrict inner; |
|
2662
|
91
|
100
|
|
|
|
|
if (!is_outer[a]) continue; |
|
2663
|
87
|
|
|
|
|
|
inner = newHV(); |
|
2664
|
308
|
100
|
|
|
|
|
for (size_t b = 0; b < ncols; b++) { |
|
2665
|
|
|
|
|
|
|
AV *restrict ca, *restrict cb; |
|
2666
|
|
|
|
|
|
|
SV *restrict rv1, *restrict rv2, *restrict res; |
|
2667
|
222
|
100
|
|
|
|
|
if (a == b) continue; |
|
2668
|
136
|
|
|
|
|
|
ca = newAV(); |
|
2669
|
136
|
|
|
|
|
|
cb = newAV(); |
|
2670
|
136
|
100
|
|
|
|
|
if (na_mode == 0) { // pairwise complete: keep rows defined in both |
|
2671
|
648
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) |
|
2672
|
531
|
100
|
|
|
|
|
if (col_def[a][r] && col_def[b][r]) { av_push(ca, newSVnv(col_val[a][r])); av_push(cb, newSVnv(col_val[b][r])); } |
|
|
|
100
|
|
|
|
|
|
|
2673
|
19
|
100
|
|
|
|
|
} else if (na_mode == 1) { // omit: each column drops its own undef (lengths may differ) |
|
2674
|
44
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) if (col_def[a][r]) av_push(ca, newSVnv(col_val[a][r])); |
|
|
|
100
|
|
|
|
|
|
|
2675
|
44
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) if (col_def[b][r]) av_push(cb, newSVnv(col_val[b][r])); |
|
|
|
100
|
|
|
|
|
|
|
2676
|
|
|
|
|
|
|
} else { // keep: every row, undef passed through |
|
2677
|
66
|
100
|
|
|
|
|
for (size_t r = 0; r < nrows; r++) { |
|
2678
|
55
|
100
|
|
|
|
|
av_push(ca, col_def[a][r] ? newSVnv(col_val[a][r]) : newSV(0)); |
|
2679
|
55
|
100
|
|
|
|
|
av_push(cb, col_def[b][r] ? newSVnv(col_val[b][r]) : newSV(0)); |
|
2680
|
|
|
|
|
|
|
} |
|
2681
|
|
|
|
|
|
|
} |
|
2682
|
136
|
|
|
|
|
|
rv1 = newRV_noinc((SV*)ca); |
|
2683
|
136
|
|
|
|
|
|
rv2 = newRV_noinc((SV*)cb); |
|
2684
|
136
|
100
|
|
|
|
|
if (av_len(ca) < 0 || av_len(cb) < 0) { |
|
|
|
100
|
|
|
|
|
|
|
2685
|
2
|
|
|
|
|
|
res = newSV(0); // a column had no usable values for this pair |
|
2686
|
134
|
100
|
|
|
|
|
} else if (!skip_errors) { |
|
2687
|
5
|
|
|
|
|
|
res = c2c_call(aTHX_ cv_sv, rv1, rv2); // a croak here propagates |
|
2688
|
|
|
|
|
|
|
} else { |
|
2689
|
|
|
|
|
|
|
// skip.errors: run the block under eval; on a croak keep the |
|
2690
|
|
|
|
|
|
|
// first line of its message as this cell so the caller sees |
|
2691
|
|
|
|
|
|
|
// which pair failed and why instead of the whole call dying. |
|
2692
|
129
|
|
|
|
|
|
dSP; |
|
2693
|
|
|
|
|
|
|
int n; |
|
2694
|
129
|
|
|
|
|
|
ENTER; SAVETMPS; |
|
2695
|
129
|
50
|
|
|
|
|
PUSHMARK(SP); |
|
2696
|
129
|
50
|
|
|
|
|
XPUSHs(rv1); XPUSHs(rv2); |
|
|
|
50
|
|
|
|
|
|
|
2697
|
129
|
|
|
|
|
|
PUTBACK; |
|
2698
|
129
|
|
|
|
|
|
n = call_sv(cv_sv, G_SCALAR | G_EVAL); |
|
2699
|
129
|
|
|
|
|
|
SPAGAIN; |
|
2700
|
129
|
50
|
|
|
|
|
if (SvTRUE(ERRSV)) { |
|
|
|
100
|
|
|
|
|
|
|
2701
|
|
|
|
|
|
|
STRLEN el; |
|
2702
|
8
|
50
|
|
|
|
|
const char *restrict ep = SvPV(ERRSV, el); |
|
2703
|
8
|
|
|
|
|
|
STRLEN ll = 0; // length of the first line only |
|
2704
|
132
|
50
|
|
|
|
|
while (ll < el && ep[ll] != '\n' && ep[ll] != '\r') ll++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2705
|
8
|
|
|
|
|
|
res = newSVpvn(ep, ll); |
|
2706
|
8
|
50
|
|
|
|
|
if (n > 0) (void)POPs; // discard the undef G_SCALAR leaves |
|
2707
|
|
|
|
|
|
|
} else { |
|
2708
|
121
|
50
|
|
|
|
|
res = (n > 0) ? newSVsv(POPs) : newSV(0); |
|
2709
|
|
|
|
|
|
|
} |
|
2710
|
129
|
|
|
|
|
|
PUTBACK; |
|
2711
|
129
|
50
|
|
|
|
|
FREETMPS; LEAVE; |
|
2712
|
|
|
|
|
|
|
} |
|
2713
|
135
|
|
|
|
|
|
(void)hv_store(inner, SvPVX(col_names[b]), (I32)name_len[b], res, 0); |
|
2714
|
135
|
|
|
|
|
|
SvREFCNT_dec(rv1); |
|
2715
|
135
|
|
|
|
|
|
SvREFCNT_dec(rv2); |
|
2716
|
|
|
|
|
|
|
} |
|
2717
|
86
|
|
|
|
|
|
(void)hv_store(out_hv, SvPVX(col_names[a]), (I32)name_len[a], newRV_noinc((SV*)inner), 0); |
|
2718
|
|
|
|
|
|
|
} |
|
2719
|
|
|
|
|
|
|
// 5. tidy up. |
|
2720
|
125
|
100
|
|
|
|
|
for (size_t cc = 0; cc < ncols; cc++) { Safefree(col_val[cc]); Safefree(col_def[cc]); } |
|
2721
|
36
|
|
|
|
|
|
Safefree(col_val); Safefree(col_def); Safefree(col_names); |
|
2722
|
36
|
|
|
|
|
|
Safefree(name_len); Safefree(is_outer); SvREFCNT_dec((SV*)names_av); |
|
2723
|
36
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)out_hv); |
|
2724
|
|
|
|
|
|
|
} |
|
2725
|
|
|
|
|
|
|
OUTPUT: |
|
2726
|
|
|
|
|
|
|
RETVAL |
|
2727
|
|
|
|
|
|
|
|
|
2728
|
|
|
|
|
|
|
SV *oneway_test(data_ref, ...) |
|
2729
|
|
|
|
|
|
|
SV *data_ref |
|
2730
|
|
|
|
|
|
|
PREINIT: |
|
2731
|
6
|
|
|
|
|
|
HV *restrict in_hv = NULL; |
|
2732
|
6
|
|
|
|
|
|
AV *restrict in_av = NULL; |
|
2733
|
|
|
|
|
|
|
HE *restrict he; |
|
2734
|
6
|
|
|
|
|
|
bool var_equal = 0; |
|
2735
|
6
|
|
|
|
|
|
const char *restrict formula_str = NULL; |
|
2736
|
6
|
|
|
|
|
|
const char *restrict factor_name = "Group"; |
|
2737
|
6
|
|
|
|
|
|
char *lhs = NULL, *rhs = NULL; |
|
2738
|
6
|
|
|
|
|
|
NV *restrict flat = NULL; |
|
2739
|
6
|
|
|
|
|
|
size_t *restrict sizes = NULL; |
|
2740
|
6
|
|
|
|
|
|
char ** gnames = NULL; |
|
2741
|
6
|
|
|
|
|
|
NV *restrict gmeans = NULL; |
|
2742
|
6
|
|
|
|
|
|
size_t k = 0; |
|
2743
|
6
|
|
|
|
|
|
IV total_n = 0; |
|
2744
|
|
|
|
|
|
|
OneWayResult res; |
|
2745
|
|
|
|
|
|
|
HV *restrict ret_hv; |
|
2746
|
|
|
|
|
|
|
char errbuf[512]; |
|
2747
|
|
|
|
|
|
|
CODE: |
|
2748
|
|
|
|
|
|
|
// parse named arguments |
|
2749
|
10
|
100
|
|
|
|
|
for (I32 ai = 1; ai + 1 < items; ai += 2) { |
|
2750
|
4
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(ai)); |
|
2751
|
4
|
|
|
|
|
|
SV *restrict val = ST(ai + 1); |
|
2752
|
4
|
50
|
|
|
|
|
if (strEQ(key, "var_equal")) |
|
2753
|
0
|
|
|
|
|
|
var_equal = SvTRUE(val) ? 1 : 0; |
|
2754
|
4
|
50
|
|
|
|
|
else if (strEQ(key, "formula")) |
|
2755
|
4
|
|
|
|
|
|
formula_str = SvPV_nolen(val); |
|
2756
|
|
|
|
|
|
|
} |
|
2757
|
|
|
|
|
|
|
// validate data_ref and determine if it's an Array or Hash |
|
2758
|
6
|
50
|
|
|
|
|
if (!SvROK(data_ref)) |
|
2759
|
0
|
|
|
|
|
|
croak("oneway_test: first argument must be a hash or array reference"); |
|
2760
|
6
|
|
|
|
|
|
SV *restrict rv = SvRV(data_ref); |
|
2761
|
6
|
100
|
|
|
|
|
if (SvTYPE(rv) == SVt_PVHV) { |
|
2762
|
5
|
|
|
|
|
|
in_hv = (HV *)rv; |
|
2763
|
1
|
50
|
|
|
|
|
} else if (SvTYPE(rv) == SVt_PVAV) { |
|
2764
|
1
|
|
|
|
|
|
in_av = (AV *)rv; |
|
2765
|
|
|
|
|
|
|
} else { |
|
2766
|
0
|
|
|
|
|
|
croak("oneway_test: first argument must be a hash or array reference"); |
|
2767
|
|
|
|
|
|
|
} |
|
2768
|
6
|
100
|
|
|
|
|
if (in_av) { |
|
2769
|
|
|
|
|
|
|
// MODE 3 – Array of Arrays (AoA) |
|
2770
|
1
|
50
|
|
|
|
|
if (formula_str != NULL) |
|
2771
|
0
|
|
|
|
|
|
croak("oneway_test: formula mode is not supported with an array of arrays"); |
|
2772
|
|
|
|
|
|
|
|
|
2773
|
1
|
|
|
|
|
|
k = (size_t)av_len(in_av) + 1; |
|
2774
|
1
|
50
|
|
|
|
|
if (k < 2) |
|
2775
|
0
|
|
|
|
|
|
croak("oneway_test: need at least 2 groups, got %zu", k); |
|
2776
|
1
|
|
|
|
|
|
sizes = (size_t *)safemalloc(k * sizeof(size_t)); |
|
2777
|
1
|
|
|
|
|
|
gnames = (char **)safemalloc(k * sizeof(char *)); |
|
2778
|
|
|
|
|
|
|
// first pass: sizes, total_n, and generate index names |
|
2779
|
3
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
2780
|
2
|
|
|
|
|
|
SV **restrict val = av_fetch(in_av, (I32)g, 0); |
|
2781
|
2
|
50
|
|
|
|
|
if (!val || !*val || !SvROK(*val) || SvTYPE(SvRV(*val)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2782
|
0
|
|
|
|
|
|
croak("oneway_test: index %zu is not an array reference", g); |
|
2783
|
2
|
|
|
|
|
|
IV len = av_len((AV *)SvRV(*val)) + 1; |
|
2784
|
2
|
50
|
|
|
|
|
if (len < 2) |
|
2785
|
0
|
|
|
|
|
|
croak("oneway_test: index %zu has fewer than 2 observations", g); |
|
2786
|
2
|
|
|
|
|
|
sizes[g] = (size_t)len; |
|
2787
|
2
|
|
|
|
|
|
total_n += (IV)len; |
|
2788
|
|
|
|
|
|
|
/* synthesize group names: "Index 0", "Index 1", ... to match 0-based index */ |
|
2789
|
|
|
|
|
|
|
char buf[64]; |
|
2790
|
2
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "Index %zu", g); |
|
2791
|
2
|
|
|
|
|
|
size_t klen = strlen(buf); |
|
2792
|
2
|
|
|
|
|
|
gnames[g] = (char *)safemalloc(klen + 1); |
|
2793
|
2
|
|
|
|
|
|
memcpy(gnames[g], buf, klen + 1); |
|
2794
|
|
|
|
|
|
|
} |
|
2795
|
|
|
|
|
|
|
// second pass: fill flat array |
|
2796
|
1
|
|
|
|
|
|
flat = (NV *)safemalloc((size_t)total_n * sizeof(NV)); |
|
2797
|
1
|
|
|
|
|
|
size_t offset = 0; |
|
2798
|
3
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
2799
|
2
|
|
|
|
|
|
SV **restrict val = av_fetch(in_av, (I32)g, 0); |
|
2800
|
2
|
|
|
|
|
|
AV *restrict av = (AV *)SvRV(*val); |
|
2801
|
2
|
|
|
|
|
|
IV len = av_len(av) + 1; |
|
2802
|
14
|
100
|
|
|
|
|
for (IV i = 0; i < len; i++) { |
|
2803
|
12
|
|
|
|
|
|
SV **restrict svp = av_fetch(av, i, 0); |
|
2804
|
12
|
50
|
|
|
|
|
flat[offset++] = (svp && *svp) ? SvNV(*svp) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
2805
|
|
|
|
|
|
|
} |
|
2806
|
|
|
|
|
|
|
} |
|
2807
|
5
|
100
|
|
|
|
|
} else if (formula_str != NULL) {// MODE 2 – formula "response ~ factor" |
|
2808
|
4
|
100
|
|
|
|
|
if (!parse_formula(formula_str, &lhs, &rhs)) |
|
2809
|
1
|
|
|
|
|
|
croak("oneway_test: cannot parse formula '%s' — " |
|
2810
|
|
|
|
|
|
|
"expected 'response ~ factor'", formula_str); |
|
2811
|
3
|
|
|
|
|
|
factor_name = rhs; /* use the actual factor variable name */ |
|
2812
|
3
|
|
|
|
|
|
SV **restrict resp_svp = hv_fetch(in_hv, lhs, (I32)strlen(lhs), 0); |
|
2813
|
3
|
100
|
|
|
|
|
if (!resp_svp || !*resp_svp || !SvROK(*resp_svp) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2814
|
2
|
50
|
|
|
|
|
|| SvTYPE(SvRV(*resp_svp)) != SVt_PVAV) |
|
2815
|
1
|
|
|
|
|
|
croak("oneway_test: formula LHS '%s' not found as an array ref " |
|
2816
|
|
|
|
|
|
|
"in the hash", lhs); |
|
2817
|
2
|
|
|
|
|
|
SV **restrict fact_svp = hv_fetch(in_hv, rhs, (I32)strlen(rhs), 0); |
|
2818
|
2
|
50
|
|
|
|
|
if (!fact_svp || !*fact_svp || !SvROK(*fact_svp) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2819
|
2
|
50
|
|
|
|
|
|| SvTYPE(SvRV(*fact_svp)) != SVt_PVAV) |
|
2820
|
0
|
|
|
|
|
|
croak("oneway_test: formula RHS '%s' not found as an array ref " |
|
2821
|
|
|
|
|
|
|
"in the hash", rhs); |
|
2822
|
2
|
|
|
|
|
|
AV *restrict resp_av = (AV *)SvRV(*resp_svp); |
|
2823
|
2
|
|
|
|
|
|
AV *restrict label_av = (AV *)SvRV(*fact_svp); |
|
2824
|
2
|
|
|
|
|
|
IV n = av_len(resp_av) + 1; |
|
2825
|
2
|
|
|
|
|
|
flat = (NV *)safemalloc((size_t)n * sizeof(NV)); |
|
2826
|
2
|
|
|
|
|
|
sizes = (size_t *)safemalloc((size_t)n * sizeof(size_t)); |
|
2827
|
2
|
100
|
|
|
|
|
if (!build_groups_from_formula(aTHX_ resp_av, label_av, |
|
2828
|
|
|
|
|
|
|
flat, sizes, &k, &gnames, |
|
2829
|
|
|
|
|
|
|
errbuf, sizeof errbuf)) { |
|
2830
|
1
|
|
|
|
|
|
Safefree(flat); Safefree(sizes); Safefree(lhs); Safefree(rhs); |
|
2831
|
1
|
|
|
|
|
|
croak("oneway_test: %s", errbuf); |
|
2832
|
|
|
|
|
|
|
} |
|
2833
|
3
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) total_n += (IV)sizes[g]; |
|
2834
|
|
|
|
|
|
|
} else { |
|
2835
|
|
|
|
|
|
|
/* MODE 1 – hash of groups { label => \@observations, … } */ |
|
2836
|
1
|
|
|
|
|
|
k = (size_t)hv_iterinit(in_hv); |
|
2837
|
1
|
50
|
|
|
|
|
if (k < 2) |
|
2838
|
0
|
|
|
|
|
|
croak("oneway_test: need at least 2 groups, got %zu", k); |
|
2839
|
1
|
|
|
|
|
|
sizes = (size_t *)safemalloc(k * sizeof(size_t)); |
|
2840
|
1
|
|
|
|
|
|
gnames = (char **)safemalloc(k * sizeof(char *)); |
|
2841
|
|
|
|
|
|
|
/* first pass: sizes, total_n, and group name strings */ |
|
2842
|
|
|
|
|
|
|
{ |
|
2843
|
1
|
|
|
|
|
|
size_t g = 0; |
|
2844
|
3
|
100
|
|
|
|
|
while ((he = hv_iternext(in_hv)) != NULL) { |
|
2845
|
2
|
|
|
|
|
|
SV *restrict val = HeVAL(he); |
|
2846
|
2
|
50
|
|
|
|
|
if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
2847
|
0
|
0
|
|
|
|
|
croak("oneway_test: value for group '%s' is not an array ref", |
|
2848
|
|
|
|
|
|
|
HePV(he, PL_na)); |
|
2849
|
2
|
|
|
|
|
|
IV len = av_len((AV *)SvRV(val)) + 1; |
|
2850
|
2
|
50
|
|
|
|
|
if (len < 2) |
|
2851
|
0
|
0
|
|
|
|
|
croak("oneway_test: group '%s' has fewer than 2 observations", |
|
2852
|
|
|
|
|
|
|
HePV(he, PL_na)); |
|
2853
|
2
|
|
|
|
|
|
sizes[g] = (size_t)len; |
|
2854
|
2
|
|
|
|
|
|
total_n += (IV)len; |
|
2855
|
|
|
|
|
|
|
/* save a copy of the key string */ |
|
2856
|
|
|
|
|
|
|
STRLEN klen; |
|
2857
|
2
|
50
|
|
|
|
|
const char *kstr = HePV(he, klen); |
|
2858
|
2
|
|
|
|
|
|
gnames[g] = (char *)safemalloc(klen + 1); |
|
2859
|
2
|
|
|
|
|
|
memcpy(gnames[g], kstr, klen + 1); |
|
2860
|
2
|
|
|
|
|
|
g++; |
|
2861
|
|
|
|
|
|
|
} |
|
2862
|
|
|
|
|
|
|
} |
|
2863
|
|
|
|
|
|
|
// second pass: fill flat in the same iteration order |
|
2864
|
1
|
|
|
|
|
|
flat = (NV *)safemalloc((size_t)total_n * sizeof(NV)); |
|
2865
|
|
|
|
|
|
|
{ |
|
2866
|
1
|
|
|
|
|
|
size_t offset = 0; |
|
2867
|
1
|
|
|
|
|
|
hv_iterinit(in_hv); |
|
2868
|
3
|
100
|
|
|
|
|
while ((he = hv_iternext(in_hv)) != NULL) { |
|
2869
|
2
|
|
|
|
|
|
AV *restrict av = (AV *)SvRV(HeVAL(he)); |
|
2870
|
2
|
|
|
|
|
|
IV len = av_len(av) + 1; |
|
2871
|
14
|
100
|
|
|
|
|
for (IV i = 0; i < len; i++) { |
|
2872
|
12
|
|
|
|
|
|
SV **restrict svp = av_fetch(av, i, 0); |
|
2873
|
12
|
50
|
|
|
|
|
flat[offset++] = (svp && *svp) ? SvNV(*svp) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
2874
|
|
|
|
|
|
|
} |
|
2875
|
|
|
|
|
|
|
} |
|
2876
|
|
|
|
|
|
|
} |
|
2877
|
|
|
|
|
|
|
} |
|
2878
|
|
|
|
|
|
|
// per-group means from flat (before c_oneway_test frees nothing) |
|
2879
|
3
|
|
|
|
|
|
gmeans = (NV *)safemalloc(k * sizeof(NV)); |
|
2880
|
|
|
|
|
|
|
{ |
|
2881
|
3
|
|
|
|
|
|
size_t offset = 0; |
|
2882
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
2883
|
6
|
|
|
|
|
|
NV sum = 0.0; |
|
2884
|
36
|
100
|
|
|
|
|
for (size_t i = 0; i < sizes[g]; i++) sum += flat[offset + i]; |
|
2885
|
6
|
|
|
|
|
|
gmeans[g] = sum / (NV)sizes[g]; |
|
2886
|
6
|
|
|
|
|
|
offset += sizes[g]; |
|
2887
|
|
|
|
|
|
|
} |
|
2888
|
|
|
|
|
|
|
} |
|
2889
|
|
|
|
|
|
|
// run the arithmetic |
|
2890
|
3
|
|
|
|
|
|
res = c_oneway_test(flat, sizes, k, var_equal); |
|
2891
|
3
|
|
|
|
|
|
Safefree(flat); |
|
2892
|
3
|
100
|
|
|
|
|
if (lhs) Safefree(lhs); |
|
2893
|
|
|
|
|
|
|
/* rhs kept alive as factor_name until after output */ |
|
2894
|
|
|
|
|
|
|
/* ── build return hash ref |
|
2895
|
|
|
|
|
|
|
* { * |
|
2896
|
|
|
|
|
|
|
* => { Df, "Sum Sq", "Mean Sq", "F value", "Pr(>F)" } * |
|
2897
|
|
|
|
|
|
|
* Residuals => { Df, "Sum Sq", "Mean Sq" } * |
|
2898
|
|
|
|
|
|
|
* group_stats => { mean => { g => v, … }, size => { g => n, … } } * |
|
2899
|
|
|
|
|
|
|
* }*/ |
|
2900
|
3
|
|
|
|
|
|
ret_hv = (HV *)sv_2mortal((SV *)newHV()); |
|
2901
|
|
|
|
|
|
|
/* Group (factor) sub-hash */ |
|
2902
|
|
|
|
|
|
|
{ |
|
2903
|
3
|
|
|
|
|
|
HV *restrict g_hv = newHV(); |
|
2904
|
3
|
|
|
|
|
|
hv_stores(g_hv, "Df", newSVnv(res.num_df)); |
|
2905
|
3
|
|
|
|
|
|
hv_stores(g_hv, "Sum Sq", newSVnv(res.ss_between)); |
|
2906
|
3
|
|
|
|
|
|
hv_stores(g_hv, "Mean Sq", newSVnv(res.ms_between)); |
|
2907
|
3
|
|
|
|
|
|
hv_stores(g_hv, "F value", newSVnv(res.statistic)); |
|
2908
|
3
|
|
|
|
|
|
hv_stores(g_hv, "Pr(>F)", newSVnv(res.p_value)); |
|
2909
|
3
|
|
|
|
|
|
hv_store(ret_hv, factor_name, (I32)strlen(factor_name), |
|
2910
|
|
|
|
|
|
|
newRV_noinc((SV *)g_hv), 0); |
|
2911
|
|
|
|
|
|
|
} |
|
2912
|
|
|
|
|
|
|
/* Residuals sub-hash */ |
|
2913
|
|
|
|
|
|
|
{ |
|
2914
|
3
|
|
|
|
|
|
HV *restrict r_hv = newHV(); |
|
2915
|
3
|
|
|
|
|
|
hv_stores(r_hv, "Df", newSVnv(res.denom_df)); |
|
2916
|
3
|
|
|
|
|
|
hv_stores(r_hv, "Sum Sq", newSVnv(res.ss_within)); |
|
2917
|
3
|
|
|
|
|
|
hv_stores(r_hv, "Mean Sq", newSVnv(res.ms_within)); |
|
2918
|
3
|
|
|
|
|
|
hv_stores(ret_hv, "Residuals", newRV_noinc((SV *)r_hv)); |
|
2919
|
|
|
|
|
|
|
} |
|
2920
|
|
|
|
|
|
|
/* group_stats sub-hash */ |
|
2921
|
|
|
|
|
|
|
{ |
|
2922
|
3
|
|
|
|
|
|
HV *restrict gs_hv = newHV(); |
|
2923
|
3
|
|
|
|
|
|
HV *restrict mean_hv = newHV(); |
|
2924
|
3
|
|
|
|
|
|
HV *restrict size_hv = newHV(); |
|
2925
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) { |
|
2926
|
6
|
|
|
|
|
|
const char *restrict gn = gnames[g]; |
|
2927
|
6
|
|
|
|
|
|
I32 gnl = (I32)strlen(gn); |
|
2928
|
6
|
|
|
|
|
|
hv_store(mean_hv, gn, gnl, newSVnv(gmeans[g]), 0); |
|
2929
|
6
|
|
|
|
|
|
hv_store(size_hv, gn, gnl, newSViv((IV)sizes[g]), 0); |
|
2930
|
|
|
|
|
|
|
} |
|
2931
|
3
|
|
|
|
|
|
hv_stores(gs_hv, "mean", newRV_noinc((SV *)mean_hv)); |
|
2932
|
3
|
|
|
|
|
|
hv_stores(gs_hv, "size", newRV_noinc((SV *)size_hv)); |
|
2933
|
3
|
|
|
|
|
|
hv_stores(ret_hv, "group_stats", newRV_noinc((SV *)gs_hv)); |
|
2934
|
|
|
|
|
|
|
} |
|
2935
|
|
|
|
|
|
|
// clean up |
|
2936
|
3
|
|
|
|
|
|
Safefree(gmeans); Safefree(sizes); |
|
2937
|
9
|
100
|
|
|
|
|
for (size_t g = 0; g < k; g++) Safefree(gnames[g]); |
|
2938
|
3
|
|
|
|
|
|
Safefree(gnames); |
|
2939
|
3
|
100
|
|
|
|
|
if (rhs) Safefree(rhs); |
|
2940
|
|
|
|
|
|
|
// freed here, after factor_name is no longer needed |
|
2941
|
3
|
|
|
|
|
|
RETVAL = newRV((SV *)ret_hv); |
|
2942
|
|
|
|
|
|
|
OUTPUT: |
|
2943
|
|
|
|
|
|
|
RETVAL |
|
2944
|
|
|
|
|
|
|
|
|
2945
|
|
|
|
|
|
|
SV* ks_test(...) |
|
2946
|
|
|
|
|
|
|
CODE: |
|
2947
|
|
|
|
|
|
|
{ |
|
2948
|
10
|
|
|
|
|
|
SV *restrict x_sv = NULL, *restrict y_sv = NULL; |
|
2949
|
10
|
|
|
|
|
|
short int exact = -1; |
|
2950
|
10
|
|
|
|
|
|
const char *restrict alternative = "two.sided"; |
|
2951
|
10
|
|
|
|
|
|
int arg_idx = 0; |
|
2952
|
|
|
|
|
|
|
|
|
2953
|
|
|
|
|
|
|
// Shift arrays if provided positionally |
|
2954
|
10
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2955
|
10
|
|
|
|
|
|
x_sv = ST(arg_idx); |
|
2956
|
10
|
|
|
|
|
|
arg_idx++; |
|
2957
|
|
|
|
|
|
|
} |
|
2958
|
|
|
|
|
|
|
// Check if second argument is an array (2-sample) or a string representing a CDF (1-sample) |
|
2959
|
10
|
50
|
|
|
|
|
if (arg_idx < items) { |
|
2960
|
10
|
100
|
|
|
|
|
if (SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
2961
|
9
|
|
|
|
|
|
y_sv = ST(arg_idx); |
|
2962
|
9
|
|
|
|
|
|
arg_idx++; |
|
2963
|
1
|
50
|
|
|
|
|
} else if (SvPOK(ST(arg_idx))) { |
|
2964
|
1
|
|
|
|
|
|
y_sv = ST(arg_idx); // Save string (e.g., "pnorm") for 1-sample test logic |
|
2965
|
1
|
|
|
|
|
|
arg_idx++; |
|
2966
|
|
|
|
|
|
|
} |
|
2967
|
|
|
|
|
|
|
} |
|
2968
|
|
|
|
|
|
|
// Parse named arguments |
|
2969
|
12
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
2970
|
2
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(arg_idx)); |
|
2971
|
2
|
|
|
|
|
|
SV *restrict val = ST(arg_idx + 1); |
|
2972
|
2
|
50
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
2973
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "y")) y_sv = val; |
|
2974
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "exact")) { |
|
2975
|
0
|
0
|
|
|
|
|
if (!SvOK(val)) exact = -1; |
|
2976
|
0
|
|
|
|
|
|
else exact = SvTRUE(val) ? 1 : 0; |
|
2977
|
|
|
|
|
|
|
} |
|
2978
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val); |
|
2979
|
0
|
|
|
|
|
|
else croak("ks_test: unknown argument '%s'", key); |
|
2980
|
|
|
|
|
|
|
} |
|
2981
|
|
|
|
|
|
|
|
|
2982
|
10
|
50
|
|
|
|
|
if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2983
|
0
|
|
|
|
|
|
croak("ks_test: 'x' is a required argument and must be an ARRAY reference"); |
|
2984
|
|
|
|
|
|
|
} |
|
2985
|
|
|
|
|
|
|
|
|
2986
|
10
|
|
|
|
|
|
bool is_two_sided = strEQ(alternative, "two.sided") ? 1 : 0; |
|
2987
|
10
|
|
|
|
|
|
bool is_greater = strEQ(alternative, "greater") ? 1 : 0; |
|
2988
|
10
|
|
|
|
|
|
bool is_less = strEQ(alternative, "less") ? 1 : 0; |
|
2989
|
|
|
|
|
|
|
|
|
2990
|
10
|
100
|
|
|
|
|
if (!is_two_sided && !is_greater && !is_less) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2991
|
0
|
|
|
|
|
|
croak("ks_test: alternative must be 'two.sided', 'less', or 'greater'"); |
|
2992
|
|
|
|
|
|
|
} |
|
2993
|
|
|
|
|
|
|
|
|
2994
|
10
|
|
|
|
|
|
AV *restrict x_av = (AV*)SvRV(x_sv); |
|
2995
|
10
|
|
|
|
|
|
size_t nx = av_len(x_av) + 1; |
|
2996
|
10
|
50
|
|
|
|
|
if (nx == 0) croak("Not enough 'x' observations"); |
|
2997
|
|
|
|
|
|
|
|
|
2998
|
|
|
|
|
|
|
// Extract 'x' array to C-array |
|
2999
|
10
|
|
|
|
|
|
NV *restrict x_data = (NV *)safemalloc(nx * sizeof(NV)); |
|
3000
|
10
|
|
|
|
|
|
size_t valid_nx = 0; |
|
3001
|
240
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
3002
|
230
|
|
|
|
|
|
SV**restrict el = av_fetch(x_av, i, 0); |
|
3003
|
230
|
50
|
|
|
|
|
if (el && SvOK(*el) && looks_like_number(*el)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3004
|
230
|
|
|
|
|
|
x_data[valid_nx++] = SvNV(*el); |
|
3005
|
|
|
|
|
|
|
} |
|
3006
|
|
|
|
|
|
|
} |
|
3007
|
10
|
|
|
|
|
|
NV statistic = 0.0, p_value = 0.0; |
|
3008
|
10
|
|
|
|
|
|
const char *restrict method_desc = ""; |
|
3009
|
|
|
|
|
|
|
// --- TWO SAMPLE --- |
|
3010
|
19
|
50
|
|
|
|
|
if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3011
|
9
|
|
|
|
|
|
AV *restrict y_av = (AV*)SvRV(y_sv); |
|
3012
|
9
|
|
|
|
|
|
size_t ny = av_len(y_av) + 1; |
|
3013
|
9
|
|
|
|
|
|
NV *restrict y_data = (NV *)safemalloc(ny * sizeof(NV)); |
|
3014
|
9
|
|
|
|
|
|
size_t valid_ny = 0; |
|
3015
|
129
|
100
|
|
|
|
|
for (size_t i = 0; i < ny; i++) { |
|
3016
|
120
|
|
|
|
|
|
SV**restrict el = av_fetch(y_av, i, 0); |
|
3017
|
120
|
50
|
|
|
|
|
if (el && SvOK(*el) && looks_like_number(*el)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3018
|
120
|
|
|
|
|
|
y_data[valid_ny++] = SvNV(*el); |
|
3019
|
|
|
|
|
|
|
} |
|
3020
|
|
|
|
|
|
|
} |
|
3021
|
9
|
50
|
|
|
|
|
if (valid_nx < 1 || valid_ny < 1) { |
|
|
|
50
|
|
|
|
|
|
|
3022
|
0
|
|
|
|
|
|
Safefree(x_data); Safefree(y_data); |
|
3023
|
0
|
|
|
|
|
|
croak("Not enough non-missing observations for KS test"); |
|
3024
|
|
|
|
|
|
|
} |
|
3025
|
|
|
|
|
|
|
NV d, d_plus, d_minus; |
|
3026
|
9
|
|
|
|
|
|
calc_2sample_stats(x_data, valid_nx, y_data, valid_ny, &d, &d_plus, &d_minus); |
|
3027
|
|
|
|
|
|
|
// Map alternative to the correct statistic |
|
3028
|
9
|
100
|
|
|
|
|
if (is_greater) statistic = d_plus; |
|
3029
|
8
|
100
|
|
|
|
|
else if (is_less) statistic = d_minus; |
|
3030
|
7
|
|
|
|
|
|
else statistic = d; |
|
3031
|
|
|
|
|
|
|
// Determine if exact or asymptotic |
|
3032
|
9
|
|
|
|
|
|
bool use_exact = FALSE; |
|
3033
|
9
|
50
|
|
|
|
|
if (exact == 1) use_exact = TRUE; |
|
3034
|
9
|
50
|
|
|
|
|
else if (exact == 0) use_exact = FALSE; |
|
3035
|
9
|
|
|
|
|
|
else use_exact = (valid_nx * valid_ny < 10000); |
|
3036
|
|
|
|
|
|
|
// Check for ties in combined set |
|
3037
|
9
|
|
|
|
|
|
size_t total_n = valid_nx + valid_ny; |
|
3038
|
9
|
|
|
|
|
|
NV *restrict comb = (NV *)safemalloc(total_n * sizeof(NV)); |
|
3039
|
189
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3040
|
129
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3041
|
9
|
|
|
|
|
|
qsort(comb, total_n, sizeof(NV), compare_NVs); |
|
3042
|
9
|
|
|
|
|
|
bool has_ties = FALSE; |
|
3043
|
300
|
100
|
|
|
|
|
for(size_t i = 1; i < total_n; i++) { |
|
3044
|
291
|
50
|
|
|
|
|
if(comb[i] == comb[i-1]) { has_ties = TRUE; break; } |
|
3045
|
|
|
|
|
|
|
} |
|
3046
|
9
|
|
|
|
|
|
Safefree(comb); |
|
3047
|
9
|
50
|
|
|
|
|
if (use_exact && has_ties) { |
|
|
|
50
|
|
|
|
|
|
|
3048
|
0
|
|
|
|
|
|
warn("ks_test: cannot compute exact p-value with ties; falling back to asymptotic"); |
|
3049
|
0
|
|
|
|
|
|
use_exact = FALSE; |
|
3050
|
|
|
|
|
|
|
} |
|
3051
|
9
|
50
|
|
|
|
|
if (use_exact) { |
|
3052
|
9
|
|
|
|
|
|
method_desc = "Two-sample Kolmogorov-Smirnov exact test"; |
|
3053
|
9
|
|
|
|
|
|
NV q = (0.5 + floor(statistic * valid_nx * valid_ny - 1e-7)) / ((NV)valid_nx * valid_ny); |
|
3054
|
9
|
|
|
|
|
|
p_value = psmirnov_exact_uniq_upper(q, valid_nx, valid_ny, is_two_sided); |
|
3055
|
|
|
|
|
|
|
} else { |
|
3056
|
0
|
|
|
|
|
|
method_desc = "Two-sample Kolmogorov-Smirnov test (asymptotic)"; |
|
3057
|
0
|
|
|
|
|
|
NV z = statistic * sqrt((NV)(valid_nx * valid_ny) / (valid_nx + valid_ny)); |
|
3058
|
0
|
0
|
|
|
|
|
if (is_two_sided) { |
|
3059
|
0
|
|
|
|
|
|
p_value = K2l(z, 0, 1e-9); |
|
3060
|
|
|
|
|
|
|
} else { |
|
3061
|
0
|
|
|
|
|
|
p_value = exp(-2.0 * z * z); // One-sided limit distribution |
|
3062
|
|
|
|
|
|
|
} |
|
3063
|
|
|
|
|
|
|
} |
|
3064
|
9
|
|
|
|
|
|
Safefree(y_data); |
|
3065
|
2
|
50
|
|
|
|
|
} else if (y_sv && SvPOK(y_sv)) {// --- ONE SAMPLE (e.g. against pnorm) --- |
|
|
|
50
|
|
|
|
|
|
|
3066
|
1
|
|
|
|
|
|
const char *restrict dist = SvPV_nolen(y_sv); |
|
3067
|
1
|
50
|
|
|
|
|
if (strEQ(dist, "pnorm")) { |
|
3068
|
1
|
|
|
|
|
|
qsort(x_data, valid_nx, sizeof(NV), compare_NVs); |
|
3069
|
1
|
|
|
|
|
|
NV max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0; |
|
3070
|
51
|
100
|
|
|
|
|
for(size_t i = 0; i < valid_nx; i++) { |
|
3071
|
50
|
|
|
|
|
|
NV cdf_obs_low = (NV)i / valid_nx; |
|
3072
|
50
|
|
|
|
|
|
NV cdf_obs_high = (NV)(i + 1) / valid_nx; |
|
3073
|
50
|
|
|
|
|
|
NV cdf_theor = approx_pnorm(x_data[i]); |
|
3074
|
50
|
|
|
|
|
|
NV diff1 = cdf_obs_low - cdf_theor; |
|
3075
|
50
|
|
|
|
|
|
NV diff2 = cdf_obs_high - cdf_theor; |
|
3076
|
50
|
50
|
|
|
|
|
if (diff1 > max_d_plus) max_d_plus = diff1; |
|
3077
|
50
|
100
|
|
|
|
|
if (diff2 > max_d_plus) max_d_plus = diff2; |
|
3078
|
50
|
100
|
|
|
|
|
if (-diff1 > max_d_minus) max_d_minus = -diff1; |
|
3079
|
50
|
50
|
|
|
|
|
if (-diff2 > max_d_minus) max_d_minus = -diff2; |
|
3080
|
50
|
100
|
|
|
|
|
if (fabs(diff1) > max_d) max_d = fabs(diff1); |
|
3081
|
50
|
50
|
|
|
|
|
if (fabs(diff2) > max_d) max_d = fabs(diff2); |
|
3082
|
|
|
|
|
|
|
} |
|
3083
|
1
|
50
|
|
|
|
|
if (is_greater) statistic = max_d_plus; |
|
3084
|
1
|
50
|
|
|
|
|
else if (is_less) statistic = max_d_minus; |
|
3085
|
1
|
|
|
|
|
|
else statistic = max_d; |
|
3086
|
1
|
50
|
|
|
|
|
bool use_exact = (exact == -1) ? (valid_nx < 100) : (exact == 1); |
|
3087
|
1
|
50
|
|
|
|
|
if (use_exact) { |
|
3088
|
1
|
|
|
|
|
|
method_desc = "One-sample Kolmogorov-Smirnov exact test"; |
|
3089
|
1
|
50
|
|
|
|
|
if (is_two_sided) { |
|
3090
|
1
|
|
|
|
|
|
p_value = 1.0 - K2x(valid_nx, statistic); |
|
3091
|
|
|
|
|
|
|
} else { |
|
3092
|
0
|
|
|
|
|
|
warn("exact 1-sample 1-sided KS test not implemented; using asymptotic"); |
|
3093
|
0
|
|
|
|
|
|
NV z = statistic * sqrt((NV)valid_nx); |
|
3094
|
0
|
|
|
|
|
|
p_value = exp(-2.0 * z * z); |
|
3095
|
|
|
|
|
|
|
} |
|
3096
|
|
|
|
|
|
|
} else { |
|
3097
|
0
|
|
|
|
|
|
method_desc = "One-sample Kolmogorov-Smirnov test (asymptotic)"; |
|
3098
|
0
|
|
|
|
|
|
NV z = statistic * sqrt((NV)valid_nx); |
|
3099
|
0
|
0
|
|
|
|
|
if (is_two_sided) p_value = K2l(z, 0, 1e-6); |
|
3100
|
0
|
|
|
|
|
|
else p_value = exp(-2.0 * z * z); |
|
3101
|
|
|
|
|
|
|
} |
|
3102
|
|
|
|
|
|
|
} else { |
|
3103
|
0
|
|
|
|
|
|
Safefree(x_data); |
|
3104
|
0
|
|
|
|
|
|
croak("ks_test: Unsupported 1-sample distribution '%s'. Use arrays for 2-sample.", dist); |
|
3105
|
|
|
|
|
|
|
} |
|
3106
|
|
|
|
|
|
|
} else { |
|
3107
|
0
|
|
|
|
|
|
Safefree(x_data); |
|
3108
|
0
|
|
|
|
|
|
croak("ks_test: Invalid arguments for 'y'."); |
|
3109
|
|
|
|
|
|
|
} |
|
3110
|
10
|
|
|
|
|
|
Safefree(x_data); |
|
3111
|
10
|
50
|
|
|
|
|
if (p_value > 1.0) p_value = 1.0; |
|
3112
|
10
|
50
|
|
|
|
|
if (p_value < 0.0) p_value = 0.0; |
|
3113
|
10
|
|
|
|
|
|
HV *restrict res = newHV(); |
|
3114
|
10
|
|
|
|
|
|
hv_stores(res, "statistic", newSVnv(statistic)); |
|
3115
|
10
|
|
|
|
|
|
hv_stores(res, "p_value", newSVnv(p_value)); |
|
3116
|
10
|
|
|
|
|
|
hv_stores(res, "method", newSVpv(method_desc, 0)); |
|
3117
|
10
|
|
|
|
|
|
hv_stores(res, "alternative", newSVpv(alternative, 0)); |
|
3118
|
10
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res); |
|
3119
|
|
|
|
|
|
|
} |
|
3120
|
|
|
|
|
|
|
OUTPUT: |
|
3121
|
|
|
|
|
|
|
RETVAL |
|
3122
|
|
|
|
|
|
|
|
|
3123
|
|
|
|
|
|
|
SV* wilcox_test(...) |
|
3124
|
|
|
|
|
|
|
CODE: |
|
3125
|
|
|
|
|
|
|
{ |
|
3126
|
10
|
|
|
|
|
|
SV *restrict x_sv = NULL, *restrict y_sv = NULL; |
|
3127
|
10
|
|
|
|
|
|
bool paired = FALSE, correct = TRUE; |
|
3128
|
10
|
|
|
|
|
|
NV mu = 0.0; |
|
3129
|
10
|
|
|
|
|
|
short int exact = -1; |
|
3130
|
10
|
|
|
|
|
|
const char *restrict alternative = "two.sided"; |
|
3131
|
10
|
|
|
|
|
|
int arg_idx = 0; |
|
3132
|
|
|
|
|
|
|
// 1. Shift first positional argument as 'x' if it's an array reference |
|
3133
|
10
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3134
|
2
|
|
|
|
|
|
x_sv = ST(arg_idx); |
|
3135
|
2
|
|
|
|
|
|
arg_idx++; |
|
3136
|
|
|
|
|
|
|
} |
|
3137
|
|
|
|
|
|
|
// 2. Shift second positional argument as 'y' if it's an array reference |
|
3138
|
10
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3139
|
2
|
|
|
|
|
|
y_sv = ST(arg_idx); |
|
3140
|
2
|
|
|
|
|
|
arg_idx++; |
|
3141
|
|
|
|
|
|
|
} |
|
3142
|
|
|
|
|
|
|
// Ensure the remaining arguments form complete key-value pairs |
|
3143
|
10
|
50
|
|
|
|
|
if ((items - arg_idx) % 2 != 0) { |
|
3144
|
0
|
|
|
|
|
|
croak("Usage: wilcox_test(\\@x, [\\@y], key => value, ...)"); |
|
3145
|
|
|
|
|
|
|
} |
|
3146
|
|
|
|
|
|
|
// --- Parse named arguments from the remaining flat stack --- |
|
3147
|
30
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
3148
|
20
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(arg_idx)); |
|
3149
|
20
|
|
|
|
|
|
SV *restrict val = ST(arg_idx + 1); |
|
3150
|
20
|
100
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
3151
|
13
|
100
|
|
|
|
|
else if (strEQ(key, "y")) y_sv = val; |
|
3152
|
6
|
100
|
|
|
|
|
else if (strEQ(key, "paired")) paired = SvTRUE(val); |
|
3153
|
3
|
50
|
|
|
|
|
else if (strEQ(key, "correct")) correct = SvTRUE(val); |
|
3154
|
3
|
100
|
|
|
|
|
else if (strEQ(key, "mu")) mu = SvNV(val); |
|
3155
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "exact")) { |
|
3156
|
0
|
0
|
|
|
|
|
if (!SvOK(val)) exact = -1; |
|
3157
|
0
|
|
|
|
|
|
else exact = SvTRUE(val) ? 1 : 0; |
|
3158
|
|
|
|
|
|
|
} |
|
3159
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val); |
|
3160
|
0
|
|
|
|
|
|
else croak("wilcox_test: unknown argument '%s'", key); |
|
3161
|
|
|
|
|
|
|
} |
|
3162
|
|
|
|
|
|
|
// --- Validate required / types --- |
|
3163
|
10
|
100
|
|
|
|
|
if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3164
|
1
|
|
|
|
|
|
croak("wilcox_test: 'x' is a required argument and must be an ARRAY reference"); |
|
3165
|
9
|
|
|
|
|
|
AV *restrict x_av = (AV*)SvRV(x_sv); |
|
3166
|
9
|
|
|
|
|
|
size_t nx = av_len(x_av) + 1; |
|
3167
|
9
|
50
|
|
|
|
|
if (nx == 0) croak("Not enough 'x' observations"); |
|
3168
|
|
|
|
|
|
|
|
|
3169
|
9
|
|
|
|
|
|
AV *restrict y_av = NULL; |
|
3170
|
9
|
|
|
|
|
|
size_t ny = 0; |
|
3171
|
9
|
100
|
|
|
|
|
if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3172
|
8
|
|
|
|
|
|
y_av = (AV*)SvRV(y_sv); |
|
3173
|
8
|
|
|
|
|
|
ny = av_len(y_av) + 1; |
|
3174
|
|
|
|
|
|
|
} |
|
3175
|
9
|
|
|
|
|
|
NV p_value = 0.0, statistic = 0.0; |
|
3176
|
9
|
|
|
|
|
|
const char *restrict method_desc = ""; |
|
3177
|
9
|
|
|
|
|
|
bool use_exact = FALSE; |
|
3178
|
|
|
|
|
|
|
// --- TWO SAMPLE (Mann-Whitney) --- |
|
3179
|
14
|
100
|
|
|
|
|
if (ny > 0 && !paired) { |
|
|
|
100
|
|
|
|
|
|
|
3180
|
5
|
|
|
|
|
|
RankInfo *restrict ri = (RankInfo *)safemalloc((nx + ny) * sizeof(RankInfo)); |
|
3181
|
5
|
|
|
|
|
|
size_t valid_nx = 0, valid_ny = 0; |
|
3182
|
33
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
3183
|
28
|
|
|
|
|
|
SV**restrict el = av_fetch(x_av, i, 0); |
|
3184
|
28
|
50
|
|
|
|
|
if (el && SvOK(*el) && looks_like_number(*el)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3185
|
28
|
|
|
|
|
|
ri[valid_nx].val = SvNV(*el) - mu; // R subtracts mu from x |
|
3186
|
28
|
|
|
|
|
|
ri[valid_nx].idx = 1; |
|
3187
|
28
|
|
|
|
|
|
valid_nx++; |
|
3188
|
|
|
|
|
|
|
} |
|
3189
|
|
|
|
|
|
|
} |
|
3190
|
33
|
100
|
|
|
|
|
for (size_t i = 0; i < ny; i++) { |
|
3191
|
28
|
|
|
|
|
|
SV**restrict el = av_fetch(y_av, i, 0); |
|
3192
|
28
|
50
|
|
|
|
|
if (el && SvOK(*el) && looks_like_number(*el)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3193
|
28
|
|
|
|
|
|
ri[valid_nx + valid_ny].val = SvNV(*el); |
|
3194
|
28
|
|
|
|
|
|
ri[valid_nx + valid_ny].idx = 2; |
|
3195
|
28
|
|
|
|
|
|
valid_ny++; |
|
3196
|
|
|
|
|
|
|
} |
|
3197
|
|
|
|
|
|
|
} |
|
3198
|
5
|
50
|
|
|
|
|
if (valid_nx == 0) { Safefree(ri); croak("not enough (non-missing) 'x' observations"); } |
|
3199
|
5
|
50
|
|
|
|
|
if (valid_ny == 0) { Safefree(ri); croak("not enough 'y' observations"); } |
|
3200
|
5
|
|
|
|
|
|
size_t total_n = valid_nx + valid_ny; |
|
3201
|
5
|
|
|
|
|
|
bool has_ties = 0; |
|
3202
|
5
|
|
|
|
|
|
NV tie_adj = rank_and_count_ties(ri, total_n, &has_ties); |
|
3203
|
5
|
|
|
|
|
|
NV w_rank_sum = 0.0; |
|
3204
|
61
|
100
|
|
|
|
|
for (size_t i = 0; i < total_n; i++) if (ri[i].idx == 1) w_rank_sum += ri[i].rank; |
|
|
|
100
|
|
|
|
|
|
|
3205
|
5
|
|
|
|
|
|
statistic = w_rank_sum - (NV)valid_nx * (valid_nx + 1.0) / 2.0; |
|
3206
|
5
|
50
|
|
|
|
|
if (exact == 1) use_exact = TRUE; |
|
3207
|
5
|
50
|
|
|
|
|
else if (exact == 0) use_exact = FALSE; |
|
3208
|
5
|
50
|
|
|
|
|
else use_exact = (valid_nx < 50 && valid_ny < 50 && !has_ties); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3209
|
5
|
100
|
|
|
|
|
if (use_exact && has_ties) { |
|
|
|
50
|
|
|
|
|
|
|
3210
|
0
|
|
|
|
|
|
warn("wilcox_test: cannot compute exact p-value with ties; falling back to approximation"); |
|
3211
|
0
|
|
|
|
|
|
use_exact = FALSE; |
|
3212
|
|
|
|
|
|
|
} |
|
3213
|
5
|
100
|
|
|
|
|
if (use_exact) { |
|
3214
|
2
|
|
|
|
|
|
method_desc = "Wilcoxon rank sum exact test"; |
|
3215
|
2
|
|
|
|
|
|
NV p_less = exact_pwilcox(statistic, valid_nx, valid_ny); |
|
3216
|
2
|
|
|
|
|
|
NV p_greater = 1.0 - exact_pwilcox(statistic - 1.0, valid_nx, valid_ny); |
|
3217
|
|
|
|
|
|
|
|
|
3218
|
2
|
100
|
|
|
|
|
if (strcmp(alternative, "less") == 0) p_value = p_less; |
|
3219
|
1
|
50
|
|
|
|
|
else if (strcmp(alternative, "greater") == 0) p_value = p_greater; |
|
3220
|
|
|
|
|
|
|
else { |
|
3221
|
0
|
0
|
|
|
|
|
NV p = (p_less < p_greater) ? p_less : p_greater; |
|
3222
|
0
|
|
|
|
|
|
p_value = 2.0 * p; |
|
3223
|
|
|
|
|
|
|
} |
|
3224
|
|
|
|
|
|
|
} else { |
|
3225
|
3
|
50
|
|
|
|
|
method_desc = correct ? "Wilcoxon rank sum test with continuity correction" : "Wilcoxon rank sum test"; |
|
3226
|
3
|
|
|
|
|
|
NV exp = (NV)valid_nx * valid_ny / 2.0; |
|
3227
|
3
|
|
|
|
|
|
NV var = ((NV)valid_nx * valid_ny / 12.0) * ((total_n + 1.0) - tie_adj / (total_n * (total_n - 1.0))); |
|
3228
|
3
|
|
|
|
|
|
NV z = statistic - exp; |
|
3229
|
|
|
|
|
|
|
|
|
3230
|
3
|
|
|
|
|
|
NV CORRECTION = 0.0; |
|
3231
|
3
|
50
|
|
|
|
|
if (correct) { |
|
3232
|
3
|
50
|
|
|
|
|
if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5); |
|
|
|
100
|
|
|
|
|
|
|
3233
|
0
|
0
|
|
|
|
|
else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5; |
|
3234
|
0
|
0
|
|
|
|
|
else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5; |
|
3235
|
|
|
|
|
|
|
} |
|
3236
|
3
|
|
|
|
|
|
z = (z - CORRECTION) / sqrt(var); |
|
3237
|
|
|
|
|
|
|
|
|
3238
|
3
|
50
|
|
|
|
|
if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z); |
|
3239
|
3
|
50
|
|
|
|
|
else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z); |
|
3240
|
3
|
|
|
|
|
|
else p_value = 2.0 * approx_pnorm(-fabs(z)); |
|
3241
|
|
|
|
|
|
|
} |
|
3242
|
5
|
|
|
|
|
|
Safefree(ri); |
|
3243
|
|
|
|
|
|
|
} else { // --- ONE SAMPLE / PAIRED --- |
|
3244
|
4
|
100
|
|
|
|
|
if (paired && (!y_av || nx != ny)) croak("'x' and 'y' must have the same length for paired test"); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3245
|
3
|
|
|
|
|
|
NV *restrict diffs = (NV *)safemalloc(nx * sizeof(NV)); |
|
3246
|
3
|
|
|
|
|
|
size_t n_nz = 0; |
|
3247
|
3
|
|
|
|
|
|
bool has_zeroes = FALSE; |
|
3248
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
3249
|
23
|
|
|
|
|
|
SV**restrict x_el = av_fetch(x_av, i, 0); |
|
3250
|
23
|
50
|
|
|
|
|
if (!x_el || !SvOK(*x_el) || !looks_like_number(*x_el)) continue; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3251
|
23
|
|
|
|
|
|
NV dx = SvNV(*x_el); |
|
3252
|
|
|
|
|
|
|
|
|
3253
|
23
|
100
|
|
|
|
|
if (paired) { |
|
3254
|
18
|
|
|
|
|
|
SV**restrict y_el = av_fetch(y_av, i, 0); |
|
3255
|
18
|
50
|
|
|
|
|
if (!y_el || !SvOK(*y_el) || !looks_like_number(*y_el)) continue; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3256
|
18
|
|
|
|
|
|
NV dy = SvNV(*y_el); |
|
3257
|
18
|
|
|
|
|
|
NV d = dx - dy - mu; |
|
3258
|
18
|
50
|
|
|
|
|
if (d == 0.0) has_zeroes = TRUE; // Drop exact zeroes |
|
3259
|
18
|
|
|
|
|
|
else diffs[n_nz++] = d; |
|
3260
|
|
|
|
|
|
|
} else { |
|
3261
|
5
|
|
|
|
|
|
NV d = dx - mu; |
|
3262
|
5
|
50
|
|
|
|
|
if (d == 0.0) has_zeroes = TRUE; |
|
3263
|
5
|
|
|
|
|
|
else diffs[n_nz++] = d; |
|
3264
|
|
|
|
|
|
|
} |
|
3265
|
|
|
|
|
|
|
} |
|
3266
|
3
|
50
|
|
|
|
|
if (n_nz == 0) { |
|
3267
|
0
|
|
|
|
|
|
Safefree(diffs); |
|
3268
|
0
|
|
|
|
|
|
croak("not enough (non-missing) observations"); |
|
3269
|
|
|
|
|
|
|
} |
|
3270
|
3
|
|
|
|
|
|
RankInfo *restrict ri = (RankInfo *)safemalloc(n_nz * sizeof(RankInfo)); |
|
3271
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < n_nz; i++) { |
|
3272
|
23
|
|
|
|
|
|
ri[i].val = fabs(diffs[i]); |
|
3273
|
23
|
|
|
|
|
|
ri[i].idx = (diffs[i] > 0); |
|
3274
|
|
|
|
|
|
|
} |
|
3275
|
3
|
|
|
|
|
|
bool has_ties = 0; |
|
3276
|
3
|
|
|
|
|
|
NV tie_adj = rank_and_count_ties(ri, n_nz, &has_ties); |
|
3277
|
3
|
|
|
|
|
|
statistic = 0.0; |
|
3278
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < n_nz; i++) { |
|
3279
|
23
|
100
|
|
|
|
|
if (ri[i].idx) statistic += ri[i].rank; |
|
3280
|
|
|
|
|
|
|
} |
|
3281
|
3
|
50
|
|
|
|
|
if (exact == 1) use_exact = TRUE; |
|
3282
|
3
|
50
|
|
|
|
|
else if (exact == 0) use_exact = FALSE; |
|
3283
|
3
|
50
|
|
|
|
|
else use_exact = (n_nz < 50 && !has_ties); |
|
|
|
50
|
|
|
|
|
|
|
3284
|
3
|
50
|
|
|
|
|
if (use_exact && has_ties) { |
|
|
|
50
|
|
|
|
|
|
|
3285
|
0
|
|
|
|
|
|
warn("cannot compute exact p-value with ties; falling back to approximation"); |
|
3286
|
0
|
|
|
|
|
|
use_exact = FALSE; |
|
3287
|
|
|
|
|
|
|
} |
|
3288
|
3
|
50
|
|
|
|
|
if (use_exact && has_zeroes) { |
|
|
|
50
|
|
|
|
|
|
|
3289
|
0
|
|
|
|
|
|
warn("cannot compute exact p-value with zeroes; falling back to approximation"); |
|
3290
|
0
|
|
|
|
|
|
use_exact = FALSE; |
|
3291
|
|
|
|
|
|
|
} |
|
3292
|
3
|
50
|
|
|
|
|
if (use_exact) { |
|
3293
|
3
|
|
|
|
|
|
method_desc = paired ? "Wilcoxon exact signed rank test" : "Wilcoxon exact signed rank test"; |
|
3294
|
3
|
|
|
|
|
|
double p_less = exact_psignrank(statistic, n_nz); |
|
3295
|
3
|
|
|
|
|
|
double p_greater = 1.0 - exact_psignrank(statistic - 1.0, n_nz); |
|
3296
|
|
|
|
|
|
|
|
|
3297
|
3
|
50
|
|
|
|
|
if (strcmp(alternative, "less") == 0) p_value = p_less; |
|
3298
|
3
|
50
|
|
|
|
|
else if (strcmp(alternative, "greater") == 0) p_value = p_greater; |
|
3299
|
|
|
|
|
|
|
else { |
|
3300
|
3
|
50
|
|
|
|
|
double p = (p_less < p_greater) ? p_less : p_greater; |
|
3301
|
3
|
|
|
|
|
|
p_value = 2.0 * p; |
|
3302
|
|
|
|
|
|
|
} |
|
3303
|
|
|
|
|
|
|
} else { |
|
3304
|
0
|
0
|
|
|
|
|
method_desc = correct ? "Wilcoxon signed rank test with continuity correction" : "Wilcoxon signed rank test"; |
|
3305
|
0
|
|
|
|
|
|
double exp = (double)n_nz * (n_nz + 1.0) / 4.0; |
|
3306
|
0
|
|
|
|
|
|
double var = (n_nz * (n_nz + 1.0) * (2.0 * n_nz + 1.0) / 24.0) - (tie_adj / 48.0); |
|
3307
|
0
|
|
|
|
|
|
double z = statistic - exp; |
|
3308
|
0
|
|
|
|
|
|
double CORRECTION = 0.0; |
|
3309
|
0
|
0
|
|
|
|
|
if (correct) { |
|
3310
|
0
|
0
|
|
|
|
|
if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5); |
|
|
|
0
|
|
|
|
|
|
|
3311
|
0
|
0
|
|
|
|
|
else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5; |
|
3312
|
0
|
0
|
|
|
|
|
else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5; |
|
3313
|
|
|
|
|
|
|
} |
|
3314
|
0
|
|
|
|
|
|
z = (z - CORRECTION) / sqrt(var); |
|
3315
|
|
|
|
|
|
|
|
|
3316
|
0
|
0
|
|
|
|
|
if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z); |
|
3317
|
0
|
0
|
|
|
|
|
else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z); |
|
3318
|
0
|
|
|
|
|
|
else p_value = 2.0 * approx_pnorm(-fabs(z)); |
|
3319
|
|
|
|
|
|
|
} |
|
3320
|
3
|
|
|
|
|
|
Safefree(ri); Safefree(diffs); |
|
3321
|
|
|
|
|
|
|
} |
|
3322
|
8
|
50
|
|
|
|
|
if (p_value > 1.0) p_value = 1.0; |
|
3323
|
8
|
|
|
|
|
|
HV *restrict res = newHV(); |
|
3324
|
8
|
|
|
|
|
|
hv_stores(res, "statistic", newSVnv(statistic)); |
|
3325
|
8
|
|
|
|
|
|
hv_stores(res, "p_value", newSVnv(p_value)); |
|
3326
|
8
|
|
|
|
|
|
hv_stores(res, "method", newSVpv(method_desc, 0)); |
|
3327
|
8
|
|
|
|
|
|
hv_stores(res, "alternative", newSVpv(alternative, 0)); |
|
3328
|
8
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res); |
|
3329
|
|
|
|
|
|
|
} |
|
3330
|
|
|
|
|
|
|
OUTPUT: |
|
3331
|
|
|
|
|
|
|
RETVAL |
|
3332
|
|
|
|
|
|
|
|
|
3333
|
|
|
|
|
|
|
SV* chisq_test(data_ref) |
|
3334
|
|
|
|
|
|
|
SV* data_ref; |
|
3335
|
|
|
|
|
|
|
CODE: |
|
3336
|
|
|
|
|
|
|
{ |
|
3337
|
|
|
|
|
|
|
// 1. Input Validation & Data Matrix Construction |
|
3338
|
16
|
100
|
|
|
|
|
if (!SvROK(data_ref)) { |
|
3339
|
3
|
|
|
|
|
|
croak("Input must be a reference"); |
|
3340
|
|
|
|
|
|
|
} |
|
3341
|
|
|
|
|
|
|
|
|
3342
|
13
|
|
|
|
|
|
svtype input_type = SvTYPE(SvRV(data_ref)); |
|
3343
|
13
|
100
|
|
|
|
|
if (input_type != SVt_PVAV && input_type != SVt_PVHV) { |
|
|
|
100
|
|
|
|
|
|
|
3344
|
1
|
|
|
|
|
|
croak("Input must be an array reference or a hash reference"); |
|
3345
|
|
|
|
|
|
|
} |
|
3346
|
|
|
|
|
|
|
|
|
3347
|
12
|
|
|
|
|
|
double **restrict obs_matrix = NULL; |
|
3348
|
12
|
|
|
|
|
|
double *restrict obs_array = NULL; |
|
3349
|
12
|
|
|
|
|
|
AV*restrict row_keys = NULL; |
|
3350
|
12
|
|
|
|
|
|
AV*restrict col_keys = NULL; |
|
3351
|
12
|
|
|
|
|
|
unsigned int r = 0, c = 0; |
|
3352
|
12
|
|
|
|
|
|
bool is_2d = 0; |
|
3353
|
|
|
|
|
|
|
|
|
3354
|
12
|
100
|
|
|
|
|
if (input_type == SVt_PVAV) { |
|
3355
|
8
|
|
|
|
|
|
AV*restrict obs_av = (AV*)SvRV(data_ref); |
|
3356
|
8
|
50
|
|
|
|
|
r = av_top_index(obs_av) + 1; |
|
3357
|
8
|
100
|
|
|
|
|
if (r > 0) { |
|
3358
|
7
|
|
|
|
|
|
SV**restrict first_elem = av_fetch(obs_av, 0, 0); |
|
3359
|
7
|
50
|
|
|
|
|
if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3360
|
4
|
|
|
|
|
|
is_2d = 1; |
|
3361
|
4
|
50
|
|
|
|
|
c = av_top_index((AV*)SvRV(*first_elem)) + 1; |
|
3362
|
4
|
|
|
|
|
|
obs_matrix = (double**)safemalloc(r * sizeof(double*)); |
|
3363
|
12
|
100
|
|
|
|
|
for (unsigned int i = 0; i < r; i++) { |
|
3364
|
8
|
|
|
|
|
|
obs_matrix[i] = (double*)safecalloc(c, sizeof(double)); |
|
3365
|
8
|
|
|
|
|
|
SV**restrict row_sv = av_fetch(obs_av, i, 0); |
|
3366
|
8
|
50
|
|
|
|
|
if (row_sv && SvROK(*row_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
3367
|
8
|
|
|
|
|
|
AV*restrict row_av = (AV*)SvRV(*row_sv); |
|
3368
|
28
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3369
|
20
|
|
|
|
|
|
SV**restrict val_sv = av_fetch(row_av, j, 0); |
|
3370
|
20
|
50
|
|
|
|
|
if (val_sv) obs_matrix[i][j] = SvNV(*val_sv); |
|
3371
|
|
|
|
|
|
|
} |
|
3372
|
|
|
|
|
|
|
} |
|
3373
|
|
|
|
|
|
|
} |
|
3374
|
|
|
|
|
|
|
} else { |
|
3375
|
3
|
|
|
|
|
|
c = r; |
|
3376
|
3
|
|
|
|
|
|
r = 1; |
|
3377
|
3
|
|
|
|
|
|
obs_array = (double*)safemalloc(c * sizeof(double)); |
|
3378
|
9
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3379
|
7
|
|
|
|
|
|
SV**restrict val_sv = av_fetch(obs_av, j, 0); |
|
3380
|
7
|
50
|
|
|
|
|
if (val_sv) obs_array[j] = SvNV(*val_sv); |
|
3381
|
|
|
|
|
|
|
} |
|
3382
|
|
|
|
|
|
|
} |
|
3383
|
|
|
|
|
|
|
} |
|
3384
|
4
|
50
|
|
|
|
|
} else if (input_type == SVt_PVHV) { |
|
3385
|
4
|
|
|
|
|
|
HV*restrict obs_hv = (HV*)SvRV(data_ref); |
|
3386
|
4
|
|
|
|
|
|
row_keys = newAV(); |
|
3387
|
4
|
|
|
|
|
|
col_keys = newAV(); |
|
3388
|
|
|
|
|
|
|
|
|
3389
|
|
|
|
|
|
|
HE*restrict first_entry; |
|
3390
|
4
|
|
|
|
|
|
hv_iterinit(obs_hv); |
|
3391
|
4
|
|
|
|
|
|
first_entry = hv_iternext(obs_hv); |
|
3392
|
|
|
|
|
|
|
|
|
3393
|
4
|
100
|
|
|
|
|
if (first_entry) { |
|
3394
|
3
|
|
|
|
|
|
SV*restrict first_val = hv_iterval(obs_hv, first_entry); |
|
3395
|
4
|
100
|
|
|
|
|
if (SvROK(first_val) && SvTYPE(SvRV(first_val)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
3396
|
1
|
|
|
|
|
|
is_2d = 1; |
|
3397
|
1
|
|
|
|
|
|
HV*restrict col_idx_map = newHV(); |
|
3398
|
1
|
|
|
|
|
|
hv_iterinit(obs_hv); |
|
3399
|
|
|
|
|
|
|
HE*restrict row_entry; |
|
3400
|
3
|
100
|
|
|
|
|
while ((row_entry = hv_iternext(obs_hv))) { |
|
3401
|
2
|
|
|
|
|
|
av_push(row_keys, newSVsv(hv_iterkeysv(row_entry))); |
|
3402
|
2
|
|
|
|
|
|
r++; |
|
3403
|
2
|
|
|
|
|
|
SV*restrict inner_sv = hv_iterval(obs_hv, row_entry); |
|
3404
|
2
|
50
|
|
|
|
|
if (SvROK(inner_sv) && SvTYPE(SvRV(inner_sv)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
3405
|
2
|
|
|
|
|
|
HV*restrict inner_hv = (HV*)SvRV(inner_sv); |
|
3406
|
|
|
|
|
|
|
HE*restrict col_entry; |
|
3407
|
2
|
|
|
|
|
|
hv_iterinit(inner_hv); |
|
3408
|
8
|
100
|
|
|
|
|
while ((col_entry = hv_iternext(inner_hv))) { |
|
3409
|
4
|
|
|
|
|
|
SV*restrict col_key = hv_iterkeysv(col_entry); |
|
3410
|
4
|
100
|
|
|
|
|
if (!hv_exists_ent(col_idx_map, col_key, 0)) { |
|
3411
|
2
|
|
|
|
|
|
hv_store_ent(col_idx_map, col_key, newSViv(c), 0); |
|
3412
|
2
|
|
|
|
|
|
av_push(col_keys, newSVsv(col_key)); |
|
3413
|
2
|
|
|
|
|
|
c++; |
|
3414
|
|
|
|
|
|
|
} |
|
3415
|
|
|
|
|
|
|
} |
|
3416
|
|
|
|
|
|
|
} |
|
3417
|
|
|
|
|
|
|
} |
|
3418
|
|
|
|
|
|
|
|
|
3419
|
1
|
|
|
|
|
|
obs_matrix = (double**)safemalloc(r * sizeof(double*)); |
|
3420
|
3
|
100
|
|
|
|
|
for (unsigned int i = 0; i < r; i++) { |
|
3421
|
2
|
|
|
|
|
|
obs_matrix[i] = (double*)safecalloc(c, sizeof(double)); |
|
3422
|
2
|
|
|
|
|
|
SV**restrict row_key_sv = av_fetch(row_keys, i, 0); |
|
3423
|
|
|
|
|
|
|
|
|
3424
|
|
|
|
|
|
|
// FIX 1: Extract HE* instead of SV** |
|
3425
|
2
|
|
|
|
|
|
HE* inner_he = hv_fetch_ent(obs_hv, *row_key_sv, 0, 0); |
|
3426
|
2
|
50
|
|
|
|
|
if (inner_he) { |
|
3427
|
2
|
|
|
|
|
|
SV*restrict inner_sv = HeVAL(inner_he); |
|
3428
|
2
|
50
|
|
|
|
|
if (SvROK(inner_sv)) { |
|
3429
|
2
|
|
|
|
|
|
HV*restrict inner_hv = (HV*)SvRV(inner_sv); |
|
3430
|
6
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3431
|
4
|
|
|
|
|
|
SV**restrict col_key_sv = av_fetch(col_keys, j, 0); |
|
3432
|
|
|
|
|
|
|
|
|
3433
|
|
|
|
|
|
|
// FIX 2: Extract HE* instead of SV** |
|
3434
|
4
|
|
|
|
|
|
HE*restrict val_he = hv_fetch_ent(inner_hv, *col_key_sv, 0, 0); |
|
3435
|
4
|
50
|
|
|
|
|
if (val_he) { |
|
3436
|
4
|
|
|
|
|
|
obs_matrix[i][j] = SvNV(HeVAL(val_he)); |
|
3437
|
|
|
|
|
|
|
} |
|
3438
|
|
|
|
|
|
|
} |
|
3439
|
|
|
|
|
|
|
} |
|
3440
|
|
|
|
|
|
|
} |
|
3441
|
|
|
|
|
|
|
} |
|
3442
|
1
|
|
|
|
|
|
SvREFCNT_dec(col_idx_map); |
|
3443
|
|
|
|
|
|
|
} else { |
|
3444
|
|
|
|
|
|
|
// 1D Hash Handling |
|
3445
|
2
|
|
|
|
|
|
hv_iterinit(obs_hv); |
|
3446
|
|
|
|
|
|
|
HE*restrict row_entry; |
|
3447
|
6
|
100
|
|
|
|
|
while ((row_entry = hv_iternext(obs_hv))) { |
|
3448
|
4
|
|
|
|
|
|
av_push(col_keys, newSVsv(hv_iterkeysv(row_entry))); |
|
3449
|
4
|
|
|
|
|
|
c++; |
|
3450
|
|
|
|
|
|
|
} |
|
3451
|
2
|
|
|
|
|
|
obs_array = (double*)safemalloc(c * sizeof(double)); |
|
3452
|
5
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3453
|
4
|
|
|
|
|
|
SV**restrict col_key_sv = av_fetch(col_keys, j, 0); |
|
3454
|
|
|
|
|
|
|
// FIX 3: Extract HE* instead of SV** |
|
3455
|
4
|
|
|
|
|
|
HE*restrict val_he = hv_fetch_ent(obs_hv, *col_key_sv, 0, 0); |
|
3456
|
4
|
50
|
|
|
|
|
if (val_he) { |
|
3457
|
4
|
|
|
|
|
|
obs_array[j] = SvNV(HeVAL(val_he)); |
|
3458
|
|
|
|
|
|
|
} |
|
3459
|
|
|
|
|
|
|
} |
|
3460
|
|
|
|
|
|
|
} |
|
3461
|
|
|
|
|
|
|
} |
|
3462
|
|
|
|
|
|
|
} |
|
3463
|
|
|
|
|
|
|
|
|
3464
|
10
|
100
|
|
|
|
|
if ((is_2d && (r == 0 || c == 0)) || (!is_2d && c == 0)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3465
|
2
|
|
|
|
|
|
croak("Empty data structure"); |
|
3466
|
|
|
|
|
|
|
} |
|
3467
|
|
|
|
|
|
|
|
|
3468
|
|
|
|
|
|
|
// 2. Perform Math Algorithm |
|
3469
|
8
|
|
|
|
|
|
double stat = 0.0, grand_total = 0.0; |
|
3470
|
8
|
|
|
|
|
|
unsigned int df = 0; |
|
3471
|
8
|
100
|
|
|
|
|
bool yates = (is_2d && r == 2 && c == 2) ? 1 : 0; |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3472
|
8
|
|
|
|
|
|
SV*restrict expected_ref = NULL; |
|
3473
|
|
|
|
|
|
|
|
|
3474
|
8
|
100
|
|
|
|
|
if (is_2d) { |
|
3475
|
5
|
|
|
|
|
|
double *restrict row_sum = (double*)safemalloc(r * sizeof(double)); |
|
3476
|
5
|
|
|
|
|
|
double *restrict col_sum = (double*)safemalloc(c * sizeof(double)); |
|
3477
|
15
|
100
|
|
|
|
|
for(unsigned int i=0; i
|
|
3478
|
17
|
100
|
|
|
|
|
for(unsigned int j=0; j
|
|
3479
|
|
|
|
|
|
|
|
|
3480
|
15
|
100
|
|
|
|
|
for (unsigned int i = 0; i < r; i++) { |
|
3481
|
34
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3482
|
24
|
|
|
|
|
|
double val = obs_matrix[i][j]; |
|
3483
|
24
|
|
|
|
|
|
row_sum[i] += val; |
|
3484
|
24
|
|
|
|
|
|
col_sum[j] += val; |
|
3485
|
24
|
|
|
|
|
|
grand_total += val; |
|
3486
|
|
|
|
|
|
|
} |
|
3487
|
|
|
|
|
|
|
} |
|
3488
|
|
|
|
|
|
|
|
|
3489
|
5
|
100
|
|
|
|
|
if (input_type == SVt_PVAV) { |
|
3490
|
4
|
|
|
|
|
|
AV*restrict expected_av = newAV(); |
|
3491
|
12
|
100
|
|
|
|
|
for (unsigned int i = 0; i < r; i++) { |
|
3492
|
8
|
|
|
|
|
|
AV*restrict exp_row = newAV(); |
|
3493
|
28
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3494
|
20
|
|
|
|
|
|
double E = (row_sum[i] * col_sum[j]) / grand_total; |
|
3495
|
20
|
|
|
|
|
|
double O = obs_matrix[i][j]; |
|
3496
|
20
|
|
|
|
|
|
av_push(exp_row, newSVnv(E)); |
|
3497
|
20
|
100
|
|
|
|
|
if (yates) { |
|
3498
|
8
|
|
|
|
|
|
double abs_diff = fabs(O - E); |
|
3499
|
8
|
50
|
|
|
|
|
double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff; |
|
3500
|
8
|
|
|
|
|
|
double diff = abs_diff - y_corr; |
|
3501
|
8
|
|
|
|
|
|
stat += (diff * diff) / E; |
|
3502
|
|
|
|
|
|
|
} else { |
|
3503
|
12
|
|
|
|
|
|
stat += ((O - E) * (O - E)) / E; |
|
3504
|
|
|
|
|
|
|
} |
|
3505
|
|
|
|
|
|
|
} |
|
3506
|
8
|
|
|
|
|
|
av_push(expected_av, newRV_noinc((SV*)exp_row)); |
|
3507
|
|
|
|
|
|
|
} |
|
3508
|
4
|
|
|
|
|
|
expected_ref = newRV_noinc((SV*)expected_av); |
|
3509
|
|
|
|
|
|
|
} else { // SVt_PVHV |
|
3510
|
1
|
|
|
|
|
|
HV*restrict expected_hv = newHV(); |
|
3511
|
3
|
100
|
|
|
|
|
for (unsigned int i = 0; i < r; i++) { |
|
3512
|
2
|
|
|
|
|
|
HV*restrict exp_row = newHV(); |
|
3513
|
6
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3514
|
4
|
|
|
|
|
|
double E = (row_sum[i] * col_sum[j]) / grand_total; |
|
3515
|
4
|
|
|
|
|
|
double O = obs_matrix[i][j]; |
|
3516
|
4
|
|
|
|
|
|
SV**restrict col_key_sv = av_fetch(col_keys, j, 0); |
|
3517
|
4
|
|
|
|
|
|
hv_store_ent(exp_row, *col_key_sv, newSVnv(E), 0); |
|
3518
|
|
|
|
|
|
|
|
|
3519
|
4
|
50
|
|
|
|
|
if (yates) { |
|
3520
|
4
|
|
|
|
|
|
double abs_diff = fabs(O - E); |
|
3521
|
4
|
50
|
|
|
|
|
double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff; |
|
3522
|
4
|
|
|
|
|
|
double diff = abs_diff - y_corr; |
|
3523
|
4
|
|
|
|
|
|
stat += (diff * diff) / E; |
|
3524
|
|
|
|
|
|
|
} else { |
|
3525
|
0
|
|
|
|
|
|
stat += ((O - E) * (O - E)) / E; |
|
3526
|
|
|
|
|
|
|
} |
|
3527
|
|
|
|
|
|
|
} |
|
3528
|
2
|
|
|
|
|
|
SV**restrict row_key_sv = av_fetch(row_keys, i, 0); |
|
3529
|
2
|
|
|
|
|
|
hv_store_ent(expected_hv, *row_key_sv, newRV_noinc((SV*)exp_row), 0); |
|
3530
|
|
|
|
|
|
|
} |
|
3531
|
1
|
|
|
|
|
|
expected_ref = newRV_noinc((SV*)expected_hv); |
|
3532
|
|
|
|
|
|
|
} |
|
3533
|
5
|
|
|
|
|
|
safefree(row_sum); safefree(col_sum); |
|
3534
|
5
|
|
|
|
|
|
df = (r - 1) * (c - 1); |
|
3535
|
|
|
|
|
|
|
} else { |
|
3536
|
12
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3537
|
9
|
|
|
|
|
|
grand_total += obs_array[j]; |
|
3538
|
|
|
|
|
|
|
} |
|
3539
|
3
|
|
|
|
|
|
double E = grand_total / (double)c; |
|
3540
|
|
|
|
|
|
|
|
|
3541
|
3
|
100
|
|
|
|
|
if (input_type == SVt_PVAV) { |
|
3542
|
2
|
|
|
|
|
|
AV*restrict expected_av = newAV(); |
|
3543
|
8
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3544
|
6
|
|
|
|
|
|
double O = obs_array[j]; |
|
3545
|
6
|
|
|
|
|
|
av_push(expected_av, newSVnv(E)); |
|
3546
|
6
|
|
|
|
|
|
stat += ((O - E) * (O - E)) / E; |
|
3547
|
|
|
|
|
|
|
} |
|
3548
|
2
|
|
|
|
|
|
expected_ref = newRV_noinc((SV*)expected_av); |
|
3549
|
|
|
|
|
|
|
} else { // SVt_PVHV |
|
3550
|
1
|
|
|
|
|
|
HV*restrict expected_hv = newHV(); |
|
3551
|
4
|
100
|
|
|
|
|
for (unsigned int j = 0; j < c; j++) { |
|
3552
|
3
|
|
|
|
|
|
double O = obs_array[j]; |
|
3553
|
3
|
|
|
|
|
|
SV**restrict col_key_sv = av_fetch(col_keys, j, 0); |
|
3554
|
3
|
|
|
|
|
|
hv_store_ent(expected_hv, *col_key_sv, newSVnv(E), 0); |
|
3555
|
3
|
|
|
|
|
|
stat += ((O - E) * (O - E)) / E; |
|
3556
|
|
|
|
|
|
|
} |
|
3557
|
1
|
|
|
|
|
|
expected_ref = newRV_noinc((SV*)expected_hv); |
|
3558
|
|
|
|
|
|
|
} |
|
3559
|
3
|
|
|
|
|
|
df = c - 1; |
|
3560
|
|
|
|
|
|
|
} |
|
3561
|
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
// Memory Cleanup for Matrices/Arrays |
|
3563
|
8
|
100
|
|
|
|
|
if (obs_matrix) { |
|
3564
|
15
|
100
|
|
|
|
|
for (unsigned int i = 0; i < r; i++) { |
|
3565
|
10
|
|
|
|
|
|
safefree(obs_matrix[i]); |
|
3566
|
|
|
|
|
|
|
} |
|
3567
|
5
|
|
|
|
|
|
safefree(obs_matrix); |
|
3568
|
|
|
|
|
|
|
} |
|
3569
|
8
|
100
|
|
|
|
|
if (obs_array) safefree(obs_array); |
|
3570
|
8
|
100
|
|
|
|
|
if (row_keys) SvREFCNT_dec(row_keys); |
|
3571
|
8
|
100
|
|
|
|
|
if (col_keys) SvREFCNT_dec(col_keys); |
|
3572
|
|
|
|
|
|
|
|
|
3573
|
8
|
|
|
|
|
|
double p_val = get_p_value(stat, df); |
|
3574
|
|
|
|
|
|
|
|
|
3575
|
|
|
|
|
|
|
// 3. Build the top-level results Hash (mimicking R's htest structure) |
|
3576
|
8
|
|
|
|
|
|
HV*restrict results = newHV(); |
|
3577
|
|
|
|
|
|
|
|
|
3578
|
8
|
|
|
|
|
|
HV*restrict statistic_hv = newHV(); |
|
3579
|
8
|
|
|
|
|
|
hv_store(statistic_hv, "X-squared", 9, newSVnv(stat), 0); |
|
3580
|
8
|
|
|
|
|
|
hv_store(results, "statistic", 9, newRV_noinc((SV*)statistic_hv), 0); |
|
3581
|
|
|
|
|
|
|
|
|
3582
|
8
|
|
|
|
|
|
HV*restrict parameter_hv = newHV(); |
|
3583
|
8
|
|
|
|
|
|
hv_store(parameter_hv, "df", 2, newSViv(df), 0); |
|
3584
|
8
|
|
|
|
|
|
hv_store(results, "parameter", 9, newRV_noinc((SV*)parameter_hv), 0); |
|
3585
|
|
|
|
|
|
|
|
|
3586
|
8
|
|
|
|
|
|
hv_store(results, "p.value", 7, newSVnv(p_val), 0); |
|
3587
|
8
|
|
|
|
|
|
hv_store(results, "expected", 8, expected_ref, 0); |
|
3588
|
8
|
|
|
|
|
|
hv_store(results, "observed", 8, SvREFCNT_inc(data_ref), 0); |
|
3589
|
|
|
|
|
|
|
|
|
3590
|
8
|
100
|
|
|
|
|
if (input_type == SVt_PVAV) { |
|
3591
|
6
|
|
|
|
|
|
hv_store(results, "data.name", 9, newSVpv("Perl ArrayRef", 0), 0); |
|
3592
|
|
|
|
|
|
|
} else { |
|
3593
|
2
|
|
|
|
|
|
hv_store(results, "data.name", 9, newSVpv("Perl HashRef", 0), 0); |
|
3594
|
|
|
|
|
|
|
} |
|
3595
|
|
|
|
|
|
|
|
|
3596
|
8
|
100
|
|
|
|
|
if (is_2d) { |
|
3597
|
5
|
100
|
|
|
|
|
if (yates) { |
|
3598
|
3
|
|
|
|
|
|
hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test with Yates' continuity correction", 0), 0); |
|
3599
|
|
|
|
|
|
|
} else { |
|
3600
|
2
|
|
|
|
|
|
hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test", 0), 0); |
|
3601
|
|
|
|
|
|
|
} |
|
3602
|
|
|
|
|
|
|
} else { |
|
3603
|
3
|
|
|
|
|
|
hv_store(results, "method", 6, newSVpv("Chi-squared test for given probabilities", 0), 0); |
|
3604
|
|
|
|
|
|
|
} |
|
3605
|
|
|
|
|
|
|
|
|
3606
|
8
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)results); |
|
3607
|
|
|
|
|
|
|
} |
|
3608
|
|
|
|
|
|
|
OUTPUT: |
|
3609
|
|
|
|
|
|
|
RETVAL |
|
3610
|
|
|
|
|
|
|
|
|
3611
|
|
|
|
|
|
|
PROTOTYPES: ENABLE |
|
3612
|
|
|
|
|
|
|
|
|
3613
|
|
|
|
|
|
|
void write_table(...) |
|
3614
|
|
|
|
|
|
|
PPCODE: |
|
3615
|
|
|
|
|
|
|
{ |
|
3616
|
43
|
|
|
|
|
|
SV *restrict data_sv = NULL; |
|
3617
|
43
|
|
|
|
|
|
SV *restrict file_sv = NULL; |
|
3618
|
43
|
|
|
|
|
|
unsigned int arg_idx = 0; |
|
3619
|
|
|
|
|
|
|
|
|
3620
|
|
|
|
|
|
|
// Mimic the Perl shift logic |
|
3621
|
43
|
100
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx))) { |
|
|
|
100
|
|
|
|
|
|
|
3622
|
41
|
|
|
|
|
|
int type = SvTYPE(SvRV(ST(arg_idx))); |
|
3623
|
41
|
100
|
|
|
|
|
if (type == SVt_PVHV || type == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
3624
|
41
|
|
|
|
|
|
data_sv = ST(arg_idx); |
|
3625
|
41
|
|
|
|
|
|
arg_idx++; |
|
3626
|
|
|
|
|
|
|
} |
|
3627
|
|
|
|
|
|
|
} |
|
3628
|
|
|
|
|
|
|
// Only consume a positional file argument if it is a plain string that is |
|
3629
|
|
|
|
|
|
|
// NOT one of the named option keys. Otherwise write_table(data=>..., file=>...) |
|
3630
|
|
|
|
|
|
|
// would grab the literal string "data" as the filename. |
|
3631
|
43
|
100
|
|
|
|
|
if (arg_idx < items) { |
|
3632
|
41
|
|
|
|
|
|
SV *restrict cand = ST(arg_idx); |
|
3633
|
41
|
50
|
|
|
|
|
if (SvOK(cand) && !SvROK(cand)) { |
|
|
|
50
|
|
|
|
|
|
|
3634
|
41
|
|
|
|
|
|
const char *restrict k = SvPV_nolen(cand); |
|
3635
|
41
|
100
|
|
|
|
|
if (!(strEQ(k, "data") || strEQ(k, "file") || strEQ(k, "col.names") || |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3636
|
39
|
50
|
|
|
|
|
strEQ(k, "row.names") || strEQ(k, "sep") || strEQ(k, "delim") || |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3637
|
39
|
50
|
|
|
|
|
strEQ(k, "undef.val"))) { |
|
3638
|
39
|
|
|
|
|
|
file_sv = cand; |
|
3639
|
39
|
|
|
|
|
|
arg_idx++; |
|
3640
|
|
|
|
|
|
|
} |
|
3641
|
|
|
|
|
|
|
} |
|
3642
|
|
|
|
|
|
|
} |
|
3643
|
|
|
|
|
|
|
|
|
3644
|
43
|
|
|
|
|
|
const char *restrict sep = ","; |
|
3645
|
43
|
|
|
|
|
|
bool explicit_sep = 0; // Track if delimiter was manually specified |
|
3646
|
43
|
|
|
|
|
|
const char *restrict undef_val = NULL; |
|
3647
|
43
|
|
|
|
|
|
SV *restrict row_names_sv = sv_2mortal(newSViv(1)); |
|
3648
|
43
|
|
|
|
|
|
SV *restrict col_names_sv = NULL; |
|
3649
|
|
|
|
|
|
|
|
|
3650
|
|
|
|
|
|
|
// Read the remaining Hash-style arguments |
|
3651
|
106
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
3652
|
65
|
100
|
|
|
|
|
if (arg_idx + 1 >= items) croak("write_table: Odd number of arguments passed"); |
|
3653
|
64
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(arg_idx)); |
|
3654
|
64
|
|
|
|
|
|
SV *restrict val = ST(arg_idx + 1); |
|
3655
|
|
|
|
|
|
|
|
|
3656
|
64
|
100
|
|
|
|
|
if (strEQ(key, "data")) data_sv = val; |
|
3657
|
63
|
100
|
|
|
|
|
else if (strEQ(key, "col.names")) col_names_sv = val; |
|
3658
|
54
|
100
|
|
|
|
|
else if (strEQ(key, "file")) file_sv = val; |
|
3659
|
52
|
100
|
|
|
|
|
else if (strEQ(key, "row.names")) row_names_sv = val; |
|
3660
|
|
|
|
|
|
|
// Check for either "sep" or "delim" and mark as explicitly provided |
|
3661
|
39
|
100
|
|
|
|
|
else if (strEQ(key, "sep") || strEQ(key, "delim")) { |
|
|
|
100
|
|
|
|
|
|
|
3662
|
17
|
|
|
|
|
|
sep = SvPV_nolen(val); |
|
3663
|
17
|
|
|
|
|
|
explicit_sep = 1; |
|
3664
|
|
|
|
|
|
|
} |
|
3665
|
22
|
100
|
|
|
|
|
else if (strEQ(key, "undef.val")) undef_val = SvPV_nolen(val); |
|
3666
|
1
|
|
|
|
|
|
else croak("write_table: Unknown arguments passed: %s", key); |
|
3667
|
|
|
|
|
|
|
} |
|
3668
|
|
|
|
|
|
|
|
|
3669
|
41
|
100
|
|
|
|
|
if (!data_sv || !SvROK(data_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
3670
|
1
|
|
|
|
|
|
croak("write_table: 'data' must be a HASH or ARRAY reference\n"); |
|
3671
|
|
|
|
|
|
|
} |
|
3672
|
|
|
|
|
|
|
|
|
3673
|
40
|
|
|
|
|
|
SV *restrict data_ref = SvRV(data_sv); |
|
3674
|
40
|
100
|
|
|
|
|
if (SvTYPE(data_ref) != SVt_PVHV && SvTYPE(data_ref) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
3675
|
0
|
|
|
|
|
|
croak("write_table: 'data' must be a HASH or ARRAY reference\n"); |
|
3676
|
|
|
|
|
|
|
} |
|
3677
|
|
|
|
|
|
|
|
|
3678
|
40
|
100
|
|
|
|
|
if (!file_sv || !SvOK(file_sv)) croak("write_table: file name missing\n"); |
|
|
|
50
|
|
|
|
|
|
|
3679
|
39
|
|
|
|
|
|
const char *restrict file = SvPV_nolen(file_sv); |
|
3680
|
|
|
|
|
|
|
|
|
3681
|
|
|
|
|
|
|
// Auto-detect separator from file extension if not overridden |
|
3682
|
39
|
100
|
|
|
|
|
if (!explicit_sep) { |
|
3683
|
22
|
|
|
|
|
|
size_t file_len = strlen(file); |
|
3684
|
22
|
50
|
|
|
|
|
if (file_len >= 4) { |
|
3685
|
22
|
|
|
|
|
|
const char *restrict ext = file + file_len - 4; |
|
3686
|
22
|
100
|
|
|
|
|
if (strEQ(ext, ".tsv") || strEQ(ext, ".TSV")) { |
|
|
|
50
|
|
|
|
|
|
|
3687
|
3
|
|
|
|
|
|
sep = "\t"; |
|
3688
|
19
|
50
|
|
|
|
|
} else if (strEQ(ext, ".csv") || strEQ(ext, ".CSV")) { |
|
|
|
0
|
|
|
|
|
|
|
3689
|
19
|
|
|
|
|
|
sep = ","; |
|
3690
|
|
|
|
|
|
|
} |
|
3691
|
|
|
|
|
|
|
} |
|
3692
|
|
|
|
|
|
|
} |
|
3693
|
|
|
|
|
|
|
|
|
3694
|
39
|
100
|
|
|
|
|
if (col_names_sv && SvOK(col_names_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
3695
|
9
|
100
|
|
|
|
|
if (!SvROK(col_names_sv) || SvTYPE(SvRV(col_names_sv)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
3696
|
2
|
|
|
|
|
|
croak("write_table: 'col.names' must be an ARRAY reference\n"); |
|
3697
|
|
|
|
|
|
|
} |
|
3698
|
|
|
|
|
|
|
} |
|
3699
|
|
|
|
|
|
|
|
|
3700
|
37
|
|
|
|
|
|
bool is_hoh = 0, is_hoa = 0, is_aoh = 0, is_flat_hash = 0; |
|
3701
|
37
|
|
|
|
|
|
AV *restrict rows_av = NULL; |
|
3702
|
|
|
|
|
|
|
|
|
3703
|
|
|
|
|
|
|
// Validate Input Structures & Homogeneity |
|
3704
|
37
|
100
|
|
|
|
|
if (SvTYPE(data_ref) == SVt_PVHV) { |
|
3705
|
32
|
|
|
|
|
|
HV *restrict hv = (HV*)data_ref; |
|
3706
|
32
|
50
|
|
|
|
|
if (hv_iterinit(hv) == 0) XSRETURN_EMPTY; |
|
3707
|
32
|
|
|
|
|
|
HE *restrict entry = hv_iternext(hv); |
|
3708
|
32
|
|
|
|
|
|
SV *restrict first_val = hv_iterval(hv, entry); |
|
3709
|
|
|
|
|
|
|
|
|
3710
|
32
|
50
|
|
|
|
|
if (!first_val) { |
|
3711
|
0
|
|
|
|
|
|
croak("write_table: Invalid hash entry\n"); |
|
3712
|
|
|
|
|
|
|
} |
|
3713
|
|
|
|
|
|
|
|
|
3714
|
|
|
|
|
|
|
// Check if top level values are scalars (Flat Hash) |
|
3715
|
32
|
100
|
|
|
|
|
if (!SvROK(first_val)) { |
|
3716
|
11
|
|
|
|
|
|
is_flat_hash = 1; |
|
3717
|
|
|
|
|
|
|
} else { |
|
3718
|
21
|
|
|
|
|
|
int first_type = SvTYPE(SvRV(first_val)); |
|
3719
|
21
|
100
|
|
|
|
|
if (first_type != SVt_PVHV && first_type != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
3720
|
0
|
|
|
|
|
|
croak("write_table: Data values must be either all HASHes, all ARRAYs, or all scalars\n"); |
|
3721
|
|
|
|
|
|
|
} |
|
3722
|
21
|
|
|
|
|
|
is_hoh = (first_type == SVt_PVHV); |
|
3723
|
21
|
|
|
|
|
|
is_hoa = (first_type == SVt_PVAV); |
|
3724
|
|
|
|
|
|
|
} |
|
3725
|
|
|
|
|
|
|
|
|
3726
|
32
|
|
|
|
|
|
hv_iterinit(hv); |
|
3727
|
109
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
3728
|
79
|
|
|
|
|
|
SV *restrict val = hv_iterval(hv, entry); |
|
3729
|
79
|
100
|
|
|
|
|
if (is_flat_hash) { |
|
3730
|
30
|
50
|
|
|
|
|
if (val && SvROK(val)) { |
|
|
|
100
|
|
|
|
|
|
|
3731
|
1
|
|
|
|
|
|
croak("write_table: Mixed data types detected. Ensure all values are scalars for a flat hash.\n"); |
|
3732
|
|
|
|
|
|
|
} |
|
3733
|
|
|
|
|
|
|
} else { |
|
3734
|
49
|
50
|
|
|
|
|
if (!val || !SvROK(val) || SvTYPE(SvRV(val)) != (is_hoh ? SVt_PVHV : SVt_PVAV)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3735
|
1
|
50
|
|
|
|
|
croak("write_table: Mixed data types detected. Ensure all values are %s references.\n", is_hoh ? "HASH" : "ARRAY"); |
|
3736
|
|
|
|
|
|
|
} |
|
3737
|
|
|
|
|
|
|
} |
|
3738
|
|
|
|
|
|
|
} |
|
3739
|
|
|
|
|
|
|
|
|
3740
|
30
|
100
|
|
|
|
|
if (is_hoh) { // Rows are only explicitly pre-gathered for HOH |
|
3741
|
6
|
|
|
|
|
|
rows_av = newAV(); |
|
3742
|
6
|
|
|
|
|
|
hv_iterinit(hv); |
|
3743
|
17
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
3744
|
11
|
|
|
|
|
|
av_push(rows_av, newSVsv(hv_iterkeysv(entry))); |
|
3745
|
|
|
|
|
|
|
} |
|
3746
|
|
|
|
|
|
|
} |
|
3747
|
|
|
|
|
|
|
} else { |
|
3748
|
5
|
|
|
|
|
|
AV *restrict av = (AV*)data_ref; |
|
3749
|
5
|
50
|
|
|
|
|
if (av_len(av) < 0) XSRETURN_EMPTY; |
|
3750
|
5
|
|
|
|
|
|
SV **restrict first_ptr = av_fetch(av, 0, 0); |
|
3751
|
5
|
50
|
|
|
|
|
if (!first_ptr || !*first_ptr || !SvROK(*first_ptr) || SvTYPE(SvRV(*first_ptr)) != SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3752
|
1
|
50
|
|
|
|
|
if (first_ptr && *first_ptr && SvROK(*first_ptr)) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3753
|
0
|
|
|
|
|
|
croak("write_table: For ARRAY data, every element must be a HASH reference " |
|
3754
|
|
|
|
|
|
|
"(Array of Hashes); element 0 is a reference of type '%s'\n", |
|
3755
|
|
|
|
|
|
|
sv_reftype(SvRV(*first_ptr), 0)); |
|
3756
|
1
|
50
|
|
|
|
|
else if (first_ptr && *first_ptr && SvOK(*first_ptr)) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3757
|
1
|
|
|
|
|
|
croak("write_table: For ARRAY data, every element must be a HASH reference " |
|
3758
|
|
|
|
|
|
|
"(Array of Hashes); element 0 is a non-reference scalar (value: '%s')\n", |
|
3759
|
|
|
|
|
|
|
SvPV_nolen(*first_ptr)); |
|
3760
|
|
|
|
|
|
|
else |
|
3761
|
0
|
|
|
|
|
|
croak("write_table: For ARRAY data, every element must be a HASH reference " |
|
3762
|
|
|
|
|
|
|
"(Array of Hashes); element 0 is undef\n"); |
|
3763
|
|
|
|
|
|
|
} |
|
3764
|
13
|
100
|
|
|
|
|
for (size_t i = 0; i <= av_len(av); i++) { |
|
3765
|
9
|
|
|
|
|
|
SV **restrict ptr = av_fetch(av, i, 0); |
|
3766
|
9
|
50
|
|
|
|
|
if (!ptr || !*ptr || !SvROK(*ptr) || SvTYPE(SvRV(*ptr)) != SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3767
|
0
|
|
|
|
|
|
croak("write_table: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n"); |
|
3768
|
|
|
|
|
|
|
} |
|
3769
|
|
|
|
|
|
|
} |
|
3770
|
4
|
|
|
|
|
|
is_aoh = 1; |
|
3771
|
|
|
|
|
|
|
} |
|
3772
|
34
|
|
|
|
|
|
PerlIO *restrict fh = PerlIO_open(file, "w"); |
|
3773
|
34
|
50
|
|
|
|
|
if (!fh) croak("write_table: Could not open '%s' for writing", file); |
|
3774
|
34
|
|
|
|
|
|
AV *restrict headers_av = newAV(); |
|
3775
|
34
|
50
|
|
|
|
|
bool inc_rownames = (row_names_sv && SvTRUE(row_names_sv)) ? 1 : 0; |
|
|
|
100
|
|
|
|
|
|
|
3776
|
34
|
|
|
|
|
|
const char *restrict rownames_col = NULL; |
|
3777
|
|
|
|
|
|
|
// ----- Hash of Hashes ----- |
|
3778
|
34
|
100
|
|
|
|
|
if (is_hoh) { |
|
3779
|
7
|
100
|
|
|
|
|
if (col_names_sv && SvOK(col_names_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
3780
|
1
|
|
|
|
|
|
AV *restrict c_av = (AV*)SvRV(col_names_sv); |
|
3781
|
4
|
100
|
|
|
|
|
for(size_t i=0; i<=av_len(c_av); i++) { |
|
3782
|
3
|
|
|
|
|
|
SV **restrict c = av_fetch(c_av, i, 0); |
|
3783
|
3
|
50
|
|
|
|
|
if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c)); |
|
|
|
50
|
|
|
|
|
|
|
3784
|
|
|
|
|
|
|
} |
|
3785
|
|
|
|
|
|
|
} else { |
|
3786
|
5
|
|
|
|
|
|
HV *restrict col_map = newHV(); |
|
3787
|
5
|
|
|
|
|
|
hv_iterinit((HV*)data_ref); |
|
3788
|
|
|
|
|
|
|
HE *restrict entry; |
|
3789
|
14
|
100
|
|
|
|
|
while((entry = hv_iternext((HV*)data_ref))) { |
|
3790
|
9
|
|
|
|
|
|
HV *restrict inner = (HV*)SvRV(hv_iterval((HV*)data_ref, entry)); |
|
3791
|
9
|
|
|
|
|
|
hv_iterinit(inner); |
|
3792
|
|
|
|
|
|
|
HE *restrict inner_entry; |
|
3793
|
26
|
100
|
|
|
|
|
while((inner_entry = hv_iternext(inner))) { |
|
3794
|
17
|
|
|
|
|
|
hv_store_ent(col_map, hv_iterkeysv(inner_entry), newSViv(1), 0); |
|
3795
|
|
|
|
|
|
|
} |
|
3796
|
|
|
|
|
|
|
} |
|
3797
|
5
|
|
|
|
|
|
unsigned num_cols = hv_iterinit(col_map); |
|
3798
|
5
|
|
|
|
|
|
const char **restrict col_array = safemalloc(num_cols * sizeof(char*)); |
|
3799
|
17
|
100
|
|
|
|
|
for(unsigned i=0; i
|
|
3800
|
12
|
|
|
|
|
|
HE *restrict ce = hv_iternext(col_map); |
|
3801
|
12
|
|
|
|
|
|
col_array[i] = SvPV_nolen(hv_iterkeysv(ce)); |
|
3802
|
|
|
|
|
|
|
} |
|
3803
|
5
|
|
|
|
|
|
qsort(col_array, num_cols, sizeof(char*), cmp_string_wt); |
|
3804
|
17
|
100
|
|
|
|
|
for(unsigned i=0; i
|
|
3805
|
5
|
|
|
|
|
|
safefree(col_array); |
|
3806
|
5
|
|
|
|
|
|
SvREFCNT_dec(col_map); |
|
3807
|
|
|
|
|
|
|
} |
|
3808
|
6
|
|
|
|
|
|
size_t num_headers = av_len(headers_av) + 1; |
|
3809
|
6
|
|
|
|
|
|
const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*)); |
|
3810
|
6
|
|
|
|
|
|
size_t h_idx = 0; |
|
3811
|
6
|
50
|
|
|
|
|
if (inc_rownames) header_row[h_idx++] = ""; |
|
3812
|
21
|
100
|
|
|
|
|
for(unsigned short int i=0; i
|
|
3813
|
15
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, i, 0); |
|
3814
|
15
|
50
|
|
|
|
|
header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
3815
|
|
|
|
|
|
|
} |
|
3816
|
6
|
|
|
|
|
|
print_string_row(aTHX_ fh, header_row, h_idx, sep); |
|
3817
|
6
|
|
|
|
|
|
safefree(header_row); |
|
3818
|
6
|
|
|
|
|
|
size_t num_rows = av_len(rows_av) + 1; |
|
3819
|
6
|
|
|
|
|
|
const char **restrict row_array = safemalloc(num_rows * sizeof(char*)); |
|
3820
|
17
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3821
|
11
|
|
|
|
|
|
row_array[i] = SvPV_nolen(*av_fetch(rows_av, i, 0)); |
|
3822
|
|
|
|
|
|
|
} |
|
3823
|
6
|
|
|
|
|
|
qsort(row_array, num_rows, sizeof(char*), cmp_string_wt); |
|
3824
|
6
|
|
|
|
|
|
HV *restrict data_hv = (HV*)data_ref; |
|
3825
|
6
|
|
|
|
|
|
const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*)); |
|
3826
|
15
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3827
|
11
|
|
|
|
|
|
size_t d_idx = 0; |
|
3828
|
11
|
50
|
|
|
|
|
if (inc_rownames) row_data[d_idx++] = row_array[i]; |
|
3829
|
11
|
|
|
|
|
|
SV **restrict inner_hv_ptr = hv_fetch(data_hv, row_array[i], strlen(row_array[i]), 0); |
|
3830
|
11
|
50
|
|
|
|
|
HV *restrict inner_hv = inner_hv_ptr ? (HV*)SvRV(*inner_hv_ptr) : NULL; |
|
3831
|
40
|
100
|
|
|
|
|
for(size_t j=0; j
|
|
3832
|
31
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, j, 0); |
|
3833
|
31
|
50
|
|
|
|
|
const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
3834
|
31
|
50
|
|
|
|
|
SV **restrict cell_ptr = inner_hv ? hv_fetch(inner_hv, col_name, strlen(col_name), 0) : NULL; |
|
3835
|
31
|
100
|
|
|
|
|
if (cell_ptr && SvOK(*cell_ptr)) { |
|
|
|
100
|
|
|
|
|
|
|
3836
|
20
|
100
|
|
|
|
|
if (SvROK(*cell_ptr)) { |
|
3837
|
2
|
|
|
|
|
|
PerlIO_close(fh); |
|
3838
|
2
|
|
|
|
|
|
safefree(row_array); safefree(row_data); |
|
3839
|
2
|
50
|
|
|
|
|
if (headers_av) SvREFCNT_dec(headers_av); |
|
3840
|
2
|
50
|
|
|
|
|
if (rows_av) SvREFCNT_dec(rows_av); |
|
3841
|
2
|
|
|
|
|
|
croak("write_table: Cannot write nested reference types to table\n"); |
|
3842
|
|
|
|
|
|
|
} |
|
3843
|
18
|
|
|
|
|
|
row_data[d_idx++] = SvPV_nolen(*cell_ptr); |
|
3844
|
|
|
|
|
|
|
} else { |
|
3845
|
11
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
3846
|
|
|
|
|
|
|
} |
|
3847
|
|
|
|
|
|
|
} |
|
3848
|
9
|
|
|
|
|
|
print_string_row(aTHX_ fh, row_data, d_idx, sep); |
|
3849
|
|
|
|
|
|
|
} |
|
3850
|
4
|
|
|
|
|
|
safefree(row_array); safefree(row_data); |
|
3851
|
|
|
|
|
|
|
// ----- Flat Hash ----- |
|
3852
|
28
|
100
|
|
|
|
|
} else if (is_flat_hash) { |
|
3853
|
10
|
|
|
|
|
|
HV *restrict data_hv = (HV*)data_ref; |
|
3854
|
10
|
|
|
|
|
|
unsigned int num_cols = hv_iterinit(data_hv); |
|
3855
|
10
|
|
|
|
|
|
const char **restrict col_array = safemalloc(num_cols * sizeof(char*)); |
|
3856
|
38
|
100
|
|
|
|
|
for(unsigned int i=0; i
|
|
3857
|
28
|
|
|
|
|
|
HE *restrict ce = hv_iternext(data_hv); |
|
3858
|
28
|
|
|
|
|
|
col_array[i] = SvPV_nolen(hv_iterkeysv(ce)); |
|
3859
|
|
|
|
|
|
|
} |
|
3860
|
|
|
|
|
|
|
// Ensure consistent key order |
|
3861
|
10
|
|
|
|
|
|
qsort(col_array, num_cols, sizeof(char*), cmp_string_wt); |
|
3862
|
11
|
100
|
|
|
|
|
if (col_names_sv && SvOK(col_names_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
3863
|
1
|
|
|
|
|
|
AV *restrict c_av = (AV*)SvRV(col_names_sv); |
|
3864
|
1
|
50
|
|
|
|
|
for(SSize_t i=0; i<=av_len(c_av); i++) { |
|
3865
|
0
|
|
|
|
|
|
SV **restrict c = av_fetch(c_av, i, 0); |
|
3866
|
0
|
0
|
|
|
|
|
if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c)); |
|
|
|
0
|
|
|
|
|
|
|
3867
|
|
|
|
|
|
|
} |
|
3868
|
|
|
|
|
|
|
} else { |
|
3869
|
34
|
100
|
|
|
|
|
for(unsigned i=0; i
|
|
3870
|
25
|
|
|
|
|
|
av_push(headers_av, newSVpv(col_array[i], 0)); |
|
3871
|
|
|
|
|
|
|
} |
|
3872
|
|
|
|
|
|
|
} |
|
3873
|
10
|
|
|
|
|
|
safefree(col_array); |
|
3874
|
10
|
|
|
|
|
|
size_t num_headers = av_len(headers_av) + 1; |
|
3875
|
10
|
|
|
|
|
|
const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*)); |
|
3876
|
10
|
|
|
|
|
|
size_t h_idx = 0; |
|
3877
|
10
|
100
|
|
|
|
|
if (inc_rownames) header_row[h_idx++] = ""; |
|
3878
|
35
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3879
|
25
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, i, 0); |
|
3880
|
25
|
50
|
|
|
|
|
header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
3881
|
|
|
|
|
|
|
} |
|
3882
|
10
|
|
|
|
|
|
print_string_row(aTHX_ fh, header_row, h_idx, sep); |
|
3883
|
10
|
|
|
|
|
|
safefree(header_row); |
|
3884
|
10
|
|
|
|
|
|
const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*)); |
|
3885
|
10
|
|
|
|
|
|
size_t d_idx = 0; |
|
3886
|
|
|
|
|
|
|
// Give the single row a default numeric identifier if row names are on |
|
3887
|
10
|
100
|
|
|
|
|
if (inc_rownames) row_data[d_idx++] = "1"; |
|
3888
|
35
|
100
|
|
|
|
|
for(size_t j=0; j
|
|
3889
|
25
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, j, 0); |
|
3890
|
25
|
50
|
|
|
|
|
const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
3891
|
|
|
|
|
|
|
|
|
3892
|
25
|
|
|
|
|
|
SV **restrict val_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0); |
|
3893
|
25
|
50
|
|
|
|
|
row_data[d_idx++] = (val_ptr && SvOK(*val_ptr)) ? SvPV_nolen(*val_ptr) : undef_val; |
|
|
|
50
|
|
|
|
|
|
|
3894
|
|
|
|
|
|
|
} |
|
3895
|
10
|
|
|
|
|
|
print_string_row(aTHX_ fh, row_data, d_idx, sep); |
|
3896
|
10
|
|
|
|
|
|
safefree(row_data); |
|
3897
|
|
|
|
|
|
|
// ----- Hash of Arrays ----- |
|
3898
|
18
|
100
|
|
|
|
|
} else if (is_hoa) { |
|
3899
|
14
|
|
|
|
|
|
HV *restrict data_hv = (HV*)data_ref; |
|
3900
|
14
|
|
|
|
|
|
size_t max_rows = 0; |
|
3901
|
14
|
|
|
|
|
|
hv_iterinit(data_hv); |
|
3902
|
|
|
|
|
|
|
HE *restrict entry; |
|
3903
|
50
|
100
|
|
|
|
|
while((entry = hv_iternext(data_hv))) { |
|
3904
|
36
|
|
|
|
|
|
AV *restrict arr = (AV*)SvRV(hv_iterval(data_hv, entry)); |
|
3905
|
36
|
|
|
|
|
|
size_t len = av_len(arr) + 1; |
|
3906
|
36
|
100
|
|
|
|
|
if (len > max_rows) max_rows = len; |
|
3907
|
|
|
|
|
|
|
} |
|
3908
|
18
|
100
|
|
|
|
|
if (col_names_sv && SvOK(col_names_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
3909
|
4
|
|
|
|
|
|
AV *restrict c_av = (AV*)SvRV(col_names_sv); |
|
3910
|
13
|
100
|
|
|
|
|
for(size_t i=0; i<=av_len(c_av); i++) { |
|
3911
|
9
|
|
|
|
|
|
SV **restrict c = av_fetch(c_av, i, 0); |
|
3912
|
9
|
50
|
|
|
|
|
if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c)); |
|
|
|
50
|
|
|
|
|
|
|
3913
|
|
|
|
|
|
|
} |
|
3914
|
|
|
|
|
|
|
} else { |
|
3915
|
10
|
|
|
|
|
|
unsigned int num_cols = hv_iterinit(data_hv); |
|
3916
|
10
|
|
|
|
|
|
const char **restrict col_array = safemalloc(num_cols * sizeof(char*)); |
|
3917
|
35
|
100
|
|
|
|
|
for(unsigned int i=0; i
|
|
3918
|
25
|
|
|
|
|
|
HE *restrict ce = hv_iternext(data_hv); |
|
3919
|
25
|
|
|
|
|
|
col_array[i] = SvPV_nolen(hv_iterkeysv(ce)); |
|
3920
|
|
|
|
|
|
|
} |
|
3921
|
10
|
|
|
|
|
|
qsort(col_array, num_cols, sizeof(char*), cmp_string_wt); |
|
3922
|
35
|
100
|
|
|
|
|
for(unsigned i=0; i
|
|
3923
|
10
|
|
|
|
|
|
safefree(col_array); |
|
3924
|
|
|
|
|
|
|
} |
|
3925
|
14
|
50
|
|
|
|
|
if (av_len(headers_av) < 0) croak("Could not get headers in write_table"); |
|
3926
|
14
|
100
|
|
|
|
|
if (inc_rownames && contains_nondigit(aTHX_ row_names_sv)) { |
|
|
|
100
|
|
|
|
|
|
|
3927
|
1
|
|
|
|
|
|
rownames_col = SvPV_nolen(row_names_sv); |
|
3928
|
1
|
|
|
|
|
|
AV *restrict filtered_headers = newAV(); |
|
3929
|
|
|
|
|
|
|
|
|
3930
|
3
|
100
|
|
|
|
|
for(size_t i=0; i<=av_len(headers_av); i++) { |
|
3931
|
2
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, i, 0); |
|
3932
|
2
|
50
|
|
|
|
|
if (!h_ptr || !*h_ptr) continue; |
|
|
|
50
|
|
|
|
|
|
|
3933
|
2
|
|
|
|
|
|
SV *restrict h_sv = *h_ptr; |
|
3934
|
2
|
100
|
|
|
|
|
if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) { |
|
3935
|
1
|
|
|
|
|
|
av_push(filtered_headers, newSVsv(h_sv)); |
|
3936
|
|
|
|
|
|
|
} |
|
3937
|
|
|
|
|
|
|
} |
|
3938
|
1
|
|
|
|
|
|
SvREFCNT_dec(headers_av); |
|
3939
|
1
|
|
|
|
|
|
headers_av = filtered_headers; |
|
3940
|
|
|
|
|
|
|
} |
|
3941
|
14
|
|
|
|
|
|
size_t num_headers = av_len(headers_av) + 1; |
|
3942
|
14
|
|
|
|
|
|
const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*)); |
|
3943
|
14
|
|
|
|
|
|
size_t h_idx = 0; |
|
3944
|
14
|
100
|
|
|
|
|
if (inc_rownames) header_row[h_idx++] = ""; |
|
3945
|
47
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3946
|
33
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, i, 0); |
|
3947
|
33
|
50
|
|
|
|
|
header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
3948
|
|
|
|
|
|
|
} |
|
3949
|
14
|
|
|
|
|
|
print_string_row(aTHX_ fh, header_row, h_idx, sep); |
|
3950
|
14
|
|
|
|
|
|
safefree(header_row); |
|
3951
|
14
|
|
|
|
|
|
const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*)); |
|
3952
|
64
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
3953
|
50
|
|
|
|
|
|
size_t d_idx = 0; |
|
3954
|
50
|
100
|
|
|
|
|
if (inc_rownames) { |
|
3955
|
38
|
100
|
|
|
|
|
if (rownames_col) { |
|
3956
|
2
|
|
|
|
|
|
SV **restrict rn_arr_ptr = hv_fetch(data_hv, rownames_col, strlen(rownames_col), 0); |
|
3957
|
4
|
50
|
|
|
|
|
if (rn_arr_ptr && SvROK(*rn_arr_ptr)) { |
|
|
|
50
|
|
|
|
|
|
|
3958
|
2
|
|
|
|
|
|
AV *restrict rn_arr = (AV*)SvRV(*rn_arr_ptr); |
|
3959
|
2
|
|
|
|
|
|
SV **restrict rn_val_ptr = av_fetch(rn_arr, i, 0); |
|
3960
|
2
|
50
|
|
|
|
|
if (rn_val_ptr && SvOK(*rn_val_ptr)) { |
|
|
|
50
|
|
|
|
|
|
|
3961
|
2
|
50
|
|
|
|
|
if (SvROK(*rn_val_ptr)) { |
|
3962
|
0
|
|
|
|
|
|
PerlIO_close(fh); |
|
3963
|
0
|
|
|
|
|
|
safefree(row_data); |
|
3964
|
0
|
0
|
|
|
|
|
if (headers_av) SvREFCNT_dec(headers_av); |
|
3965
|
0
|
|
|
|
|
|
croak("write_table: Cannot write nested reference types to table\n"); |
|
3966
|
|
|
|
|
|
|
} |
|
3967
|
2
|
|
|
|
|
|
row_data[d_idx++] = SvPV_nolen(*rn_val_ptr); |
|
3968
|
|
|
|
|
|
|
} else { |
|
3969
|
0
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
3970
|
|
|
|
|
|
|
} |
|
3971
|
|
|
|
|
|
|
} else { |
|
3972
|
0
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
3973
|
|
|
|
|
|
|
} |
|
3974
|
|
|
|
|
|
|
} else { |
|
3975
|
|
|
|
|
|
|
char buf[32]; |
|
3976
|
36
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "%ld", (long)(i + 1)); |
|
3977
|
36
|
|
|
|
|
|
row_data[d_idx++] = savepv(buf); |
|
3978
|
|
|
|
|
|
|
} |
|
3979
|
|
|
|
|
|
|
} |
|
3980
|
178
|
100
|
|
|
|
|
for(size_t j=0; j
|
|
3981
|
128
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, j, 0); |
|
3982
|
128
|
50
|
|
|
|
|
const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
3983
|
128
|
|
|
|
|
|
SV **restrict arr_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0); |
|
3984
|
256
|
50
|
|
|
|
|
if (arr_ptr && SvROK(*arr_ptr)) { |
|
|
|
50
|
|
|
|
|
|
|
3985
|
128
|
|
|
|
|
|
AV *restrict arr = (AV*)SvRV(*arr_ptr); |
|
3986
|
128
|
|
|
|
|
|
SV **restrict cell_ptr = av_fetch(arr, i, 0); |
|
3987
|
128
|
100
|
|
|
|
|
if (cell_ptr && SvOK(*cell_ptr)) { |
|
|
|
100
|
|
|
|
|
|
|
3988
|
81
|
50
|
|
|
|
|
if (SvROK(*cell_ptr)) { |
|
3989
|
0
|
|
|
|
|
|
PerlIO_close(fh); |
|
3990
|
0
|
|
|
|
|
|
safefree(row_data); |
|
3991
|
0
|
0
|
|
|
|
|
if (headers_av) SvREFCNT_dec(headers_av); |
|
3992
|
0
|
|
|
|
|
|
croak("write_table: Cannot write nested reference types to table\n"); |
|
3993
|
|
|
|
|
|
|
} |
|
3994
|
81
|
|
|
|
|
|
row_data[d_idx++] = SvPV_nolen(*cell_ptr); |
|
3995
|
|
|
|
|
|
|
} else { |
|
3996
|
47
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
3997
|
|
|
|
|
|
|
} |
|
3998
|
|
|
|
|
|
|
} else { |
|
3999
|
0
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
4000
|
|
|
|
|
|
|
} |
|
4001
|
|
|
|
|
|
|
} |
|
4002
|
50
|
|
|
|
|
|
print_string_row(aTHX_ fh, row_data, d_idx, sep); |
|
4003
|
50
|
100
|
|
|
|
|
if (inc_rownames && !rownames_col) safefree((char*)row_data[0]); |
|
|
|
100
|
|
|
|
|
|
|
4004
|
|
|
|
|
|
|
} |
|
4005
|
14
|
|
|
|
|
|
safefree(row_data); |
|
4006
|
4
|
50
|
|
|
|
|
} else if (is_aoh) {// ----- Array of Hashes |
|
4007
|
4
|
|
|
|
|
|
AV *restrict data_av = (AV*)data_ref; |
|
4008
|
4
|
|
|
|
|
|
size_t num_rows = av_len(data_av) + 1; |
|
4009
|
5
|
100
|
|
|
|
|
if (col_names_sv && SvOK(col_names_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
4010
|
1
|
|
|
|
|
|
AV *restrict c_av = (AV*)SvRV(col_names_sv); |
|
4011
|
3
|
100
|
|
|
|
|
for(size_t i=0; i<=av_len(c_av); i++) { |
|
4012
|
2
|
|
|
|
|
|
SV **restrict c = av_fetch(c_av, i, 0); |
|
4013
|
2
|
50
|
|
|
|
|
if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c)); |
|
|
|
50
|
|
|
|
|
|
|
4014
|
|
|
|
|
|
|
} |
|
4015
|
|
|
|
|
|
|
} else { |
|
4016
|
3
|
|
|
|
|
|
HV *restrict col_map = newHV(); |
|
4017
|
10
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
4018
|
7
|
|
|
|
|
|
SV **restrict row_ptr = av_fetch(data_av, i, 0); |
|
4019
|
7
|
50
|
|
|
|
|
if (row_ptr && SvROK(*row_ptr)) { |
|
|
|
50
|
|
|
|
|
|
|
4020
|
7
|
|
|
|
|
|
HV *restrict row_hv = (HV*)SvRV(*row_ptr); |
|
4021
|
7
|
|
|
|
|
|
hv_iterinit(row_hv); |
|
4022
|
|
|
|
|
|
|
HE *restrict entry; |
|
4023
|
20
|
100
|
|
|
|
|
while((entry = hv_iternext(row_hv))) { |
|
4024
|
13
|
|
|
|
|
|
hv_store_ent(col_map, hv_iterkeysv(entry), newSViv(1), 0); |
|
4025
|
|
|
|
|
|
|
} |
|
4026
|
|
|
|
|
|
|
} |
|
4027
|
|
|
|
|
|
|
} |
|
4028
|
3
|
|
|
|
|
|
unsigned num_cols = hv_iterinit(col_map); |
|
4029
|
3
|
|
|
|
|
|
const char **restrict col_array = safemalloc(num_cols * sizeof(char*)); |
|
4030
|
10
|
100
|
|
|
|
|
for(unsigned int i=0; i
|
|
4031
|
7
|
|
|
|
|
|
HE *restrict ce = hv_iternext(col_map); |
|
4032
|
7
|
|
|
|
|
|
col_array[i] = SvPV_nolen(hv_iterkeysv(ce)); |
|
4033
|
|
|
|
|
|
|
} |
|
4034
|
3
|
|
|
|
|
|
qsort(col_array, num_cols, sizeof(char*), cmp_string_wt); |
|
4035
|
10
|
100
|
|
|
|
|
for(unsigned int i=0; i
|
|
4036
|
3
|
|
|
|
|
|
safefree(col_array); |
|
4037
|
3
|
|
|
|
|
|
SvREFCNT_dec(col_map); |
|
4038
|
|
|
|
|
|
|
} |
|
4039
|
4
|
100
|
|
|
|
|
if (inc_rownames && contains_nondigit(aTHX_ row_names_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
4040
|
0
|
|
|
|
|
|
rownames_col = SvPV_nolen(row_names_sv); |
|
4041
|
0
|
|
|
|
|
|
AV *restrict filtered_headers = newAV(); |
|
4042
|
0
|
0
|
|
|
|
|
for(size_t i=0; i<=av_len(headers_av); i++) { |
|
4043
|
0
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, i, 0); |
|
4044
|
0
|
0
|
|
|
|
|
if (!h_ptr || !*h_ptr) continue; |
|
|
|
0
|
|
|
|
|
|
|
4045
|
0
|
|
|
|
|
|
SV *restrict h_sv = *h_ptr; |
|
4046
|
0
|
0
|
|
|
|
|
if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) { |
|
4047
|
0
|
|
|
|
|
|
av_push(filtered_headers, newSVsv(h_sv)); |
|
4048
|
|
|
|
|
|
|
} |
|
4049
|
|
|
|
|
|
|
} |
|
4050
|
0
|
|
|
|
|
|
SvREFCNT_dec(headers_av); |
|
4051
|
0
|
|
|
|
|
|
headers_av = filtered_headers; |
|
4052
|
|
|
|
|
|
|
} |
|
4053
|
4
|
|
|
|
|
|
size_t num_headers = av_len(headers_av) + 1; |
|
4054
|
4
|
|
|
|
|
|
const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*)); |
|
4055
|
4
|
|
|
|
|
|
size_t h_idx = 0; |
|
4056
|
4
|
100
|
|
|
|
|
if (inc_rownames) header_row[h_idx++] = ""; |
|
4057
|
13
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
4058
|
9
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, i, 0); |
|
4059
|
9
|
50
|
|
|
|
|
header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
4060
|
|
|
|
|
|
|
} |
|
4061
|
4
|
|
|
|
|
|
print_string_row(aTHX_ fh, header_row, h_idx, sep); |
|
4062
|
4
|
|
|
|
|
|
safefree(header_row); |
|
4063
|
4
|
|
|
|
|
|
const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*)); |
|
4064
|
13
|
100
|
|
|
|
|
for(size_t i=0; i
|
|
4065
|
9
|
|
|
|
|
|
size_t d_idx = 0; |
|
4066
|
9
|
|
|
|
|
|
SV **restrict row_ptr = av_fetch(data_av, i, 0); |
|
4067
|
9
|
50
|
|
|
|
|
HV *restrict row_hv = (row_ptr && SvROK(*row_ptr)) ? (HV*)SvRV(*row_ptr) : NULL; |
|
|
|
50
|
|
|
|
|
|
|
4068
|
9
|
100
|
|
|
|
|
if (inc_rownames) { |
|
4069
|
5
|
50
|
|
|
|
|
if (rownames_col) { |
|
4070
|
0
|
0
|
|
|
|
|
SV **restrict rn_val_ptr = row_hv ? hv_fetch(row_hv, rownames_col, strlen(rownames_col), 0) : NULL; |
|
4071
|
0
|
0
|
|
|
|
|
if (rn_val_ptr && SvOK(*rn_val_ptr)) { |
|
|
|
0
|
|
|
|
|
|
|
4072
|
0
|
0
|
|
|
|
|
if (SvROK(*rn_val_ptr)) { |
|
4073
|
0
|
|
|
|
|
|
PerlIO_close(fh); |
|
4074
|
0
|
|
|
|
|
|
safefree(row_data); |
|
4075
|
0
|
0
|
|
|
|
|
if (headers_av) SvREFCNT_dec(headers_av); |
|
4076
|
0
|
|
|
|
|
|
croak("write_table: Cannot write nested reference types to table\n"); |
|
4077
|
|
|
|
|
|
|
} |
|
4078
|
0
|
|
|
|
|
|
row_data[d_idx++] = SvPV_nolen(*rn_val_ptr); |
|
4079
|
|
|
|
|
|
|
} else { |
|
4080
|
0
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
4081
|
|
|
|
|
|
|
} |
|
4082
|
|
|
|
|
|
|
} else { |
|
4083
|
|
|
|
|
|
|
char buf[32]; |
|
4084
|
5
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "%ld", (long)(i + 1)); |
|
4085
|
5
|
|
|
|
|
|
row_data[d_idx++] = savepv(buf); |
|
4086
|
|
|
|
|
|
|
} |
|
4087
|
|
|
|
|
|
|
} |
|
4088
|
30
|
100
|
|
|
|
|
for(size_t j=0; j
|
|
4089
|
21
|
|
|
|
|
|
SV**restrict h_ptr = av_fetch(headers_av, j, 0); |
|
4090
|
21
|
50
|
|
|
|
|
const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : ""; |
|
|
|
50
|
|
|
|
|
|
|
4091
|
21
|
50
|
|
|
|
|
SV **restrict cell_ptr = row_hv ? hv_fetch(row_hv, col_name, strlen(col_name), 0) : NULL; |
|
4092
|
21
|
100
|
|
|
|
|
if (cell_ptr && SvOK(*cell_ptr)) { |
|
|
|
50
|
|
|
|
|
|
|
4093
|
17
|
50
|
|
|
|
|
if (SvROK(*cell_ptr)) { |
|
4094
|
0
|
|
|
|
|
|
PerlIO_close(fh); |
|
4095
|
0
|
|
|
|
|
|
safefree(row_data); |
|
4096
|
0
|
0
|
|
|
|
|
if (headers_av) SvREFCNT_dec(headers_av); |
|
4097
|
0
|
|
|
|
|
|
croak("write_table: Cannot write nested reference types to table\n"); |
|
4098
|
|
|
|
|
|
|
} |
|
4099
|
17
|
|
|
|
|
|
row_data[d_idx++] = SvPV_nolen(*cell_ptr); |
|
4100
|
|
|
|
|
|
|
} else { |
|
4101
|
4
|
|
|
|
|
|
row_data[d_idx++] = undef_val; |
|
4102
|
|
|
|
|
|
|
} |
|
4103
|
|
|
|
|
|
|
} |
|
4104
|
9
|
|
|
|
|
|
print_string_row(aTHX_ fh, row_data, d_idx, sep); |
|
4105
|
9
|
100
|
|
|
|
|
if (inc_rownames && !rownames_col) safefree((char*)row_data[0]); |
|
|
|
50
|
|
|
|
|
|
|
4106
|
|
|
|
|
|
|
} |
|
4107
|
4
|
|
|
|
|
|
safefree(row_data); |
|
4108
|
|
|
|
|
|
|
} |
|
4109
|
32
|
50
|
|
|
|
|
if (headers_av) SvREFCNT_dec(headers_av); |
|
4110
|
32
|
100
|
|
|
|
|
if (rows_av) SvREFCNT_dec(rows_av); |
|
4111
|
32
|
|
|
|
|
|
PerlIO_close(fh); |
|
4112
|
32
|
|
|
|
|
|
XSRETURN_EMPTY; |
|
4113
|
|
|
|
|
|
|
} |
|
4114
|
|
|
|
|
|
|
|
|
4115
|
|
|
|
|
|
|
SV* _parse_csv_file(char* file, const char* sep_str, const char* comment_str, SV* callback = &PL_sv_undef) |
|
4116
|
|
|
|
|
|
|
INIT: |
|
4117
|
|
|
|
|
|
|
PerlIO *restrict fp; |
|
4118
|
522
|
|
|
|
|
|
AV *restrict data = NULL; |
|
4119
|
522
|
|
|
|
|
|
AV *restrict current_row = newAV(); |
|
4120
|
522
|
|
|
|
|
|
SV *restrict field = newSVpvs(""); |
|
4121
|
522
|
|
|
|
|
|
bool in_quotes = 0, post_quote = 0; |
|
4122
|
|
|
|
|
|
|
size_t sep_len, comment_len; |
|
4123
|
|
|
|
|
|
|
SV *restrict line_sv; |
|
4124
|
522
|
|
|
|
|
|
bool use_cb = 0; |
|
4125
|
|
|
|
|
|
|
CODE: |
|
4126
|
522
|
50
|
|
|
|
|
if (SvOK(callback) && SvROK(callback) && SvTYPE(SvRV(callback)) == SVt_PVCV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4127
|
522
|
|
|
|
|
|
use_cb = 1; |
|
4128
|
|
|
|
|
|
|
} else { |
|
4129
|
0
|
|
|
|
|
|
data = newAV(); |
|
4130
|
|
|
|
|
|
|
} |
|
4131
|
522
|
50
|
|
|
|
|
sep_len = sep_str ? strlen(sep_str) : 0; |
|
4132
|
522
|
50
|
|
|
|
|
comment_len = comment_str ? strlen(comment_str) : 0; |
|
4133
|
|
|
|
|
|
|
|
|
4134
|
522
|
|
|
|
|
|
fp = PerlIO_open(file, "r"); |
|
4135
|
522
|
50
|
|
|
|
|
if (!fp) { |
|
4136
|
0
|
|
|
|
|
|
croak("Could not open file '%s'", file); |
|
4137
|
|
|
|
|
|
|
} |
|
4138
|
522
|
|
|
|
|
|
line_sv = newSV_type(SVt_PV); |
|
4139
|
|
|
|
|
|
|
// Read line by line using PerlIO |
|
4140
|
7239
|
100
|
|
|
|
|
while (sv_gets(line_sv, fp, 0) != NULL) { |
|
4141
|
6718
|
|
|
|
|
|
char *restrict line = SvPV_nolen(line_sv); |
|
4142
|
6718
|
|
|
|
|
|
size_t len = SvCUR(line_sv); |
|
4143
|
|
|
|
|
|
|
// chomp \r\n (Handles Windows invisible \r natively) |
|
4144
|
6718
|
50
|
|
|
|
|
if (len > 0 && line[len-1] == '\n') { |
|
|
|
100
|
|
|
|
|
|
|
4145
|
6717
|
|
|
|
|
|
len--; |
|
4146
|
6717
|
50
|
|
|
|
|
if (len > 0 && line[len-1] == '\r') { |
|
|
|
100
|
|
|
|
|
|
|
4147
|
4928
|
|
|
|
|
|
len--; |
|
4148
|
|
|
|
|
|
|
} |
|
4149
|
|
|
|
|
|
|
} |
|
4150
|
6718
|
50
|
|
|
|
|
if (!in_quotes) { |
|
4151
|
|
|
|
|
|
|
// Skip completely empty lines (\h*[\r\n]+$ equivalent) |
|
4152
|
6718
|
|
|
|
|
|
bool is_empty = 1; |
|
4153
|
6720
|
50
|
|
|
|
|
for (size_t i = 0; i < len; i++) { |
|
4154
|
6720
|
50
|
|
|
|
|
if (line[i] != ' ' && line[i] != '\t') { is_empty = 0; break; } |
|
|
|
100
|
|
|
|
|
|
|
4155
|
|
|
|
|
|
|
} |
|
4156
|
6718
|
50
|
|
|
|
|
if (is_empty) continue; |
|
4157
|
|
|
|
|
|
|
|
|
4158
|
|
|
|
|
|
|
// Skip comments |
|
4159
|
6718
|
50
|
|
|
|
|
if (comment_len > 0 && len >= comment_len && strncmp(line, comment_str, comment_len) == 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4160
|
0
|
|
|
|
|
|
continue; |
|
4161
|
|
|
|
|
|
|
} |
|
4162
|
|
|
|
|
|
|
} |
|
4163
|
390865
|
100
|
|
|
|
|
for (size_t i = 0; i < len; i++) {// --- CORE PARSING MACHINE |
|
4164
|
384147
|
|
|
|
|
|
const char ch = line[i]; |
|
4165
|
384147
|
50
|
|
|
|
|
if (ch == '\r') continue; |
|
4166
|
384147
|
100
|
|
|
|
|
if (ch == '"') { |
|
4167
|
29758
|
100
|
|
|
|
|
if (in_quotes && (i + 1 < len) && line[i+1] == '"') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
4168
|
4
|
|
|
|
|
|
sv_catpvn(field, "\"", 1); |
|
4169
|
4
|
|
|
|
|
|
i++; // Skip the escaped second quote |
|
4170
|
29754
|
100
|
|
|
|
|
} else if (in_quotes) { |
|
4171
|
14877
|
|
|
|
|
|
in_quotes = 0; // Close quotes |
|
4172
|
14877
|
|
|
|
|
|
post_quote = 1; |
|
4173
|
14877
|
50
|
|
|
|
|
} else if (!post_quote) { |
|
4174
|
14877
|
|
|
|
|
|
in_quotes = 1; // Open quotes (only when not in post-quote state) |
|
4175
|
|
|
|
|
|
|
} |
|
4176
|
354389
|
100
|
|
|
|
|
} else if (!in_quotes && sep_len > 0 && (len - i) >= sep_len && strncmp(line + i, sep_str, sep_len) == 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
4177
|
69184
|
|
|
|
|
|
av_push(current_row, newSVsv(field)); |
|
4178
|
69184
|
|
|
|
|
|
sv_setpvs(field, ""); // Reset for next field |
|
4179
|
69184
|
|
|
|
|
|
i += sep_len - 1; // Advance past multi-char separators |
|
4180
|
69184
|
|
|
|
|
|
post_quote = 0; |
|
4181
|
|
|
|
|
|
|
} else { |
|
4182
|
285205
|
|
|
|
|
|
sv_catpvn(field, &ch, 1); |
|
4183
|
|
|
|
|
|
|
} |
|
4184
|
|
|
|
|
|
|
} |
|
4185
|
6718
|
50
|
|
|
|
|
if (in_quotes) { |
|
4186
|
|
|
|
|
|
|
// Line ended but quotes are still open! Append newline and fetch next |
|
4187
|
0
|
|
|
|
|
|
sv_catpvn(field, "\n", 1); |
|
4188
|
|
|
|
|
|
|
} else { |
|
4189
|
6718
|
|
|
|
|
|
post_quote = 0; // Reset post-quote state at row boundary |
|
4190
|
|
|
|
|
|
|
// Push the final field of the record |
|
4191
|
6718
|
|
|
|
|
|
av_push(current_row, newSVsv(field)); |
|
4192
|
6718
|
|
|
|
|
|
sv_setpvs(field, ""); |
|
4193
|
|
|
|
|
|
|
// If a callback is provided, invoke it in a streaming fashion |
|
4194
|
6718
|
50
|
|
|
|
|
if (use_cb) { |
|
4195
|
6718
|
|
|
|
|
|
dSP; |
|
4196
|
6718
|
|
|
|
|
|
ENTER; |
|
4197
|
6718
|
|
|
|
|
|
SAVETMPS; |
|
4198
|
6718
|
50
|
|
|
|
|
PUSHMARK(SP); |
|
4199
|
6718
|
50
|
|
|
|
|
XPUSHs(sv_2mortal(newRV_inc((SV*)current_row))); |
|
4200
|
6718
|
|
|
|
|
|
PUTBACK; |
|
4201
|
6718
|
|
|
|
|
|
call_sv(callback, G_DISCARD); |
|
4202
|
6717
|
50
|
|
|
|
|
FREETMPS; |
|
4203
|
6717
|
|
|
|
|
|
LEAVE; |
|
4204
|
6717
|
|
|
|
|
|
SvREFCNT_dec(current_row); // Frees the row from C memory if Perl didn't keep it |
|
4205
|
|
|
|
|
|
|
} else { |
|
4206
|
0
|
|
|
|
|
|
av_push(data, newRV_noinc((SV*)current_row)); |
|
4207
|
|
|
|
|
|
|
} |
|
4208
|
6717
|
|
|
|
|
|
current_row = newAV(); |
|
4209
|
|
|
|
|
|
|
} |
|
4210
|
|
|
|
|
|
|
} |
|
4211
|
521
|
|
|
|
|
|
PerlIO_close(fp); |
|
4212
|
521
|
|
|
|
|
|
SvREFCNT_dec(line_sv); |
|
4213
|
|
|
|
|
|
|
|
|
4214
|
521
|
50
|
|
|
|
|
if (in_quotes) { |
|
4215
|
0
|
|
|
|
|
|
av_push(current_row, newSVsv(field)); |
|
4216
|
0
|
0
|
|
|
|
|
if (use_cb) { |
|
4217
|
0
|
|
|
|
|
|
dSP; |
|
4218
|
0
|
|
|
|
|
|
ENTER; |
|
4219
|
0
|
|
|
|
|
|
SAVETMPS; |
|
4220
|
0
|
0
|
|
|
|
|
PUSHMARK(SP); |
|
4221
|
0
|
0
|
|
|
|
|
XPUSHs(sv_2mortal(newRV_inc((SV*)current_row))); |
|
4222
|
0
|
|
|
|
|
|
PUTBACK; |
|
4223
|
0
|
|
|
|
|
|
call_sv(callback, G_DISCARD); |
|
4224
|
0
|
0
|
|
|
|
|
FREETMPS; |
|
4225
|
0
|
|
|
|
|
|
LEAVE; |
|
4226
|
0
|
|
|
|
|
|
SvREFCNT_dec(current_row); |
|
4227
|
|
|
|
|
|
|
} else { |
|
4228
|
0
|
|
|
|
|
|
av_push(data, newRV_noinc((SV*)current_row)); |
|
4229
|
|
|
|
|
|
|
} |
|
4230
|
0
|
|
|
|
|
|
current_row = newAV(); |
|
4231
|
|
|
|
|
|
|
} |
|
4232
|
521
|
|
|
|
|
|
SvREFCNT_dec(field); |
|
4233
|
521
|
|
|
|
|
|
SvREFCNT_dec(current_row); |
|
4234
|
521
|
50
|
|
|
|
|
if (use_cb) { |
|
4235
|
521
|
|
|
|
|
|
RETVAL = newSV(0); // fresh undef; mortalizing immortal &PL_sv_undef underflows it on perl<5.18 |
|
4236
|
|
|
|
|
|
|
} else { |
|
4237
|
0
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)data); |
|
4238
|
|
|
|
|
|
|
} |
|
4239
|
|
|
|
|
|
|
OUTPUT: |
|
4240
|
|
|
|
|
|
|
RETVAL |
|
4241
|
|
|
|
|
|
|
|
|
4242
|
|
|
|
|
|
|
SV* cov(SV* x_sv, SV* y_sv, const char* method = "pearson") |
|
4243
|
|
|
|
|
|
|
CODE: |
|
4244
|
|
|
|
|
|
|
{ |
|
4245
|
|
|
|
|
|
|
// 1. Validate inputs are Array References |
|
4246
|
4
|
50
|
|
|
|
|
if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
4247
|
0
|
|
|
|
|
|
croak("cov: first argument 'x' must be an ARRAY reference"); |
|
4248
|
|
|
|
|
|
|
} |
|
4249
|
4
|
50
|
|
|
|
|
if (!SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
4250
|
0
|
|
|
|
|
|
croak("cov: second argument 'y' must be an ARRAY reference"); |
|
4251
|
|
|
|
|
|
|
} |
|
4252
|
|
|
|
|
|
|
|
|
4253
|
|
|
|
|
|
|
// 2. Validate method argument |
|
4254
|
4
|
100
|
|
|
|
|
if (strcmp(method, "pearson") != 0 && |
|
4255
|
2
|
100
|
|
|
|
|
strcmp(method, "spearman") != 0 && |
|
4256
|
1
|
50
|
|
|
|
|
strcmp(method, "kendall") != 0) { |
|
4257
|
0
|
|
|
|
|
|
croak("cov: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", method); |
|
4258
|
|
|
|
|
|
|
} |
|
4259
|
|
|
|
|
|
|
|
|
4260
|
4
|
|
|
|
|
|
AV *restrict x_av = (AV*)SvRV(x_sv); |
|
4261
|
4
|
|
|
|
|
|
AV *restrict y_av = (AV*)SvRV(y_sv); |
|
4262
|
4
|
|
|
|
|
|
size_t nx = av_len(x_av) + 1; |
|
4263
|
4
|
|
|
|
|
|
size_t ny = av_len(y_av) + 1; |
|
4264
|
|
|
|
|
|
|
|
|
4265
|
4
|
50
|
|
|
|
|
if (nx != ny) { |
|
4266
|
0
|
|
|
|
|
|
croak("cov: incompatible dimensions (x has %lu, y has %lu)", |
|
4267
|
|
|
|
|
|
|
(unsigned long)nx, (unsigned long)ny); |
|
4268
|
|
|
|
|
|
|
} |
|
4269
|
|
|
|
|
|
|
|
|
4270
|
|
|
|
|
|
|
// 3. Extract Valid Pairwise Data |
|
4271
|
|
|
|
|
|
|
// Allocate temporary C arrays for numeric processing |
|
4272
|
4
|
|
|
|
|
|
double *restrict x_val = (double*)safemalloc(nx * sizeof(double)); |
|
4273
|
4
|
|
|
|
|
|
double *restrict y_val = (double*)safemalloc(nx * sizeof(double)); |
|
4274
|
4
|
|
|
|
|
|
size_t n = 0; |
|
4275
|
|
|
|
|
|
|
|
|
4276
|
24
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
4277
|
20
|
|
|
|
|
|
SV **restrict x_tv = av_fetch(x_av, i, 0); |
|
4278
|
20
|
|
|
|
|
|
SV **restrict y_tv = av_fetch(y_av, i, 0); |
|
4279
|
|
|
|
|
|
|
|
|
4280
|
|
|
|
|
|
|
// Extract numeric values, defaulting to NAN for missing/invalid data |
|
4281
|
20
|
50
|
|
|
|
|
double xv = (x_tv && SvOK(*x_tv) && looks_like_number(*x_tv)) ? SvNV(*x_tv) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4282
|
20
|
50
|
|
|
|
|
double yv = (y_tv && SvOK(*y_tv) && looks_like_number(*y_tv)) ? SvNV(*y_tv) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4283
|
|
|
|
|
|
|
|
|
4284
|
|
|
|
|
|
|
// Pairwise complete observations (skips NAs seamlessly like R) |
|
4285
|
20
|
50
|
|
|
|
|
if (!isnan(xv) && !isnan(yv)) { |
|
|
|
50
|
|
|
|
|
|
|
4286
|
20
|
|
|
|
|
|
x_val[n] = xv; |
|
4287
|
20
|
|
|
|
|
|
y_val[n] = yv; |
|
4288
|
20
|
|
|
|
|
|
n++; |
|
4289
|
|
|
|
|
|
|
} |
|
4290
|
|
|
|
|
|
|
} |
|
4291
|
|
|
|
|
|
|
|
|
4292
|
|
|
|
|
|
|
// 4. Handle edge cases where data is too sparse |
|
4293
|
4
|
50
|
|
|
|
|
if (n < 2) { |
|
4294
|
0
|
|
|
|
|
|
Safefree(x_val); Safefree(y_val); |
|
4295
|
0
|
|
|
|
|
|
RETVAL = newSVnv(NAN); |
|
4296
|
|
|
|
|
|
|
} else { |
|
4297
|
4
|
|
|
|
|
|
double ans = 0.0; |
|
4298
|
|
|
|
|
|
|
// 5. Algorithm routing |
|
4299
|
4
|
100
|
|
|
|
|
if (strcmp(method, "kendall") == 0) { |
|
4300
|
|
|
|
|
|
|
// R's default cov(..., method="kendall") iterates the full n x n space |
|
4301
|
6
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4302
|
30
|
100
|
|
|
|
|
for (size_t j = 0; j < n; j++) { |
|
4303
|
25
|
|
|
|
|
|
int sx = (x_val[i] > x_val[j]) - (x_val[i] < x_val[j]); |
|
4304
|
25
|
|
|
|
|
|
int sy = (y_val[i] > y_val[j]) - (y_val[i] < y_val[j]); |
|
4305
|
25
|
|
|
|
|
|
ans += (double)(sx * sy); |
|
4306
|
|
|
|
|
|
|
} |
|
4307
|
|
|
|
|
|
|
} |
|
4308
|
|
|
|
|
|
|
} else { |
|
4309
|
3
|
|
|
|
|
|
double mean_x = 0.0, mean_y = 0.0, cov_sum = 0.0; |
|
4310
|
3
|
100
|
|
|
|
|
if (strcmp(method, "spearman") == 0) { |
|
4311
|
|
|
|
|
|
|
// Spearman: Rank the data first, then run standard covariance |
|
4312
|
1
|
|
|
|
|
|
double *restrict rx = (double*)safemalloc(n * sizeof(double)); |
|
4313
|
1
|
|
|
|
|
|
double *restrict ry = (double*)safemalloc(n * sizeof(double)); |
|
4314
|
|
|
|
|
|
|
// Uses your existing rank_data() helper from LikeR.xs |
|
4315
|
1
|
|
|
|
|
|
rank_data(x_val, rx, n); |
|
4316
|
1
|
|
|
|
|
|
rank_data(y_val, ry, n); |
|
4317
|
6
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4318
|
5
|
|
|
|
|
|
double dx = rx[i] - mean_x; |
|
4319
|
5
|
|
|
|
|
|
mean_x += dx / (i + 1); |
|
4320
|
5
|
|
|
|
|
|
double dy = ry[i] - mean_y; |
|
4321
|
5
|
|
|
|
|
|
mean_y += dy / (i + 1); |
|
4322
|
5
|
|
|
|
|
|
cov_sum += dx * (ry[i] - mean_y); |
|
4323
|
|
|
|
|
|
|
} |
|
4324
|
1
|
|
|
|
|
|
Safefree(rx); Safefree(ry); |
|
4325
|
|
|
|
|
|
|
} else { |
|
4326
|
|
|
|
|
|
|
// Pearson: Welford's Single-Pass Covariance Algorithm |
|
4327
|
12
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4328
|
10
|
|
|
|
|
|
double dx = x_val[i] - mean_x; |
|
4329
|
10
|
|
|
|
|
|
mean_x += dx / (i + 1); |
|
4330
|
10
|
|
|
|
|
|
double dy = y_val[i] - mean_y; |
|
4331
|
10
|
|
|
|
|
|
mean_y += dy / (i + 1); |
|
4332
|
10
|
|
|
|
|
|
cov_sum += dx * (y_val[i] - mean_y); |
|
4333
|
|
|
|
|
|
|
} |
|
4334
|
|
|
|
|
|
|
} |
|
4335
|
|
|
|
|
|
|
|
|
4336
|
|
|
|
|
|
|
// Unbiased Sample Covariance (N - 1) for Pearson & Spearman |
|
4337
|
3
|
|
|
|
|
|
ans = cov_sum / (n - 1); |
|
4338
|
|
|
|
|
|
|
} |
|
4339
|
4
|
|
|
|
|
|
Safefree(x_val); Safefree(y_val); |
|
4340
|
4
|
|
|
|
|
|
RETVAL = newSVnv(ans); |
|
4341
|
|
|
|
|
|
|
} |
|
4342
|
|
|
|
|
|
|
} |
|
4343
|
|
|
|
|
|
|
OUTPUT: |
|
4344
|
|
|
|
|
|
|
RETVAL |
|
4345
|
|
|
|
|
|
|
|
|
4346
|
|
|
|
|
|
|
SV* glm(...) |
|
4347
|
|
|
|
|
|
|
CODE: |
|
4348
|
|
|
|
|
|
|
{ |
|
4349
|
10
|
|
|
|
|
|
const char *restrict formula = NULL; |
|
4350
|
10
|
|
|
|
|
|
SV *restrict data_sv = NULL; |
|
4351
|
10
|
|
|
|
|
|
const char *restrict family_str = "gaussian"; |
|
4352
|
|
|
|
|
|
|
char f_cpy[512]; |
|
4353
|
|
|
|
|
|
|
char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk; |
|
4354
|
|
|
|
|
|
|
|
|
4355
|
|
|
|
|
|
|
// Dynamic Term Arrays |
|
4356
|
10
|
|
|
|
|
|
char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL; |
|
4357
|
10
|
|
|
|
|
|
bool *restrict is_dummy = NULL; |
|
4358
|
10
|
|
|
|
|
|
char **restrict dummy_base = NULL, **restrict dummy_level = NULL; |
|
4359
|
10
|
|
|
|
|
|
unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0; |
|
4360
|
10
|
|
|
|
|
|
size_t n = 0, valid_n = 0, i; |
|
4361
|
10
|
|
|
|
|
|
bool has_intercept = TRUE, converged = FALSE, boundary = FALSE; |
|
4362
|
10
|
|
|
|
|
|
unsigned int iter = 0, max_iter = 25, final_rank = 0, df_res = 0; |
|
4363
|
10
|
|
|
|
|
|
double deviance_old = 0.0, deviance_new = 0.0, null_dev = 0.0, aic = 0.0; |
|
4364
|
10
|
|
|
|
|
|
double dispersion = 0.0, epsilon = 1e-8; |
|
4365
|
|
|
|
|
|
|
|
|
4366
|
10
|
|
|
|
|
|
char **restrict row_names = NULL; |
|
4367
|
10
|
|
|
|
|
|
char **restrict valid_row_names = NULL; |
|
4368
|
10
|
|
|
|
|
|
HV **restrict row_hashes = NULL; |
|
4369
|
10
|
|
|
|
|
|
HV *restrict data_hoa = NULL; |
|
4370
|
10
|
|
|
|
|
|
SV *restrict ref = NULL; |
|
4371
|
|
|
|
|
|
|
|
|
4372
|
10
|
|
|
|
|
|
double *restrict X = NULL, *restrict Y = NULL, *restrict mu = NULL, *restrict eta = NULL; |
|
4373
|
10
|
|
|
|
|
|
double *restrict W = NULL, *restrict Z = NULL, *restrict beta = NULL, *restrict beta_old = NULL; |
|
4374
|
10
|
|
|
|
|
|
bool *restrict aliased = NULL; |
|
4375
|
10
|
|
|
|
|
|
double *restrict XtWX = NULL, *restrict XtWZ = NULL; |
|
4376
|
|
|
|
|
|
|
|
|
4377
|
|
|
|
|
|
|
HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv; |
|
4378
|
|
|
|
|
|
|
AV *restrict terms_av; |
|
4379
|
|
|
|
|
|
|
HE *restrict entry; |
|
4380
|
|
|
|
|
|
|
|
|
4381
|
10
|
50
|
|
|
|
|
if (items % 2 != 0) croak("Usage: glm(formula => 'am ~ wt + hp', data => \\%mtcars)"); |
|
4382
|
|
|
|
|
|
|
|
|
4383
|
38
|
100
|
|
|
|
|
for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) { |
|
4384
|
28
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(i_arg)); |
|
4385
|
28
|
|
|
|
|
|
SV *restrict val = ST(i_arg + 1); |
|
4386
|
28
|
100
|
|
|
|
|
if (strEQ(key, "formula")) formula = SvPV_nolen(val); |
|
4387
|
18
|
100
|
|
|
|
|
else if (strEQ(key, "data")) data_sv = val; |
|
4388
|
8
|
50
|
|
|
|
|
else if (strEQ(key, "family")) family_str = SvPV_nolen(val); |
|
4389
|
0
|
|
|
|
|
|
else croak("glm: unknown argument '%s'", key); |
|
4390
|
|
|
|
|
|
|
} |
|
4391
|
10
|
50
|
|
|
|
|
if (!formula) croak("glm: formula is required"); |
|
4392
|
10
|
50
|
|
|
|
|
if (!data_sv || !SvROK(data_sv)) croak("glm: data is required and must be a reference"); |
|
|
|
50
|
|
|
|
|
|
|
4393
|
|
|
|
|
|
|
|
|
4394
|
10
|
|
|
|
|
|
bool is_binomial = (strcmp(family_str, "binomial") == 0); |
|
4395
|
10
|
|
|
|
|
|
bool is_gaussian = (strcmp(family_str, "gaussian") == 0); |
|
4396
|
10
|
100
|
|
|
|
|
if (!is_binomial && !is_gaussian) croak("glm: unsupported family '%s'", family_str); |
|
|
|
50
|
|
|
|
|
|
|
4397
|
|
|
|
|
|
|
|
|
4398
|
|
|
|
|
|
|
// --- Formula Parsing & Expansion --- |
|
4399
|
10
|
|
|
|
|
|
Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*); |
|
4400
|
10
|
|
|
|
|
|
Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool); |
|
4401
|
10
|
|
|
|
|
|
Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*); |
|
4402
|
|
|
|
|
|
|
|
|
4403
|
10
|
|
|
|
|
|
src = (char*restrict)formula; dst = f_cpy; |
|
4404
|
148
|
100
|
|
|
|
|
while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; } |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4405
|
10
|
|
|
|
|
|
*dst = '\0'; |
|
4406
|
|
|
|
|
|
|
|
|
4407
|
10
|
|
|
|
|
|
tilde = strchr(f_cpy, '~'); |
|
4408
|
10
|
50
|
|
|
|
|
if (!tilde) croak("glm: invalid formula, missing '~'"); |
|
4409
|
10
|
|
|
|
|
|
*tilde = '\0'; |
|
4410
|
10
|
|
|
|
|
|
lhs = f_cpy; |
|
4411
|
10
|
|
|
|
|
|
rhs = tilde + 1; |
|
4412
|
|
|
|
|
|
|
char *restrict minus_one; |
|
4413
|
10
|
100
|
|
|
|
|
if ((minus_one = strstr(rhs, "-1")) != NULL) { |
|
4414
|
1
|
|
|
|
|
|
has_intercept = FALSE; |
|
4415
|
1
|
|
|
|
|
|
memmove( |
|
4416
|
1
|
|
|
|
|
|
minus_one, minus_one + 2, strlen(minus_one + 2) + 1 |
|
4417
|
|
|
|
|
|
|
); |
|
4418
|
|
|
|
|
|
|
} |
|
4419
|
10
|
|
|
|
|
|
char *restrict minus1 = strstr(rhs, "-1"); |
|
4420
|
10
|
50
|
|
|
|
|
if (minus1) { |
|
4421
|
0
|
|
|
|
|
|
has_intercept = FALSE; |
|
4422
|
0
|
|
|
|
|
|
memmove(/* remove the "-1" token from the RHS */ |
|
4423
|
0
|
|
|
|
|
|
minus1, minus1 + 2, strlen(minus1 + 2) + 1 |
|
4424
|
|
|
|
|
|
|
); |
|
4425
|
|
|
|
|
|
|
} |
|
4426
|
10
|
100
|
|
|
|
|
if (has_intercept) terms[num_terms++] = savepv("Intercept"); |
|
4427
|
|
|
|
|
|
|
|
|
4428
|
10
|
|
|
|
|
|
chunk = strtok(rhs, "+"); |
|
4429
|
26
|
100
|
|
|
|
|
while (chunk != NULL) { |
|
4430
|
16
|
50
|
|
|
|
|
if (num_terms >= term_cap - 3) { |
|
4431
|
0
|
|
|
|
|
|
term_cap *= 2; |
|
4432
|
0
|
|
|
|
|
|
Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*); |
|
4433
|
|
|
|
|
|
|
} |
|
4434
|
16
|
50
|
|
|
|
|
if (strcmp(chunk, "1") == 0 || strcmp(chunk, "-1") == 0) { |
|
|
|
50
|
|
|
|
|
|
|
4435
|
0
|
|
|
|
|
|
chunk = strtok(NULL, "+"); |
|
4436
|
0
|
|
|
|
|
|
continue; |
|
4437
|
|
|
|
|
|
|
} |
|
4438
|
16
|
|
|
|
|
|
char *restrict star = strchr(chunk, '*'); |
|
4439
|
16
|
50
|
|
|
|
|
if (star) { |
|
4440
|
0
|
|
|
|
|
|
*star = '\0'; |
|
4441
|
0
|
|
|
|
|
|
char *restrict left = chunk; char *restrict right = star + 1; |
|
4442
|
0
|
0
|
|
|
|
|
char *restrict c_l = strchr(left, '^'); if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
4443
|
0
|
0
|
|
|
|
|
char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
4444
|
0
|
|
|
|
|
|
terms[num_terms++] = savepv(left); |
|
4445
|
0
|
|
|
|
|
|
terms[num_terms++] = savepv(right); |
|
4446
|
0
|
|
|
|
|
|
size_t inter_len = strlen(left) + strlen(right) + 2; |
|
4447
|
0
|
|
|
|
|
|
terms[num_terms] = (char*)safemalloc(inter_len); |
|
4448
|
0
|
|
|
|
|
|
snprintf(terms[num_terms++], inter_len, "%s:%s", left, right); |
|
4449
|
|
|
|
|
|
|
} else { |
|
4450
|
16
|
|
|
|
|
|
char *restrict c_chunk = strchr(chunk, '^'); |
|
4451
|
16
|
50
|
|
|
|
|
if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
4452
|
16
|
|
|
|
|
|
terms[num_terms++] = savepv(chunk); |
|
4453
|
|
|
|
|
|
|
} |
|
4454
|
16
|
|
|
|
|
|
chunk = strtok(NULL, "+"); |
|
4455
|
|
|
|
|
|
|
} |
|
4456
|
|
|
|
|
|
|
|
|
4457
|
35
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) { |
|
4458
|
25
|
|
|
|
|
|
bool found = FALSE; |
|
4459
|
46
|
100
|
|
|
|
|
for (size_t j = 0; j < num_uniq; j++) { |
|
4460
|
21
|
50
|
|
|
|
|
if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; } |
|
4461
|
|
|
|
|
|
|
} |
|
4462
|
25
|
50
|
|
|
|
|
if (!found) uniq_terms[num_uniq++] = savepv(terms[i]); |
|
4463
|
|
|
|
|
|
|
} |
|
4464
|
10
|
|
|
|
|
|
p = num_uniq; |
|
4465
|
|
|
|
|
|
|
// --- Data Extraction --- |
|
4466
|
10
|
|
|
|
|
|
ref = SvRV(data_sv); |
|
4467
|
10
|
50
|
|
|
|
|
if (SvTYPE(ref) == SVt_PVHV) { |
|
4468
|
10
|
|
|
|
|
|
HV*restrict hv = (HV*)ref; |
|
4469
|
10
|
50
|
|
|
|
|
if (hv_iterinit(hv) == 0) croak("glm: Data hash is empty"); |
|
4470
|
10
|
|
|
|
|
|
entry = hv_iternext(hv); |
|
4471
|
10
|
50
|
|
|
|
|
if (entry) { |
|
4472
|
10
|
|
|
|
|
|
SV*restrict val = hv_iterval(hv, entry); |
|
4473
|
10
|
50
|
|
|
|
|
if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
4474
|
5
|
|
|
|
|
|
data_hoa = hv; |
|
4475
|
5
|
|
|
|
|
|
n = av_len((AV*)SvRV(val)) + 1; |
|
4476
|
5
|
50
|
|
|
|
|
Newx(row_names, n, char*); |
|
4477
|
136
|
100
|
|
|
|
|
for(i = 0; i < n; i++) { |
|
4478
|
131
|
|
|
|
|
|
char buf[32]; snprintf(buf, sizeof(buf), "%lu", i+1); |
|
4479
|
131
|
|
|
|
|
|
row_names[i] = savepv(buf); |
|
4480
|
|
|
|
|
|
|
} |
|
4481
|
5
|
50
|
|
|
|
|
} else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
4482
|
5
|
|
|
|
|
|
n = hv_iterinit(hv); |
|
4483
|
5
|
50
|
|
|
|
|
Newx(row_names, n, char*); Newx(row_hashes, n, HV*); |
|
|
|
50
|
|
|
|
|
|
|
4484
|
5
|
|
|
|
|
|
i = 0; |
|
4485
|
165
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
4486
|
|
|
|
|
|
|
I32 len; |
|
4487
|
160
|
|
|
|
|
|
row_names[i] = savepv(hv_iterkey(entry, &len)); |
|
4488
|
160
|
|
|
|
|
|
row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry)); |
|
4489
|
160
|
|
|
|
|
|
i++; |
|
4490
|
|
|
|
|
|
|
} |
|
4491
|
0
|
|
|
|
|
|
} else croak("glm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)"); |
|
4492
|
|
|
|
|
|
|
} |
|
4493
|
0
|
0
|
|
|
|
|
} else if (SvTYPE(ref) == SVt_PVAV) { |
|
4494
|
0
|
|
|
|
|
|
AV*restrict av = (AV*)ref; |
|
4495
|
0
|
|
|
|
|
|
n = av_len(av) + 1; |
|
4496
|
0
|
0
|
|
|
|
|
Newx(row_names, n, char*); Newx(row_hashes, n, HV*); |
|
|
|
0
|
|
|
|
|
|
|
4497
|
0
|
0
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
4498
|
0
|
|
|
|
|
|
SV**restrict val = av_fetch(av, i, 0); |
|
4499
|
0
|
0
|
|
|
|
|
if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4500
|
0
|
|
|
|
|
|
row_hashes[i] = (HV*)SvRV(*val); |
|
4501
|
0
|
|
|
|
|
|
char buf[32]; snprintf(buf, sizeof(buf), "%lu", i + 1); |
|
4502
|
0
|
|
|
|
|
|
row_names[i] = savepv(buf); |
|
4503
|
|
|
|
|
|
|
} else { |
|
4504
|
0
|
0
|
|
|
|
|
for (size_t k = 0; k < i; k++) Safefree(row_names[k]); |
|
4505
|
0
|
|
|
|
|
|
Safefree(row_names); Safefree(row_hashes); |
|
4506
|
0
|
|
|
|
|
|
croak("glm: Array values must be HashRefs (AoH)"); |
|
4507
|
|
|
|
|
|
|
} |
|
4508
|
|
|
|
|
|
|
} |
|
4509
|
0
|
|
|
|
|
|
} else croak("glm: Data must be an Array or Hash reference"); |
|
4510
|
|
|
|
|
|
|
// --- Categorical Expansion --- |
|
4511
|
35
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
4512
|
25
|
50
|
|
|
|
|
if (p_exp + 32 >= exp_cap) { |
|
4513
|
0
|
|
|
|
|
|
exp_cap *= 2; |
|
4514
|
0
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool); |
|
4515
|
0
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
4516
|
|
|
|
|
|
|
} |
|
4517
|
25
|
100
|
|
|
|
|
if (strcmp(uniq_terms[j], "Intercept") == 0) { |
|
4518
|
9
|
|
|
|
|
|
exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue; |
|
4519
|
|
|
|
|
|
|
} |
|
4520
|
16
|
100
|
|
|
|
|
if (is_column_categorical(aTHX_ data_hoa, row_hashes, n, uniq_terms[j])) { |
|
4521
|
1
|
|
|
|
|
|
char **restrict levels = NULL; size_t num_levels = 0, levels_cap = 8; |
|
4522
|
1
|
50
|
|
|
|
|
Newx(levels, levels_cap, char*); |
|
4523
|
61
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
4524
|
60
|
|
|
|
|
|
char*restrict str_val = get_data_string_alloc(aTHX_ data_hoa, row_hashes, i, uniq_terms[j]); |
|
4525
|
60
|
50
|
|
|
|
|
if (str_val) { |
|
4526
|
60
|
|
|
|
|
|
bool found = FALSE; |
|
4527
|
90
|
100
|
|
|
|
|
for (size_t l = 0; l < num_levels; l++) { |
|
4528
|
88
|
100
|
|
|
|
|
if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; } |
|
4529
|
|
|
|
|
|
|
} |
|
4530
|
60
|
100
|
|
|
|
|
if (!found) { |
|
4531
|
2
|
50
|
|
|
|
|
if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); } |
|
|
|
0
|
|
|
|
|
|
|
4532
|
2
|
|
|
|
|
|
levels[num_levels++] = savepv(str_val); |
|
4533
|
|
|
|
|
|
|
} |
|
4534
|
60
|
|
|
|
|
|
Safefree(str_val); |
|
4535
|
|
|
|
|
|
|
} |
|
4536
|
|
|
|
|
|
|
} |
|
4537
|
1
|
50
|
|
|
|
|
if (num_levels > 0) { |
|
4538
|
2
|
100
|
|
|
|
|
for (size_t l1 = 0; l1 < num_levels - 1; l1++) { |
|
4539
|
2
|
100
|
|
|
|
|
for (size_t l2 = l1 + 1; l2 < num_levels; l2++) { |
|
4540
|
1
|
50
|
|
|
|
|
if (strcmp(levels[l1], levels[l2]) > 0) { |
|
4541
|
1
|
|
|
|
|
|
char *restrict tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; |
|
4542
|
|
|
|
|
|
|
} |
|
4543
|
|
|
|
|
|
|
} |
|
4544
|
|
|
|
|
|
|
} |
|
4545
|
2
|
100
|
|
|
|
|
for (size_t l = 1; l < num_levels; l++) { |
|
4546
|
1
|
50
|
|
|
|
|
if (p_exp >= exp_cap) { |
|
4547
|
0
|
|
|
|
|
|
exp_cap *= 2; |
|
4548
|
0
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool); |
|
4549
|
0
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
4550
|
|
|
|
|
|
|
} |
|
4551
|
1
|
|
|
|
|
|
size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1; |
|
4552
|
1
|
|
|
|
|
|
exp_terms[p_exp] = (char*)safemalloc(t_len); |
|
4553
|
1
|
|
|
|
|
|
snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]); |
|
4554
|
1
|
|
|
|
|
|
is_dummy[p_exp] = TRUE; dummy_base[p_exp] = savepv(uniq_terms[j]); dummy_level[p_exp] = savepv(levels[l]); |
|
4555
|
1
|
|
|
|
|
|
p_exp++; |
|
4556
|
|
|
|
|
|
|
} |
|
4557
|
3
|
100
|
|
|
|
|
for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]); |
|
4558
|
1
|
|
|
|
|
|
Safefree(levels); |
|
4559
|
|
|
|
|
|
|
} else { |
|
4560
|
0
|
|
|
|
|
|
Safefree(levels); exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++; |
|
4561
|
|
|
|
|
|
|
} |
|
4562
|
|
|
|
|
|
|
} else { |
|
4563
|
15
|
|
|
|
|
|
exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++; |
|
4564
|
|
|
|
|
|
|
} |
|
4565
|
|
|
|
|
|
|
} |
|
4566
|
10
|
|
|
|
|
|
p = p_exp; |
|
4567
|
|
|
|
|
|
|
|
|
4568
|
10
|
50
|
|
|
|
|
Newx(X, n * p, double); Newx(Y, n, double); |
|
|
|
50
|
|
|
|
|
|
|
4569
|
10
|
50
|
|
|
|
|
Newx(valid_row_names, n, char*); |
|
4570
|
|
|
|
|
|
|
|
|
4571
|
|
|
|
|
|
|
// --- Listwise Deletion --- |
|
4572
|
301
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4573
|
291
|
|
|
|
|
|
double y_val = evaluate_term(aTHX_ data_hoa, row_hashes, i, lhs); |
|
4574
|
291
|
50
|
|
|
|
|
if (isnan(y_val)) { Safefree(row_names[i]); continue; } |
|
4575
|
|
|
|
|
|
|
|
|
4576
|
291
|
|
|
|
|
|
bool row_ok = TRUE; |
|
4577
|
291
|
|
|
|
|
|
double *restrict row_x = (double*)safemalloc(p * sizeof(double)); |
|
4578
|
1090
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
4579
|
799
|
100
|
|
|
|
|
if (strcmp(exp_terms[j], "Intercept") == 0) { |
|
4580
|
288
|
|
|
|
|
|
row_x[j] = 1.0; |
|
4581
|
511
|
100
|
|
|
|
|
} else if (is_dummy[j]) { |
|
4582
|
60
|
|
|
|
|
|
char* str_val = get_data_string_alloc(aTHX_ data_hoa, row_hashes, i, dummy_base[j]); |
|
4583
|
60
|
50
|
|
|
|
|
if (str_val) { |
|
4584
|
60
|
100
|
|
|
|
|
row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0; |
|
4585
|
60
|
|
|
|
|
|
Safefree(str_val); |
|
4586
|
0
|
|
|
|
|
|
} else { row_ok = FALSE; break; } |
|
4587
|
|
|
|
|
|
|
} else { |
|
4588
|
451
|
|
|
|
|
|
row_x[j] = evaluate_term(aTHX_ data_hoa, row_hashes, i, exp_terms[j]); |
|
4589
|
451
|
50
|
|
|
|
|
if (isnan(row_x[j])) { row_ok = FALSE; break; } |
|
4590
|
|
|
|
|
|
|
} |
|
4591
|
|
|
|
|
|
|
} |
|
4592
|
291
|
50
|
|
|
|
|
if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; } |
|
4593
|
291
|
|
|
|
|
|
Y[valid_n] = y_val; |
|
4594
|
1090
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) X[valid_n * p + j] = row_x[j]; |
|
4595
|
291
|
|
|
|
|
|
valid_row_names[valid_n] = row_names[i]; |
|
4596
|
291
|
|
|
|
|
|
valid_n++; |
|
4597
|
291
|
|
|
|
|
|
Safefree(row_x); |
|
4598
|
|
|
|
|
|
|
} |
|
4599
|
10
|
|
|
|
|
|
Safefree(row_names); |
|
4600
|
10
|
50
|
|
|
|
|
if (valid_n < p) { |
|
4601
|
0
|
0
|
|
|
|
|
Safefree(X); Safefree(Y); Safefree(valid_row_names); if (row_hashes) Safefree(row_hashes); |
|
4602
|
0
|
|
|
|
|
|
croak("glm: 0 degrees of freedom (too many NAs or parameters > observations)"); |
|
4603
|
|
|
|
|
|
|
} |
|
4604
|
|
|
|
|
|
|
// --- R glm.fit IRLS Implementation --- |
|
4605
|
10
|
|
|
|
|
|
mu = (double*)safemalloc(valid_n * sizeof(double)); eta = (double*)safemalloc(valid_n * sizeof(double)); |
|
4606
|
10
|
|
|
|
|
|
W = (double*)safemalloc(valid_n * sizeof(double)); Z = (double*)safemalloc(valid_n * sizeof(double)); |
|
4607
|
10
|
|
|
|
|
|
beta = (double*)safemalloc(p * sizeof(double)); beta_old = (double*)safemalloc(p * sizeof(double)); |
|
4608
|
10
|
|
|
|
|
|
aliased = (bool*)safemalloc(p * sizeof(bool)); |
|
4609
|
10
|
|
|
|
|
|
XtWX = (double*)safemalloc(p * p * sizeof(double)); XtWZ = (double*)safemalloc(p * sizeof(double)); |
|
4610
|
35
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { beta[i] = 0.0; beta_old[i] = 0.0; } |
|
4611
|
|
|
|
|
|
|
// Initialize (mustart / etastart equivalent) |
|
4612
|
10
|
|
|
|
|
|
double sum_y = 0.0; |
|
4613
|
301
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) sum_y += Y[i]; |
|
4614
|
10
|
|
|
|
|
|
double mean_y = sum_y / valid_n; |
|
4615
|
297
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) { |
|
4616
|
288
|
100
|
|
|
|
|
if (is_binomial) { |
|
4617
|
37
|
100
|
|
|
|
|
if (Y[i] < 0.0 || Y[i] > 1.0) croak("glm: binomial family requires response between 0 and 1"); |
|
|
|
50
|
|
|
|
|
|
|
4618
|
36
|
|
|
|
|
|
mu[i] = (Y[i] + 0.5) / 2.0; |
|
4619
|
36
|
|
|
|
|
|
eta[i] = log(mu[i] / (1.0 - mu[i])); |
|
4620
|
36
|
|
|
|
|
|
double dev = 0.0; |
|
4621
|
36
|
100
|
|
|
|
|
if (Y[i] == 0.0) dev = -2.0 * log(1.0 - mu[i]); |
|
4622
|
15
|
50
|
|
|
|
|
else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]); |
|
4623
|
0
|
|
|
|
|
|
else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))); |
|
4624
|
36
|
|
|
|
|
|
deviance_old += dev; |
|
4625
|
|
|
|
|
|
|
} else { |
|
4626
|
251
|
|
|
|
|
|
mu[i] = mean_y; // R gaussian init |
|
4627
|
251
|
|
|
|
|
|
eta[i] = mu[i]; |
|
4628
|
|
|
|
|
|
|
} |
|
4629
|
|
|
|
|
|
|
} |
|
4630
|
|
|
|
|
|
|
// IRLS Loop |
|
4631
|
45
|
50
|
|
|
|
|
for (iter = 1; iter <= max_iter; iter++) { |
|
4632
|
924
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) { |
|
4633
|
879
|
100
|
|
|
|
|
if (is_binomial) { |
|
4634
|
380
|
|
|
|
|
|
double varmu = mu[i] * (1.0 - mu[i]); |
|
4635
|
380
|
|
|
|
|
|
double mu_eta = varmu; // Link derivative for logit |
|
4636
|
380
|
100
|
|
|
|
|
if (varmu < 1e-10) varmu = 1e-10; |
|
4637
|
380
|
|
|
|
|
|
Z[i] = eta[i] + (Y[i] - mu[i]) / mu_eta; |
|
4638
|
380
|
|
|
|
|
|
W[i] = (mu_eta * mu_eta) / varmu; |
|
4639
|
|
|
|
|
|
|
} else { |
|
4640
|
499
|
|
|
|
|
|
W[i] = 1.0; |
|
4641
|
499
|
|
|
|
|
|
Z[i] = Y[i]; |
|
4642
|
|
|
|
|
|
|
} |
|
4643
|
|
|
|
|
|
|
} |
|
4644
|
|
|
|
|
|
|
// Formulate XtWX and XtWZ |
|
4645
|
425
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { XtWZ[i] = 0.0; for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; } |
|
|
|
100
|
|
|
|
|
|
|
4646
|
924
|
100
|
|
|
|
|
for (size_t k = 0; k < valid_n; k++) { |
|
4647
|
879
|
|
|
|
|
|
double w = W[k], z = Z[k]; |
|
4648
|
3298
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { |
|
4649
|
2419
|
|
|
|
|
|
XtWZ[i] += X[k * p + i] * w * z; |
|
4650
|
2419
|
|
|
|
|
|
double xw = X[k * p + i] * w; |
|
4651
|
9246
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j]; |
|
4652
|
|
|
|
|
|
|
} |
|
4653
|
|
|
|
|
|
|
} |
|
4654
|
45
|
|
|
|
|
|
final_rank = sweep_matrix_ols(XtWX, p, aliased); |
|
4655
|
153
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { |
|
4656
|
108
|
50
|
|
|
|
|
if (aliased[i]) { beta[i] = NAN; } else { |
|
4657
|
108
|
|
|
|
|
|
double sum = 0.0; |
|
4658
|
380
|
50
|
|
|
|
|
for (size_t j = 0; j < p; j++) if (!aliased[j]) sum += XtWX[i * p + j] * XtWZ[j]; |
|
|
|
100
|
|
|
|
|
|
|
4659
|
108
|
|
|
|
|
|
beta[i] = sum; |
|
4660
|
|
|
|
|
|
|
} |
|
4661
|
|
|
|
|
|
|
} |
|
4662
|
|
|
|
|
|
|
// Calculate updated ETA, MU, and Deviance (with Step-Halving) |
|
4663
|
45
|
|
|
|
|
|
boundary = FALSE; |
|
4664
|
495
|
100
|
|
|
|
|
for (unsigned short int half = 0; half < 10; half++) { |
|
4665
|
450
|
|
|
|
|
|
deviance_new = 0.0; |
|
4666
|
9240
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) { |
|
4667
|
8790
|
|
|
|
|
|
double linear_pred = 0.0; |
|
4668
|
32980
|
50
|
|
|
|
|
for (size_t j = 0; j < p; j++) if (!aliased[j]) linear_pred += X[i * p + j] * beta[j]; |
|
|
|
100
|
|
|
|
|
|
|
4669
|
8790
|
|
|
|
|
|
eta[i] = linear_pred; |
|
4670
|
8790
|
100
|
|
|
|
|
if (is_binomial) { |
|
4671
|
3800
|
|
|
|
|
|
mu[i] = 1.0 / (1.0 + exp(-eta[i])); |
|
4672
|
|
|
|
|
|
|
// Boundary enforcement |
|
4673
|
3800
|
50
|
|
|
|
|
if (mu[i] < 10 * DBL_EPSILON) mu[i] = 10 * DBL_EPSILON; |
|
4674
|
3800
|
50
|
|
|
|
|
if (mu[i] > 1.0 - 10 * DBL_EPSILON) mu[i] = 1.0 - 10 * DBL_EPSILON; |
|
4675
|
3800
|
|
|
|
|
|
double dev = 0.0; |
|
4676
|
3800
|
100
|
|
|
|
|
if (Y[i] == 0.0) dev = -2.0 * log(1.0 - mu[i]); |
|
4677
|
1630
|
50
|
|
|
|
|
else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]); |
|
4678
|
0
|
|
|
|
|
|
else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))); |
|
4679
|
3800
|
|
|
|
|
|
deviance_new += dev; |
|
4680
|
|
|
|
|
|
|
} else { |
|
4681
|
4990
|
|
|
|
|
|
mu[i] = eta[i]; |
|
4682
|
4990
|
|
|
|
|
|
double res = Y[i] - mu[i]; |
|
4683
|
4990
|
|
|
|
|
|
deviance_new += res * res; |
|
4684
|
|
|
|
|
|
|
} |
|
4685
|
|
|
|
|
|
|
} |
|
4686
|
|
|
|
|
|
|
// Step halving divergence check |
|
4687
|
450
|
100
|
|
|
|
|
if (!is_binomial || deviance_new <= deviance_old + 1e-7 || !isfinite(deviance_new)) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4688
|
440
|
|
|
|
|
|
continue; |
|
4689
|
|
|
|
|
|
|
} |
|
4690
|
10
|
|
|
|
|
|
boundary = TRUE; |
|
4691
|
40
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) beta[j] = (beta[j] + beta_old[j]) / 2.0; |
|
4692
|
|
|
|
|
|
|
} |
|
4693
|
|
|
|
|
|
|
// Convergence Check |
|
4694
|
45
|
100
|
|
|
|
|
if (fabs(deviance_new - deviance_old) / (0.1 + fabs(deviance_new)) < epsilon) { |
|
4695
|
9
|
|
|
|
|
|
converged = TRUE; break; |
|
4696
|
|
|
|
|
|
|
} |
|
4697
|
36
|
|
|
|
|
|
deviance_old = deviance_new; |
|
4698
|
121
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) beta_old[j] = beta[j]; |
|
4699
|
|
|
|
|
|
|
} |
|
4700
|
|
|
|
|
|
|
// Final accurate calculation of W for standard errors |
|
4701
|
95
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; } |
|
|
|
100
|
|
|
|
|
|
|
4702
|
296
|
100
|
|
|
|
|
for (size_t k = 0; k < valid_n; k++) { |
|
4703
|
287
|
100
|
|
|
|
|
double w = is_binomial ? (mu[k] * (1.0 - mu[k])) : 1.0; |
|
4704
|
287
|
100
|
|
|
|
|
if (w < 1e-10) w = 1e-10; |
|
4705
|
1078
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { |
|
4706
|
791
|
|
|
|
|
|
double xw = X[k * p + i] * w; |
|
4707
|
3030
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j]; |
|
4708
|
|
|
|
|
|
|
} |
|
4709
|
|
|
|
|
|
|
} |
|
4710
|
9
|
|
|
|
|
|
final_rank = sweep_matrix_ols(XtWX, p, aliased); |
|
4711
|
|
|
|
|
|
|
// --- Null Deviance Calculation --- |
|
4712
|
|
|
|
|
|
|
// If no intercept, the null model predicts the inverse-link of 0. |
|
4713
|
9
|
100
|
|
|
|
|
double wtdmu = has_intercept ? mean_y : (is_binomial ? 0.5 : 0.0); |
|
|
|
50
|
|
|
|
|
|
|
4714
|
|
|
|
|
|
|
|
|
4715
|
296
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) { |
|
4716
|
287
|
100
|
|
|
|
|
if (is_binomial) { |
|
4717
|
36
|
100
|
|
|
|
|
if (Y[i] == 0.0) null_dev += -2.0 * log(1.0 - wtdmu); |
|
4718
|
15
|
50
|
|
|
|
|
else if (Y[i] == 1.0) null_dev += -2.0 * log(wtdmu); |
|
4719
|
0
|
|
|
|
|
|
else null_dev += 2.0 * (Y[i] * log(Y[i] / wtdmu) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - wtdmu))); |
|
4720
|
|
|
|
|
|
|
} else { |
|
4721
|
251
|
|
|
|
|
|
double diff = Y[i] - wtdmu; |
|
4722
|
251
|
|
|
|
|
|
null_dev += diff * diff; |
|
4723
|
|
|
|
|
|
|
} |
|
4724
|
|
|
|
|
|
|
} |
|
4725
|
|
|
|
|
|
|
// --- AIC Calculation --- |
|
4726
|
9
|
100
|
|
|
|
|
if (is_gaussian) { |
|
4727
|
7
|
|
|
|
|
|
double n_f = (double)valid_n; |
|
4728
|
7
|
|
|
|
|
|
double dev_for_aic = deviance_new; |
|
4729
|
|
|
|
|
|
|
// Guard against perfect fits (deviance == 0.0) causing log(0) = -inf. |
|
4730
|
|
|
|
|
|
|
// R's QR decomposition leaves a noise floor of ~1.0355e-30 for perfect integer fits. |
|
4731
|
|
|
|
|
|
|
// Clamping to this exact boundary replicates R's output of -197.91. |
|
4732
|
7
|
100
|
|
|
|
|
if (dev_for_aic < 1.0355727742801604e-30) { |
|
4733
|
1
|
|
|
|
|
|
dev_for_aic = 1.0355727742801604e-30; |
|
4734
|
|
|
|
|
|
|
} |
|
4735
|
|
|
|
|
|
|
// Mathematically matches R's gaussian()$aic + 2*rank |
|
4736
|
7
|
|
|
|
|
|
aic = n_f * (log(2.0 * M_PI) + 1.0 + log(dev_for_aic / n_f)) + 2.0 * (final_rank + 1.0); |
|
4737
|
2
|
50
|
|
|
|
|
} else if (is_binomial) { |
|
4738
|
2
|
|
|
|
|
|
aic = deviance_new + 2.0 * final_rank; |
|
4739
|
|
|
|
|
|
|
} |
|
4740
|
|
|
|
|
|
|
// --- Return Structures --- |
|
4741
|
9
|
|
|
|
|
|
res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV(); |
|
4742
|
9
|
|
|
|
|
|
df_res = valid_n - final_rank; |
|
4743
|
9
|
100
|
|
|
|
|
dispersion = is_binomial ? 1.0 : ((df_res > 0) ? (deviance_new / df_res) : NAN); |
|
|
|
50
|
|
|
|
|
|
|
4744
|
296
|
100
|
|
|
|
|
for (size_t i = 0; i < valid_n; i++) { |
|
4745
|
287
|
|
|
|
|
|
double res = Y[i] - mu[i]; |
|
4746
|
287
|
100
|
|
|
|
|
if (is_binomial) { |
|
4747
|
|
|
|
|
|
|
// Deviance residuals for binomial |
|
4748
|
36
|
|
|
|
|
|
double d_res = 0.0; |
|
4749
|
36
|
100
|
|
|
|
|
if (Y[i] == 0.0) d_res = sqrt(-2.0 * log(1.0 - mu[i])); |
|
4750
|
15
|
50
|
|
|
|
|
else if (Y[i] == 1.0) d_res = sqrt(-2.0 * log(mu[i])); |
|
4751
|
0
|
|
|
|
|
|
else d_res = sqrt(2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])))); |
|
4752
|
36
|
100
|
|
|
|
|
res = (Y[i] > mu[i]) ? d_res : -d_res; |
|
4753
|
|
|
|
|
|
|
} |
|
4754
|
287
|
|
|
|
|
|
hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(mu[i]), 0); |
|
4755
|
287
|
|
|
|
|
|
hv_store(resid_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0); |
|
4756
|
287
|
|
|
|
|
|
Safefree(valid_row_names[i]); |
|
4757
|
|
|
|
|
|
|
} |
|
4758
|
9
|
|
|
|
|
|
Safefree(valid_row_names); |
|
4759
|
9
|
|
|
|
|
|
summary_hv = newHV(); terms_av = newAV(); |
|
4760
|
32
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
4761
|
23
|
|
|
|
|
|
hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0); |
|
4762
|
23
|
|
|
|
|
|
av_push(terms_av, newSVpv(exp_terms[j], 0)); |
|
4763
|
|
|
|
|
|
|
|
|
4764
|
23
|
|
|
|
|
|
HV *restrict row_hv = newHV(); |
|
4765
|
23
|
50
|
|
|
|
|
if (aliased[j]) { |
|
4766
|
0
|
|
|
|
|
|
hv_store(row_hv, "Estimate", 8, newSVpv("NaN", 0), 0); |
|
4767
|
0
|
|
|
|
|
|
hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0); |
|
4768
|
0
|
0
|
|
|
|
|
hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVpv("NaN", 0), 0); |
|
4769
|
0
|
0
|
|
|
|
|
hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVpv("NaN", 0), 0); |
|
4770
|
|
|
|
|
|
|
} else { |
|
4771
|
23
|
|
|
|
|
|
double se = sqrt(dispersion * XtWX[j * p + j]); |
|
4772
|
23
|
|
|
|
|
|
double val_stat = beta[j] / se; |
|
4773
|
23
|
100
|
|
|
|
|
double p_val = is_binomial ? 2.0 * (1.0 - approx_pnorm(fabs(val_stat))) : get_t_pvalue(val_stat, df_res, "two.sided"); |
|
4774
|
23
|
|
|
|
|
|
hv_store(row_hv, "Estimate", 8, newSVnv(beta[j]), 0); |
|
4775
|
23
|
|
|
|
|
|
hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0); |
|
4776
|
23
|
100
|
|
|
|
|
hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVnv(val_stat), 0); |
|
4777
|
23
|
100
|
|
|
|
|
hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVnv(p_val), 0); |
|
4778
|
|
|
|
|
|
|
} |
|
4779
|
23
|
|
|
|
|
|
hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0); |
|
4780
|
|
|
|
|
|
|
} |
|
4781
|
9
|
|
|
|
|
|
hv_store(res_hv, "aic", 3, newSVnv(aic), 0); |
|
4782
|
9
|
|
|
|
|
|
hv_store(res_hv, "coefficients", 12, newRV_noinc((SV*)coef_hv), 0); |
|
4783
|
9
|
|
|
|
|
|
hv_store(res_hv, "converged", 9, newSVuv(converged ? 1 : 0), 0); |
|
4784
|
9
|
|
|
|
|
|
hv_store(res_hv, "boundary", 8, newSVuv(boundary ? 1 : 0), 0); |
|
4785
|
9
|
|
|
|
|
|
hv_store(res_hv, "deviance", 8, newSVnv(deviance_new), 0); |
|
4786
|
9
|
|
|
|
|
|
hv_store(res_hv, "deviance.resid", 14, newRV_noinc((SV*)resid_hv), 0); |
|
4787
|
9
|
|
|
|
|
|
hv_store(res_hv, "df.null", 7, newSVuv(valid_n - has_intercept), 0); |
|
4788
|
9
|
|
|
|
|
|
hv_store(res_hv, "df.residual", 11, newSVuv(df_res), 0); |
|
4789
|
9
|
|
|
|
|
|
hv_store(res_hv, "family", 6, newSVpv(family_str, 0), 0); |
|
4790
|
9
|
|
|
|
|
|
hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0); |
|
4791
|
9
|
|
|
|
|
|
hv_store(res_hv, "iter", 4, newSVuv(iter > max_iter ? max_iter : iter), 0); |
|
4792
|
9
|
|
|
|
|
|
hv_store(res_hv, "null.deviance", 13, newSVnv(null_dev), 0); |
|
4793
|
9
|
|
|
|
|
|
hv_store(res_hv, "rank", 4, newSVuv(final_rank), 0); |
|
4794
|
9
|
|
|
|
|
|
hv_store(res_hv, "summary", 7, newRV_noinc((SV*)summary_hv), 0); |
|
4795
|
9
|
|
|
|
|
|
hv_store(res_hv, "terms", 5, newRV_noinc((SV*)terms_av), 0); |
|
4796
|
|
|
|
|
|
|
// --- Cleanup --- |
|
4797
|
32
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) Safefree(terms[i]); |
|
4798
|
9
|
|
|
|
|
|
Safefree(terms); |
|
4799
|
32
|
100
|
|
|
|
|
for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); |
|
4800
|
9
|
|
|
|
|
|
Safefree(uniq_terms); |
|
4801
|
32
|
100
|
|
|
|
|
for (size_t j = 0; j < p_exp; j++) { |
|
4802
|
23
|
|
|
|
|
|
Safefree(exp_terms[j]); |
|
4803
|
23
|
100
|
|
|
|
|
if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); } |
|
4804
|
|
|
|
|
|
|
} |
|
4805
|
9
|
|
|
|
|
|
Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level); |
|
4806
|
9
|
|
|
|
|
|
Safefree(mu); Safefree(eta); Safefree(Z); Safefree(W); |
|
4807
|
9
|
|
|
|
|
|
Safefree(beta); Safefree(beta_old); Safefree(aliased); |
|
4808
|
9
|
|
|
|
|
|
Safefree(XtWX); Safefree(XtWZ); Safefree(X); Safefree(Y); |
|
4809
|
9
|
100
|
|
|
|
|
if (row_hashes) Safefree(row_hashes); |
|
4810
|
9
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res_hv); |
|
4811
|
|
|
|
|
|
|
} |
|
4812
|
|
|
|
|
|
|
OUTPUT: |
|
4813
|
|
|
|
|
|
|
RETVAL |
|
4814
|
|
|
|
|
|
|
|
|
4815
|
|
|
|
|
|
|
SV* cor_test(...) |
|
4816
|
|
|
|
|
|
|
CODE: |
|
4817
|
|
|
|
|
|
|
{ |
|
4818
|
12
|
50
|
|
|
|
|
if (items < 2 || items % 2 != 0) |
|
|
|
50
|
|
|
|
|
|
|
4819
|
0
|
|
|
|
|
|
croak("Usage: cor_test(\\@x, \\@y, method => 'pearson', ...)"); |
|
4820
|
12
|
|
|
|
|
|
SV *restrict x_ref = ST(0), *restrict y_ref = ST(1); |
|
4821
|
12
|
|
|
|
|
|
const char *restrict alternative = "two.sided"; |
|
4822
|
12
|
|
|
|
|
|
const char *restrict method = "pearson"; |
|
4823
|
12
|
|
|
|
|
|
SV *restrict exact_sv = NULL; |
|
4824
|
12
|
|
|
|
|
|
double conf_level = 0.95; |
|
4825
|
12
|
|
|
|
|
|
bool continuity = 0; |
|
4826
|
|
|
|
|
|
|
/* Parse named arguments from the flat stack starting at index 2 */ |
|
4827
|
46
|
100
|
|
|
|
|
for (unsigned short int i = 2; i < items; i += 2) { |
|
4828
|
34
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(i)); |
|
4829
|
34
|
|
|
|
|
|
SV *restrict val = ST(i + 1); |
|
4830
|
34
|
100
|
|
|
|
|
if (strEQ(key, "alternative")) alternative = SvPV_nolen(val); |
|
4831
|
27
|
100
|
|
|
|
|
else if (strEQ(key, "method")) method = SvPV_nolen(val); |
|
4832
|
15
|
100
|
|
|
|
|
else if (strEQ(key, "exact")) exact_sv = val; |
|
4833
|
14
|
100
|
|
|
|
|
else if (strEQ(key, "conf.level") || strEQ(key, "conf_level")) conf_level = SvNV(val); |
|
|
|
50
|
|
|
|
|
|
|
4834
|
7
|
50
|
|
|
|
|
else if (strEQ(key, "continuity")) continuity = SvTRUE(val); |
|
4835
|
0
|
|
|
|
|
|
else croak("cor_test: unknown argument '%s'", key); |
|
4836
|
|
|
|
|
|
|
} |
|
4837
|
|
|
|
|
|
|
AV *restrict x_av, *restrict y_av; |
|
4838
|
|
|
|
|
|
|
double *restrict x, *restrict y; |
|
4839
|
12
|
|
|
|
|
|
double estimate = 0, p_value = 0, statistic = 0, df = 0, ci_lower = 0, ci_upper = 0; |
|
4840
|
12
|
|
|
|
|
|
bool is_pearson = (strcmp(method, "pearson") == 0); |
|
4841
|
12
|
|
|
|
|
|
bool is_kendall = (strcmp(method, "kendall") == 0); |
|
4842
|
12
|
|
|
|
|
|
bool is_spearman = (strcmp(method, "spearman") == 0); |
|
4843
|
|
|
|
|
|
|
HV *restrict rhv; |
|
4844
|
12
|
50
|
|
|
|
|
if (!SvOK(x_ref) || !SvROK(x_ref) || SvTYPE(SvRV(x_ref)) != SVt_PVAV || |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4845
|
12
|
50
|
|
|
|
|
!SvOK(y_ref) || !SvROK(y_ref) || SvTYPE(SvRV(y_ref)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4846
|
0
|
|
|
|
|
|
croak("cor_test: x and y must be array references"); |
|
4847
|
|
|
|
|
|
|
} |
|
4848
|
12
|
|
|
|
|
|
x_av = (AV*)SvRV(x_ref); |
|
4849
|
12
|
|
|
|
|
|
y_av = (AV*)SvRV(y_ref); |
|
4850
|
12
|
|
|
|
|
|
size_t n_raw = av_len(x_av) + 1; |
|
4851
|
12
|
50
|
|
|
|
|
if (n_raw != (size_t)(av_len(y_av) + 1)) croak("incompatible dimensions"); |
|
4852
|
12
|
|
|
|
|
|
x = safemalloc(n_raw * sizeof(double)); |
|
4853
|
12
|
|
|
|
|
|
y = safemalloc(n_raw * sizeof(double)); |
|
4854
|
12
|
|
|
|
|
|
size_t n = 0; /* Final count of pairwise complete observations */ |
|
4855
|
281
|
100
|
|
|
|
|
for (size_t i = 0; i < n_raw; i++) { |
|
4856
|
269
|
|
|
|
|
|
SV **restrict x_val = av_fetch(x_av, i, 0); |
|
4857
|
269
|
|
|
|
|
|
SV **restrict y_val = av_fetch(y_av, i, 0); |
|
4858
|
269
|
50
|
|
|
|
|
double xv = (x_val && SvOK(*x_val) && looks_like_number(*x_val)) ? SvNV(*x_val) : NAN; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4859
|
269
|
50
|
|
|
|
|
double yv = (y_val && SvOK(*y_val) && looks_like_number(*y_val)) ? SvNV(*y_val) : NAN; |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4860
|
|
|
|
|
|
|
/* Pairwise complete observations (skips NAs seamlessly like R) */ |
|
4861
|
269
|
100
|
|
|
|
|
if (!isnan(xv) && !isnan(yv)) { |
|
|
|
100
|
|
|
|
|
|
|
4862
|
265
|
|
|
|
|
|
x[n] = xv; |
|
4863
|
265
|
|
|
|
|
|
y[n] = yv; |
|
4864
|
265
|
|
|
|
|
|
n++; |
|
4865
|
|
|
|
|
|
|
} |
|
4866
|
|
|
|
|
|
|
} |
|
4867
|
12
|
50
|
|
|
|
|
if (n < 3) { |
|
4868
|
0
|
|
|
|
|
|
Safefree(x); |
|
4869
|
0
|
|
|
|
|
|
Safefree(y); |
|
4870
|
0
|
|
|
|
|
|
croak("not enough finite observations"); |
|
4871
|
|
|
|
|
|
|
} |
|
4872
|
12
|
100
|
|
|
|
|
if (is_pearson) { |
|
4873
|
|
|
|
|
|
|
/* Welford's one-pass algorithm for Pearson correlation */ |
|
4874
|
6
|
|
|
|
|
|
double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0; |
|
4875
|
36
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4876
|
30
|
|
|
|
|
|
double dx = x[i] - mean_x; |
|
4877
|
30
|
|
|
|
|
|
mean_x += dx / (i + 1); |
|
4878
|
30
|
|
|
|
|
|
double dy = y[i] - mean_y; |
|
4879
|
30
|
|
|
|
|
|
mean_y += dy / (i + 1); |
|
4880
|
30
|
|
|
|
|
|
M2_x += dx * (x[i] - mean_x); |
|
4881
|
30
|
|
|
|
|
|
M2_y += dy * (y[i] - mean_y); |
|
4882
|
30
|
|
|
|
|
|
cov += dx * (y[i] - mean_y); |
|
4883
|
|
|
|
|
|
|
} |
|
4884
|
6
|
50
|
|
|
|
|
estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
4885
|
|
|
|
|
|
|
/* Clamp to [-1, 1] to guard against floating-point overshoot */ |
|
4886
|
6
|
50
|
|
|
|
|
if (estimate > 1.0) estimate = 1.0; |
|
4887
|
6
|
50
|
|
|
|
|
else if (estimate < -1.0) estimate = -1.0; |
|
4888
|
6
|
|
|
|
|
|
df = (double)(n - 2); |
|
4889
|
|
|
|
|
|
|
/* BUG FIX: guard divide-by-zero when |estimate| == 1 exactly. |
|
4890
|
|
|
|
|
|
|
* A perfect correlation gives t = ±Inf, matching R's behaviour. */ |
|
4891
|
6
|
|
|
|
|
|
double denom_t = 1.0 - estimate * estimate; |
|
4892
|
6
|
100
|
|
|
|
|
if (denom_t <= 0.0) |
|
4893
|
2
|
100
|
|
|
|
|
statistic = (estimate > 0.0) ? INFINITY : -INFINITY; |
|
4894
|
|
|
|
|
|
|
else |
|
4895
|
4
|
|
|
|
|
|
statistic = estimate * sqrt(df / denom_t); |
|
4896
|
|
|
|
|
|
|
/* Confidence interval via Fisher's Z transform. |
|
4897
|
|
|
|
|
|
|
* BUG FIX: when |estimate| == 1 the log blows up; clamp first. |
|
4898
|
|
|
|
|
|
|
* We use a half-ULP margin so tanh can recover ±1 cleanly. */ |
|
4899
|
6
|
|
|
|
|
|
double est_clamped = estimate; |
|
4900
|
6
|
100
|
|
|
|
|
if (est_clamped >= 1.0) est_clamped = 1.0 - DBL_EPSILON; |
|
4901
|
5
|
100
|
|
|
|
|
else if (est_clamped <= -1.0) est_clamped = -1.0 + DBL_EPSILON; |
|
4902
|
6
|
|
|
|
|
|
double z = 0.5 * log((1.0 + est_clamped) / (1.0 - est_clamped)); |
|
4903
|
6
|
|
|
|
|
|
double se = 1.0 / sqrt((double)(n - 3)); |
|
4904
|
6
|
|
|
|
|
|
double alpha = 1.0 - conf_level; |
|
4905
|
6
|
|
|
|
|
|
double q = inverse_normal_cdf(1.0 - alpha / 2.0); |
|
4906
|
6
|
|
|
|
|
|
ci_lower = tanh(z - q * se); |
|
4907
|
6
|
|
|
|
|
|
ci_upper = tanh(z + q * se); |
|
4908
|
|
|
|
|
|
|
// High-precision p-value using incomplete beta |
|
4909
|
6
|
|
|
|
|
|
p_value = get_t_pvalue(statistic, df, alternative); |
|
4910
|
6
|
100
|
|
|
|
|
} else if (is_kendall) { |
|
4911
|
|
|
|
|
|
|
// BUG FIX: use long to avoid int overflow for large n |
|
4912
|
3
|
|
|
|
|
|
long c = 0, d = 0, tie_x = 0, tie_y = 0; |
|
4913
|
210
|
100
|
|
|
|
|
for (size_t i = 0; i < n - 1; i++) { |
|
4914
|
20127
|
100
|
|
|
|
|
for (size_t j = i + 1; j < n; j++) { |
|
4915
|
19920
|
|
|
|
|
|
double sign_x = (x[i] > x[j]) - (x[i] < x[j]); |
|
4916
|
19920
|
|
|
|
|
|
double sign_y = (y[i] > y[j]) - (y[i] < y[j]); |
|
4917
|
19920
|
50
|
|
|
|
|
if (sign_x == 0 && sign_y == 0) { /* joint tie — ignore */ } |
|
|
|
0
|
|
|
|
|
|
|
4918
|
19920
|
50
|
|
|
|
|
else if (sign_x == 0) tie_x++; |
|
4919
|
19920
|
50
|
|
|
|
|
else if (sign_y == 0) tie_y++; |
|
4920
|
19920
|
100
|
|
|
|
|
else if (sign_x * sign_y > 0) c++; |
|
4921
|
19904
|
|
|
|
|
|
else d++; |
|
4922
|
|
|
|
|
|
|
} |
|
4923
|
|
|
|
|
|
|
} |
|
4924
|
3
|
|
|
|
|
|
double denom = sqrt((double)(c + d + tie_x) * (double)(c + d + tie_y)); |
|
4925
|
|
|
|
|
|
|
// BUG FIX: use NAN (from ) instead of 0.0/0.0 (UB in C) |
|
4926
|
3
|
50
|
|
|
|
|
estimate = (denom == 0.0) ? NAN : (double)(c - d) / denom; |
|
4927
|
3
|
50
|
|
|
|
|
bool has_ties = (tie_x > 0 || tie_y > 0); |
|
|
|
50
|
|
|
|
|
|
|
4928
|
|
|
|
|
|
|
bool do_exact; |
|
4929
|
|
|
|
|
|
|
/* Mirror R: exact defaults to TRUE if n < 50 and no ties */ |
|
4930
|
3
|
100
|
|
|
|
|
if (!exact_sv || !SvOK(exact_sv)) |
|
|
|
50
|
|
|
|
|
|
|
4931
|
2
|
50
|
|
|
|
|
do_exact = (n < 50) && !has_ties; |
|
|
|
50
|
|
|
|
|
|
|
4932
|
|
|
|
|
|
|
else |
|
4933
|
1
|
|
|
|
|
|
do_exact = SvTRUE(exact_sv) ? 1 : 0; |
|
4934
|
|
|
|
|
|
|
/* R overrides forced-exact back to approximation when ties exist */ |
|
4935
|
3
|
100
|
|
|
|
|
if (do_exact && has_ties) do_exact = 0; |
|
|
|
50
|
|
|
|
|
|
|
4936
|
3
|
100
|
|
|
|
|
if (do_exact) { |
|
4937
|
2
|
|
|
|
|
|
double S_stat = (double)(c - d); |
|
4938
|
2
|
|
|
|
|
|
statistic = (double)c; |
|
4939
|
2
|
|
|
|
|
|
p_value = kendall_exact_pvalue(n, S_stat, alternative); |
|
4940
|
|
|
|
|
|
|
} else { |
|
4941
|
|
|
|
|
|
|
/* Normal approximation for large n or when ties are present */ |
|
4942
|
1
|
|
|
|
|
|
double var_S = (double)n * (double)(n - 1) * (2.0 * (double)n + 5.0) / 18.0; |
|
4943
|
1
|
|
|
|
|
|
double S = (double)(c - d); |
|
4944
|
1
|
50
|
|
|
|
|
if (continuity) S -= (S > 0.0 ? 1.0 : -1.0); |
|
|
|
0
|
|
|
|
|
|
|
4945
|
1
|
|
|
|
|
|
statistic = S / sqrt(var_S); |
|
4946
|
|
|
|
|
|
|
|
|
4947
|
1
|
50
|
|
|
|
|
if (strcmp(alternative, "two.sided") == 0) |
|
4948
|
1
|
|
|
|
|
|
p_value = 2.0 * (1.0 - approx_pnorm(fabs(statistic))); |
|
4949
|
0
|
0
|
|
|
|
|
else if (strcmp(alternative, "less") == 0) |
|
4950
|
0
|
|
|
|
|
|
p_value = approx_pnorm(statistic); |
|
4951
|
|
|
|
|
|
|
else |
|
4952
|
0
|
|
|
|
|
|
p_value = 1.0 - approx_pnorm(statistic); |
|
4953
|
|
|
|
|
|
|
} |
|
4954
|
|
|
|
|
|
|
|
|
4955
|
3
|
50
|
|
|
|
|
} else if (is_spearman) { |
|
4956
|
3
|
|
|
|
|
|
double *restrict rank_x = safemalloc(n * sizeof(double)); |
|
4957
|
3
|
|
|
|
|
|
double *restrict rank_y = safemalloc(n * sizeof(double)); |
|
4958
|
3
|
|
|
|
|
|
compute_ranks(x, rank_x, n); |
|
4959
|
3
|
|
|
|
|
|
compute_ranks(y, rank_y, n); |
|
4960
|
|
|
|
|
|
|
|
|
4961
|
|
|
|
|
|
|
/* Spearman rho = Pearson r of the ranks (Welford's algorithm) */ |
|
4962
|
3
|
|
|
|
|
|
double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0; |
|
4963
|
28
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4964
|
25
|
|
|
|
|
|
double dx = rank_x[i] - mean_x; |
|
4965
|
25
|
|
|
|
|
|
mean_x += dx / (i + 1); |
|
4966
|
25
|
|
|
|
|
|
double dy = rank_y[i] - mean_y; |
|
4967
|
25
|
|
|
|
|
|
mean_y += dy / (i + 1); |
|
4968
|
25
|
|
|
|
|
|
M2_x += dx * (rank_x[i] - mean_x); |
|
4969
|
25
|
|
|
|
|
|
M2_y += dy * (rank_y[i] - mean_y); |
|
4970
|
25
|
|
|
|
|
|
cov += dx * (rank_y[i] - mean_y); |
|
4971
|
|
|
|
|
|
|
} |
|
4972
|
3
|
50
|
|
|
|
|
estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
4973
|
|
|
|
|
|
|
|
|
4974
|
|
|
|
|
|
|
/* Clamp to [-1, 1] to guard against floating-point overshoot */ |
|
4975
|
3
|
50
|
|
|
|
|
if (estimate > 1.0) estimate = 1.0; |
|
4976
|
3
|
50
|
|
|
|
|
else if (estimate < -1.0) estimate = -1.0; |
|
4977
|
|
|
|
|
|
|
|
|
4978
|
|
|
|
|
|
|
/* S = sum of squared rank differences (R's reported statistic) */ |
|
4979
|
3
|
|
|
|
|
|
double S_stat = 0.0; |
|
4980
|
28
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4981
|
25
|
|
|
|
|
|
double diff = rank_x[i] - rank_y[i]; |
|
4982
|
25
|
|
|
|
|
|
S_stat += diff * diff; |
|
4983
|
|
|
|
|
|
|
} |
|
4984
|
|
|
|
|
|
|
|
|
4985
|
|
|
|
|
|
|
/* Ties produce fractional (averaged) ranks — detect them */ |
|
4986
|
3
|
|
|
|
|
|
bool has_ties = 0; |
|
4987
|
28
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
4988
|
25
|
50
|
|
|
|
|
if (rank_x[i] != floor(rank_x[i]) || rank_y[i] != floor(rank_y[i])) { |
|
|
|
50
|
|
|
|
|
|
|
4989
|
0
|
|
|
|
|
|
has_ties = 1; |
|
4990
|
0
|
|
|
|
|
|
break; |
|
4991
|
|
|
|
|
|
|
} |
|
4992
|
|
|
|
|
|
|
} |
|
4993
|
|
|
|
|
|
|
|
|
4994
|
|
|
|
|
|
|
bool do_exact; |
|
4995
|
3
|
50
|
|
|
|
|
if (!exact_sv || !SvOK(exact_sv)) |
|
|
|
0
|
|
|
|
|
|
|
4996
|
3
|
100
|
|
|
|
|
do_exact = (n < 10) && !has_ties; |
|
|
|
50
|
|
|
|
|
|
|
4997
|
|
|
|
|
|
|
else |
|
4998
|
0
|
|
|
|
|
|
do_exact = SvTRUE(exact_sv) ? 1 : 0; |
|
4999
|
|
|
|
|
|
|
|
|
5000
|
3
|
100
|
|
|
|
|
if (do_exact) { |
|
5001
|
1
|
|
|
|
|
|
statistic = S_stat; |
|
5002
|
1
|
|
|
|
|
|
p_value = spearman_exact_pvalue(S_stat, n, alternative); |
|
5003
|
|
|
|
|
|
|
} else { |
|
5004
|
2
|
|
|
|
|
|
double r = estimate; |
|
5005
|
|
|
|
|
|
|
/* NOTE: R silently ignores continuity correction for Spearman. |
|
5006
|
|
|
|
|
|
|
* The adjustment below is non-standard; a warning is emitted |
|
5007
|
|
|
|
|
|
|
* so callers are not silently misled. */ |
|
5008
|
2
|
50
|
|
|
|
|
if (continuity) { |
|
5009
|
0
|
|
|
|
|
|
warn("cor_test: continuity correction is not defined for Spearman in R and is ignored here"); |
|
5010
|
|
|
|
|
|
|
} |
|
5011
|
|
|
|
|
|
|
/* BUG FIX: guard divide-by-zero when |r| == 1 exactly */ |
|
5012
|
2
|
|
|
|
|
|
double denom_t = 1.0 - r * r; |
|
5013
|
2
|
50
|
|
|
|
|
if (denom_t <= 0.0) |
|
5014
|
2
|
100
|
|
|
|
|
statistic = (r > 0.0) ? INFINITY : -INFINITY; |
|
5015
|
|
|
|
|
|
|
else |
|
5016
|
0
|
|
|
|
|
|
statistic = r * sqrt((double)(n - 2) / denom_t); |
|
5017
|
2
|
|
|
|
|
|
p_value = get_t_pvalue(statistic, (double)(n - 2), alternative); |
|
5018
|
|
|
|
|
|
|
} |
|
5019
|
3
|
|
|
|
|
|
Safefree(rank_x); |
|
5020
|
3
|
|
|
|
|
|
Safefree(rank_y); |
|
5021
|
|
|
|
|
|
|
|
|
5022
|
|
|
|
|
|
|
} else { |
|
5023
|
0
|
|
|
|
|
|
Safefree(x); |
|
5024
|
0
|
|
|
|
|
|
Safefree(y); |
|
5025
|
0
|
|
|
|
|
|
croak("Unknown method '%s': must be 'pearson', 'kendall', or 'spearman'", method); |
|
5026
|
|
|
|
|
|
|
} |
|
5027
|
|
|
|
|
|
|
|
|
5028
|
12
|
|
|
|
|
|
Safefree(x); |
|
5029
|
12
|
|
|
|
|
|
Safefree(y); |
|
5030
|
|
|
|
|
|
|
|
|
5031
|
12
|
|
|
|
|
|
rhv = newHV(); |
|
5032
|
12
|
|
|
|
|
|
hv_stores(rhv, "estimate", newSVnv(estimate)); |
|
5033
|
12
|
|
|
|
|
|
hv_stores(rhv, "p.value", newSVnv(p_value)); |
|
5034
|
12
|
|
|
|
|
|
hv_stores(rhv, "statistic", newSVnv(statistic)); |
|
5035
|
12
|
|
|
|
|
|
hv_stores(rhv, "method", newSVpv(method, 0)); |
|
5036
|
12
|
|
|
|
|
|
hv_stores(rhv, "alternative", newSVpv(alternative, 0)); |
|
5037
|
12
|
100
|
|
|
|
|
if (is_pearson) { |
|
5038
|
6
|
|
|
|
|
|
hv_stores(rhv, "parameter", newSVnv(df)); |
|
5039
|
6
|
|
|
|
|
|
AV *restrict ci_av = newAV(); |
|
5040
|
6
|
|
|
|
|
|
av_push(ci_av, newSVnv(ci_lower)); |
|
5041
|
6
|
|
|
|
|
|
av_push(ci_av, newSVnv(ci_upper)); |
|
5042
|
6
|
|
|
|
|
|
hv_stores(rhv, "conf.int", newRV_noinc((SV*)ci_av)); |
|
5043
|
|
|
|
|
|
|
} |
|
5044
|
|
|
|
|
|
|
|
|
5045
|
12
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)rhv); |
|
5046
|
|
|
|
|
|
|
} |
|
5047
|
|
|
|
|
|
|
OUTPUT: |
|
5048
|
|
|
|
|
|
|
RETVAL |
|
5049
|
|
|
|
|
|
|
|
|
5050
|
|
|
|
|
|
|
void shapiro_test(data) |
|
5051
|
|
|
|
|
|
|
SV *data |
|
5052
|
|
|
|
|
|
|
PREINIT: |
|
5053
|
|
|
|
|
|
|
AV *restrict av; |
|
5054
|
|
|
|
|
|
|
HV *restrict ret_hash; |
|
5055
|
2
|
|
|
|
|
|
size_t n_raw, n = 0; |
|
5056
|
2
|
|
|
|
|
|
double *restrict x, w = 0.0, p_val = 0.0, mean = 0.0, ssq = 0.0; |
|
5057
|
|
|
|
|
|
|
PPCODE: |
|
5058
|
2
|
50
|
|
|
|
|
if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5059
|
0
|
|
|
|
|
|
croak("Expected an array reference"); |
|
5060
|
|
|
|
|
|
|
} |
|
5061
|
|
|
|
|
|
|
|
|
5062
|
2
|
|
|
|
|
|
av = (AV *)SvRV(data); |
|
5063
|
2
|
|
|
|
|
|
n_raw = av_len(av) + 1; |
|
5064
|
|
|
|
|
|
|
|
|
5065
|
2
|
50
|
|
|
|
|
Newx(x, n_raw, double); |
|
5066
|
|
|
|
|
|
|
|
|
5067
|
|
|
|
|
|
|
// Extract variables and calculate mean (skipping undefined/NaN values) |
|
5068
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < n_raw; i++) { |
|
5069
|
24
|
|
|
|
|
|
SV **restrict elem = av_fetch(av, i, 0); |
|
5070
|
24
|
50
|
|
|
|
|
if (elem && SvOK(*elem)) { |
|
|
|
50
|
|
|
|
|
|
|
5071
|
24
|
|
|
|
|
|
double val = SvNV(*elem); |
|
5072
|
24
|
50
|
|
|
|
|
if (!isnan(val)) { |
|
5073
|
24
|
|
|
|
|
|
x[n] = val; |
|
5074
|
24
|
|
|
|
|
|
mean += val; |
|
5075
|
24
|
|
|
|
|
|
n++; |
|
5076
|
|
|
|
|
|
|
} |
|
5077
|
|
|
|
|
|
|
} |
|
5078
|
|
|
|
|
|
|
} |
|
5079
|
|
|
|
|
|
|
|
|
5080
|
2
|
50
|
|
|
|
|
if (n < 3 || n > 5000) { |
|
|
|
50
|
|
|
|
|
|
|
5081
|
0
|
|
|
|
|
|
Safefree(x); |
|
5082
|
0
|
|
|
|
|
|
croak("Sample size must be between 3 and 5000 (R's limit)"); |
|
5083
|
|
|
|
|
|
|
} |
|
5084
|
|
|
|
|
|
|
|
|
5085
|
2
|
|
|
|
|
|
mean /= n; |
|
5086
|
|
|
|
|
|
|
// Calculate Sum of Squares |
|
5087
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
5088
|
24
|
|
|
|
|
|
ssq += (x[i] - mean) * (x[i] - mean); |
|
5089
|
|
|
|
|
|
|
} |
|
5090
|
2
|
50
|
|
|
|
|
if (ssq == 0.0) { |
|
5091
|
0
|
|
|
|
|
|
Safefree(x); |
|
5092
|
0
|
|
|
|
|
|
croak("Data is perfectly constant; cannot compute Shapiro-Wilk test"); |
|
5093
|
|
|
|
|
|
|
} |
|
5094
|
2
|
|
|
|
|
|
qsort(x, n, sizeof(double), compare_doubles); |
|
5095
|
|
|
|
|
|
|
// --- Core AS R94 Algorithm: Weights and Statistic W |
|
5096
|
2
|
50
|
|
|
|
|
if (n == 3) { |
|
5097
|
0
|
|
|
|
|
|
double a_val = 0.7071067811865475; // sqrt(1/2) |
|
5098
|
0
|
|
|
|
|
|
double b_val = a_val * (x[2] - x[0]); |
|
5099
|
0
|
|
|
|
|
|
w = (b_val * b_val) / ssq; |
|
5100
|
0
|
0
|
|
|
|
|
if (w < 0.75) w = 0.75; |
|
5101
|
|
|
|
|
|
|
// Exact P-value for n=3 |
|
5102
|
0
|
|
|
|
|
|
p_val = 1.90985931710274 * (asin(sqrt(w)) - 1.04719755119660); |
|
5103
|
|
|
|
|
|
|
} else { |
|
5104
|
|
|
|
|
|
|
double *restrict m, *restrict a; |
|
5105
|
2
|
|
|
|
|
|
double sum_m2 = 0.0, b_val = 0.0; |
|
5106
|
2
|
50
|
|
|
|
|
Newx(m, n, double); |
|
5107
|
2
|
50
|
|
|
|
|
Newx(a, n, double); |
|
5108
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
5109
|
24
|
|
|
|
|
|
m[i] = inverse_normal_cdf((i + 1.0 - 0.375) / (n + 0.25)); |
|
5110
|
24
|
|
|
|
|
|
sum_m2 += m[i] * m[i]; |
|
5111
|
|
|
|
|
|
|
} |
|
5112
|
2
|
|
|
|
|
|
double u = 1.0 / sqrt((double)n); |
|
5113
|
2
|
|
|
|
|
|
double a_n = -2.706056*pow(u,5) + 4.434685*pow(u,4) - 2.071190*pow(u,3) - 0.147981*pow(u,2) + 0.221157*u + m[n-1]/sqrt(sum_m2); |
|
5114
|
2
|
|
|
|
|
|
a[n-1] = a_n; |
|
5115
|
2
|
|
|
|
|
|
a[0] = -a_n; |
|
5116
|
3
|
50
|
|
|
|
|
if (n == 4 || n == 5) { |
|
|
|
100
|
|
|
|
|
|
|
5117
|
1
|
|
|
|
|
|
double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1]) / (1.0 - 2.0 * a_n*a_n); |
|
5118
|
4
|
100
|
|
|
|
|
for (unsigned int i = 1; i < n-1; i++) { |
|
5119
|
3
|
|
|
|
|
|
a[i] = m[i] / sqrt(eps); |
|
5120
|
|
|
|
|
|
|
} |
|
5121
|
|
|
|
|
|
|
} else { |
|
5122
|
1
|
|
|
|
|
|
double a_n1 = -3.582633*pow(u,5) + 5.682633*pow(u,4) - 1.752461*pow(u,3) - 0.293762*pow(u,2) + 0.042981*u + m[n-2]/sqrt(sum_m2); |
|
5123
|
1
|
|
|
|
|
|
a[n-2] = a_n1; |
|
5124
|
1
|
|
|
|
|
|
a[1] = -a_n1; |
|
5125
|
1
|
|
|
|
|
|
double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1] - 2.0 * m[n-2]*m[n-2]) / (1.0 - 2.0 * a_n*a_n - 2.0 * a_n1*a_n1); |
|
5126
|
16
|
100
|
|
|
|
|
for (unsigned int i = 2; i < n-2; i++) { |
|
5127
|
15
|
|
|
|
|
|
a[i] = m[i] / sqrt(eps); |
|
5128
|
|
|
|
|
|
|
} |
|
5129
|
|
|
|
|
|
|
} |
|
5130
|
26
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
5131
|
24
|
|
|
|
|
|
b_val += a[i] * x[i]; |
|
5132
|
|
|
|
|
|
|
} |
|
5133
|
2
|
|
|
|
|
|
w = (b_val * b_val) / ssq; |
|
5134
|
|
|
|
|
|
|
// --- AS R94 P-Value Calculation: High Precision Refinement --- |
|
5135
|
|
|
|
|
|
|
/* NOTE: p_val is declared in PREINIT above; |
|
5136
|
|
|
|
|
|
|
* do NOT shadow it with a local 'double p_val' here or the result will never reach the caller. |
|
5137
|
|
|
|
|
|
|
*/ |
|
5138
|
2
|
|
|
|
|
|
double y = log(1.0 - w); |
|
5139
|
|
|
|
|
|
|
double z; |
|
5140
|
2
|
100
|
|
|
|
|
if (n <= 11) { |
|
5141
|
|
|
|
|
|
|
// Royston's branch for 4 <= n <= 11 (AS R94, small-sample path). |
|
5142
|
|
|
|
|
|
|
// gamma is the upper bound on y = log(1-W); |
|
5143
|
|
|
|
|
|
|
// if y reaches gamma the p-value is essentially zero |
|
5144
|
1
|
|
|
|
|
|
double nn = (double)n; |
|
5145
|
1
|
|
|
|
|
|
double gamma = 0.459 * nn - 2.273; |
|
5146
|
1
|
50
|
|
|
|
|
if (y >= gamma) { |
|
5147
|
0
|
|
|
|
|
|
p_val = 1e-19; |
|
5148
|
|
|
|
|
|
|
} else { |
|
5149
|
|
|
|
|
|
|
// Horner-form polynomials in n for mu and log(sigma) |
|
5150
|
1
|
|
|
|
|
|
double mu = 0.544 + nn * (-0.39978 + nn * ( 0.025054 - nn * 0.0006714)); |
|
5151
|
1
|
|
|
|
|
|
double sig_val= 1.3822 + nn * (-0.77857 + nn * ( 0.062767 - nn * 0.0020322)); |
|
5152
|
1
|
|
|
|
|
|
double sigma = exp(sig_val); |
|
5153
|
1
|
|
|
|
|
|
z = (-log(gamma - y) - mu) / sigma; |
|
5154
|
|
|
|
|
|
|
/* Upper-tail probability P(Z > z): small W → large z → small p-value. |
|
5155
|
|
|
|
|
|
|
*/ |
|
5156
|
1
|
|
|
|
|
|
p_val = 0.5 * erfc(z * M_SQRT1_2); |
|
5157
|
|
|
|
|
|
|
} |
|
5158
|
|
|
|
|
|
|
} else { |
|
5159
|
|
|
|
|
|
|
// Royston's branch for n >= 12 (AS R94, large-sample path) |
|
5160
|
1
|
|
|
|
|
|
double ln_n = log((double)n); |
|
5161
|
|
|
|
|
|
|
// Horner-form polynomials in log(n) for mu and log(sigma). */ |
|
5162
|
1
|
|
|
|
|
|
double mu = -1.5861 + ln_n * (-0.31082 + ln_n * (-0.083751 + ln_n * 0.0038915)); |
|
5163
|
1
|
|
|
|
|
|
double sig_val= -0.4803 + ln_n * (-0.082676 + ln_n * 0.0030302); |
|
5164
|
1
|
|
|
|
|
|
double sigma = exp(sig_val); |
|
5165
|
1
|
|
|
|
|
|
z = (y - mu) / sigma; |
|
5166
|
1
|
|
|
|
|
|
p_val = 0.5 * erfc(z * M_SQRT1_2); |
|
5167
|
|
|
|
|
|
|
} |
|
5168
|
|
|
|
|
|
|
// Clamp the p-value |
|
5169
|
2
|
50
|
|
|
|
|
if (p_val > 1.0) p_val = 1.0; |
|
5170
|
2
|
50
|
|
|
|
|
if (p_val < 0.0) p_val = 0.0; |
|
5171
|
2
|
|
|
|
|
|
Safefree(m); m = NULL; Safefree(a); a = NULL; |
|
5172
|
|
|
|
|
|
|
} |
|
5173
|
2
|
|
|
|
|
|
Safefree(x); x = NULL; |
|
5174
|
2
|
|
|
|
|
|
ret_hash = newHV(); |
|
5175
|
2
|
|
|
|
|
|
hv_stores(ret_hash, "statistic", newSVnv(w)); |
|
5176
|
2
|
|
|
|
|
|
hv_stores(ret_hash, "W", newSVnv(w)); |
|
5177
|
2
|
|
|
|
|
|
hv_stores(ret_hash, "p_value", newSVnv(p_val)); |
|
5178
|
2
|
|
|
|
|
|
hv_stores(ret_hash, "p.value", newSVnv(p_val)); |
|
5179
|
2
|
50
|
|
|
|
|
EXTEND(SP, 1); |
|
5180
|
2
|
|
|
|
|
|
PUSHs(sv_2mortal(newRV_noinc((SV *)ret_hash))); |
|
5181
|
|
|
|
|
|
|
|
|
5182
|
|
|
|
|
|
|
double min(...) |
|
5183
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5184
|
|
|
|
|
|
|
INIT: |
|
5185
|
19
|
|
|
|
|
|
NV min_val = 0.0; |
|
5186
|
19
|
|
|
|
|
|
size_t count = 0; |
|
5187
|
19
|
|
|
|
|
|
bool first = TRUE; |
|
5188
|
|
|
|
|
|
|
CODE: |
|
5189
|
10052
|
100
|
|
|
|
|
for (unsigned short int i = 0; i < items; i++) { |
|
5190
|
10035
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
5191
|
10045
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5192
|
11
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
5193
|
11
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5194
|
466
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5195
|
456
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
5196
|
456
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
5197
|
455
|
|
|
|
|
|
NV val = SvNV(*tv); |
|
5198
|
455
|
100
|
|
|
|
|
if (first || val < min_val) { |
|
|
|
100
|
|
|
|
|
|
|
5199
|
23
|
|
|
|
|
|
min_val = val; |
|
5200
|
23
|
|
|
|
|
|
first = FALSE; |
|
5201
|
|
|
|
|
|
|
} |
|
5202
|
455
|
|
|
|
|
|
count++; |
|
5203
|
|
|
|
|
|
|
} else { |
|
5204
|
1
|
|
|
|
|
|
croak("min: undefined value at array ref index %zu (argument %d)", j, (int)i); |
|
5205
|
|
|
|
|
|
|
} |
|
5206
|
|
|
|
|
|
|
} |
|
5207
|
10024
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5208
|
10023
|
|
|
|
|
|
NV val = SvNV(arg); |
|
5209
|
10023
|
100
|
|
|
|
|
if (first || val < min_val) { |
|
|
|
100
|
|
|
|
|
|
|
5210
|
21
|
|
|
|
|
|
min_val = val; |
|
5211
|
21
|
|
|
|
|
|
first = FALSE; |
|
5212
|
|
|
|
|
|
|
} |
|
5213
|
10023
|
|
|
|
|
|
count++; |
|
5214
|
|
|
|
|
|
|
} else { |
|
5215
|
1
|
|
|
|
|
|
croak("min: undefined value at argument index %d", (int)i); |
|
5216
|
|
|
|
|
|
|
} |
|
5217
|
|
|
|
|
|
|
} |
|
5218
|
17
|
100
|
|
|
|
|
if (count == 0) croak("min needs >= 1 numeric element"); |
|
5219
|
16
|
100
|
|
|
|
|
RETVAL = min_val; |
|
5220
|
|
|
|
|
|
|
OUTPUT: |
|
5221
|
|
|
|
|
|
|
RETVAL |
|
5222
|
|
|
|
|
|
|
|
|
5223
|
|
|
|
|
|
|
double max(...) |
|
5224
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5225
|
|
|
|
|
|
|
INIT: |
|
5226
|
20
|
|
|
|
|
|
NV max_val = 0.0; |
|
5227
|
20
|
|
|
|
|
|
size_t count = 0; |
|
5228
|
20
|
|
|
|
|
|
bool first = TRUE; |
|
5229
|
|
|
|
|
|
|
CODE: |
|
5230
|
10053
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
5231
|
10035
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
5232
|
10046
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5233
|
12
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
5234
|
12
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5235
|
567
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5236
|
556
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
5237
|
556
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
5238
|
555
|
|
|
|
|
|
NV val = SvNV(*tv); |
|
5239
|
555
|
100
|
|
|
|
|
if (first || val > max_val) { |
|
|
|
100
|
|
|
|
|
|
|
5240
|
50
|
|
|
|
|
|
max_val = val; |
|
5241
|
50
|
|
|
|
|
|
first = FALSE; |
|
5242
|
|
|
|
|
|
|
} |
|
5243
|
555
|
|
|
|
|
|
count++; |
|
5244
|
|
|
|
|
|
|
} else { |
|
5245
|
1
|
|
|
|
|
|
croak("max: undefined value at array ref index %zu (argument %zu)", j, i); |
|
5246
|
|
|
|
|
|
|
} |
|
5247
|
|
|
|
|
|
|
} |
|
5248
|
10023
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5249
|
10022
|
|
|
|
|
|
NV val = SvNV(arg); |
|
5250
|
10022
|
100
|
|
|
|
|
if (first || val > max_val) { |
|
|
|
100
|
|
|
|
|
|
|
5251
|
29
|
|
|
|
|
|
max_val = val; |
|
5252
|
29
|
|
|
|
|
|
first = FALSE; |
|
5253
|
|
|
|
|
|
|
} |
|
5254
|
10022
|
|
|
|
|
|
count++; |
|
5255
|
|
|
|
|
|
|
} else { |
|
5256
|
1
|
|
|
|
|
|
croak("max: undefined value at argument index %zu", i); |
|
5257
|
|
|
|
|
|
|
} |
|
5258
|
|
|
|
|
|
|
} |
|
5259
|
18
|
100
|
|
|
|
|
if (count == 0) croak("max needs >= 1 numeric element"); |
|
5260
|
17
|
100
|
|
|
|
|
RETVAL = max_val; |
|
5261
|
|
|
|
|
|
|
OUTPUT: |
|
5262
|
|
|
|
|
|
|
RETVAL |
|
5263
|
|
|
|
|
|
|
|
|
5264
|
|
|
|
|
|
|
SV* runif(...) |
|
5265
|
|
|
|
|
|
|
CODE: |
|
5266
|
|
|
|
|
|
|
{ |
|
5267
|
11
|
|
|
|
|
|
size_t n = 0; |
|
5268
|
11
|
|
|
|
|
|
NV min = 0.0, max = 1.0; |
|
5269
|
|
|
|
|
|
|
|
|
5270
|
|
|
|
|
|
|
// Flags to track what has been assigned |
|
5271
|
11
|
|
|
|
|
|
bool n_set = 0, min_set = 0, max_set = 0; |
|
5272
|
|
|
|
|
|
|
|
|
5273
|
11
|
|
|
|
|
|
unsigned int i = 0; |
|
5274
|
|
|
|
|
|
|
|
|
5275
|
11
|
50
|
|
|
|
|
if (items == 0) { |
|
5276
|
0
|
|
|
|
|
|
croak("Usage: runif(n, [min=0], [max=1]) or runif(n => $n, ...)"); |
|
5277
|
|
|
|
|
|
|
} |
|
5278
|
|
|
|
|
|
|
|
|
5279
|
28
|
100
|
|
|
|
|
while (i < items) { |
|
5280
|
|
|
|
|
|
|
// 1. Check if the current argument is a string key for a named parameter |
|
5281
|
17
|
100
|
|
|
|
|
if (i + 1 < items && SvPOK(ST(i))) { |
|
|
|
100
|
|
|
|
|
|
|
5282
|
6
|
|
|
|
|
|
char *restrict key = SvPV_nolen(ST(i)); |
|
5283
|
6
|
100
|
|
|
|
|
if (strEQ(key, "n")) { |
|
5284
|
2
|
|
|
|
|
|
n = (size_t)SvUV(ST(i+1)); |
|
5285
|
2
|
|
|
|
|
|
n_set = 1; |
|
5286
|
2
|
|
|
|
|
|
i += 2; |
|
5287
|
2
|
|
|
|
|
|
continue; |
|
5288
|
4
|
100
|
|
|
|
|
} else if (strEQ(key, "min")) { |
|
5289
|
2
|
|
|
|
|
|
min = SvNV(ST(i+1)); |
|
5290
|
2
|
|
|
|
|
|
min_set = 1; |
|
5291
|
2
|
|
|
|
|
|
i += 2; |
|
5292
|
2
|
|
|
|
|
|
continue; |
|
5293
|
2
|
50
|
|
|
|
|
} else if (strEQ(key, "max")) { |
|
5294
|
2
|
|
|
|
|
|
max = SvNV(ST(i+1)); |
|
5295
|
2
|
|
|
|
|
|
max_set = 1; |
|
5296
|
2
|
|
|
|
|
|
i += 2; |
|
5297
|
2
|
|
|
|
|
|
continue; |
|
5298
|
|
|
|
|
|
|
} |
|
5299
|
|
|
|
|
|
|
} |
|
5300
|
|
|
|
|
|
|
|
|
5301
|
|
|
|
|
|
|
// 2. Fallback to positional parsing if it's not a recognized key |
|
5302
|
11
|
100
|
|
|
|
|
if (!n_set) { |
|
5303
|
9
|
|
|
|
|
|
n = (size_t)SvUV(ST(i)); |
|
5304
|
9
|
|
|
|
|
|
n_set = 1; |
|
5305
|
2
|
100
|
|
|
|
|
} else if (!min_set) { |
|
5306
|
1
|
|
|
|
|
|
min = SvNV(ST(i)); |
|
5307
|
1
|
|
|
|
|
|
min_set = 1; |
|
5308
|
1
|
50
|
|
|
|
|
} else if (!max_set) { |
|
5309
|
1
|
|
|
|
|
|
max = SvNV(ST(i)); |
|
5310
|
1
|
|
|
|
|
|
max_set = 1; |
|
5311
|
|
|
|
|
|
|
} else { |
|
5312
|
0
|
|
|
|
|
|
croak("Too many arguments or unrecognized parameter passed to runif()"); |
|
5313
|
|
|
|
|
|
|
} |
|
5314
|
11
|
|
|
|
|
|
i++; |
|
5315
|
|
|
|
|
|
|
} |
|
5316
|
11
|
50
|
|
|
|
|
if (!n_set) { |
|
5317
|
0
|
|
|
|
|
|
croak("runif() requires at least the 'n' parameter"); |
|
5318
|
|
|
|
|
|
|
} |
|
5319
|
|
|
|
|
|
|
// Ensure PRNG is seeded |
|
5320
|
11
|
50
|
|
|
|
|
AUTO_SEED_PRNG(); |
|
5321
|
11
|
|
|
|
|
|
AV *restrict results = newAV(); |
|
5322
|
11
|
50
|
|
|
|
|
if (n > 0) { |
|
5323
|
11
|
|
|
|
|
|
av_extend(results, n - 1); |
|
5324
|
|
|
|
|
|
|
} |
|
5325
|
11
|
|
|
|
|
|
const NV range = max - min; |
|
5326
|
20090
|
100
|
|
|
|
|
for (size_t j = 0; j < n; j++) { |
|
5327
|
|
|
|
|
|
|
double r; |
|
5328
|
20079
|
50
|
|
|
|
|
if (max < min) { |
|
5329
|
0
|
|
|
|
|
|
r = NAN; // R behavior for inverted ranges |
|
5330
|
|
|
|
|
|
|
} else { |
|
5331
|
20079
|
|
|
|
|
|
r = min + range * Drand01(); |
|
5332
|
|
|
|
|
|
|
} |
|
5333
|
20079
|
|
|
|
|
|
av_push(results, newSVnv(r)); |
|
5334
|
|
|
|
|
|
|
} |
|
5335
|
11
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)results); |
|
5336
|
|
|
|
|
|
|
} |
|
5337
|
|
|
|
|
|
|
OUTPUT: |
|
5338
|
|
|
|
|
|
|
RETVAL |
|
5339
|
|
|
|
|
|
|
|
|
5340
|
|
|
|
|
|
|
SV* rbinom(...) |
|
5341
|
|
|
|
|
|
|
CODE: |
|
5342
|
|
|
|
|
|
|
{ |
|
5343
|
|
|
|
|
|
|
// Auto-seed the PRNG if the Perl script hasn't done so yet |
|
5344
|
12
|
50
|
|
|
|
|
AUTO_SEED_PRNG(); |
|
5345
|
12
|
100
|
|
|
|
|
if (items % 2 != 0) |
|
5346
|
1
|
|
|
|
|
|
croak("Usage: rbinom(n => 10, size => 100, prob => 0.5)"); |
|
5347
|
|
|
|
|
|
|
//Parse named arguments |
|
5348
|
11
|
|
|
|
|
|
size_t n = 0, size = 0; |
|
5349
|
11
|
|
|
|
|
|
NV prob = 0.5; |
|
5350
|
|
|
|
|
|
|
|
|
5351
|
11
|
|
|
|
|
|
bool size_set = FALSE, prob_set = FALSE; |
|
5352
|
|
|
|
|
|
|
|
|
5353
|
42
|
100
|
|
|
|
|
for (unsigned short i = 0; i < items; i += 2) { |
|
5354
|
31
|
|
|
|
|
|
const char* restrict key = SvPV_nolen(ST(i)); |
|
5355
|
31
|
|
|
|
|
|
SV* restrict val = ST(i + 1); |
|
5356
|
|
|
|
|
|
|
|
|
5357
|
31
|
100
|
|
|
|
|
if (strEQ(key, "n")) n = (unsigned int)SvUV(val); |
|
5358
|
20
|
100
|
|
|
|
|
else if (strEQ(key, "size")) { size = (unsigned int)SvUV(val); size_set = TRUE; } |
|
5359
|
10
|
50
|
|
|
|
|
else if (strEQ(key, "prob")) { prob = SvNV(val); prob_set = TRUE; } |
|
5360
|
0
|
|
|
|
|
|
else croak("rbinom: unknown argument '%s'", key); |
|
5361
|
|
|
|
|
|
|
} |
|
5362
|
|
|
|
|
|
|
|
|
5363
|
|
|
|
|
|
|
// R requires size and prob to be explicitly passed in rbinom |
|
5364
|
11
|
100
|
|
|
|
|
if (!size_set || !prob_set) croak("rbinom: 'size' and 'prob' are required arguments"); |
|
|
|
100
|
|
|
|
|
|
|
5365
|
9
|
100
|
|
|
|
|
if (prob < 0.0 || prob > 1.0) croak("rbinom: prob must be between 0 and 1"); |
|
|
|
100
|
|
|
|
|
|
|
5366
|
|
|
|
|
|
|
|
|
5367
|
7
|
|
|
|
|
|
AV *restrict result_av = newAV(); |
|
5368
|
7
|
50
|
|
|
|
|
if (n > 0) { |
|
5369
|
7
|
|
|
|
|
|
av_extend(result_av, n - 1); |
|
5370
|
20506
|
100
|
|
|
|
|
for (unsigned int i = 0; i < n; i++) { |
|
5371
|
20499
|
|
|
|
|
|
av_store(result_av, i, newSVuv(generate_binomial(aTHX_ size, prob))); |
|
5372
|
|
|
|
|
|
|
} |
|
5373
|
|
|
|
|
|
|
} |
|
5374
|
|
|
|
|
|
|
|
|
5375
|
7
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)result_av); |
|
5376
|
|
|
|
|
|
|
} |
|
5377
|
|
|
|
|
|
|
OUTPUT: |
|
5378
|
|
|
|
|
|
|
RETVAL |
|
5379
|
|
|
|
|
|
|
|
|
5380
|
|
|
|
|
|
|
SV* hist(SV* x_sv, ...) |
|
5381
|
|
|
|
|
|
|
CODE: |
|
5382
|
|
|
|
|
|
|
{ |
|
5383
|
|
|
|
|
|
|
// 1. Validate Input |
|
5384
|
9
|
100
|
|
|
|
|
if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
100
|
|
|
|
|
|
|
5385
|
2
|
|
|
|
|
|
croak("hist: first argument must be an array reference"); |
|
5386
|
|
|
|
|
|
|
|
|
5387
|
7
|
|
|
|
|
|
AV*restrict x_av = (AV*)SvRV(x_sv); |
|
5388
|
7
|
|
|
|
|
|
size_t n_raw = av_len(x_av) + 1; |
|
5389
|
7
|
100
|
|
|
|
|
if (n_raw == 0) croak("hist: input array is empty"); |
|
5390
|
|
|
|
|
|
|
|
|
5391
|
|
|
|
|
|
|
// 2. Extract Data & Find Range |
|
5392
|
|
|
|
|
|
|
NV *restrict x; |
|
5393
|
6
|
50
|
|
|
|
|
Newx(x, n_raw, double); |
|
5394
|
6
|
|
|
|
|
|
size_t n = 0; |
|
5395
|
6
|
|
|
|
|
|
NV min_val = DBL_MAX, max_val = -DBL_MAX; |
|
5396
|
|
|
|
|
|
|
|
|
5397
|
2026
|
100
|
|
|
|
|
for (size_t i = 0; i < n_raw; i++) { |
|
5398
|
2021
|
|
|
|
|
|
SV**restrict tv = av_fetch(x_av, i, 0); |
|
5399
|
2021
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
50
|
|
|
|
|
|
|
5400
|
2021
|
|
|
|
|
|
NV val = SvNV(*tv); |
|
5401
|
2020
|
|
|
|
|
|
x[n++] = val; |
|
5402
|
2020
|
100
|
|
|
|
|
if (val < min_val) min_val = val; |
|
5403
|
2020
|
100
|
|
|
|
|
if (val > max_val) max_val = val; |
|
5404
|
|
|
|
|
|
|
} |
|
5405
|
|
|
|
|
|
|
} |
|
5406
|
5
|
50
|
|
|
|
|
if (n == 0) { |
|
5407
|
0
|
|
|
|
|
|
Safefree(x); |
|
5408
|
0
|
|
|
|
|
|
croak("hist: input contains no valid numeric data"); |
|
5409
|
|
|
|
|
|
|
} |
|
5410
|
|
|
|
|
|
|
// 3. Determine Bin Count (Sturges default or user-provided) |
|
5411
|
5
|
|
|
|
|
|
size_t n_bins = 0; |
|
5412
|
5
|
50
|
|
|
|
|
if (items == 2) { |
|
5413
|
|
|
|
|
|
|
// Support pure positional argument: hist($data, 22) |
|
5414
|
0
|
|
|
|
|
|
n_bins = (size_t)SvIV(ST(1)); |
|
5415
|
5
|
50
|
|
|
|
|
} else if (items > 2) { |
|
5416
|
|
|
|
|
|
|
// Support named parameters even if mixed with positional arguments |
|
5417
|
5
|
50
|
|
|
|
|
for (unsigned short i = 1; i < items - 1; i++) { |
|
5418
|
|
|
|
|
|
|
// Make sure the SV holds a string before doing string comparison |
|
5419
|
5
|
50
|
|
|
|
|
if (SvPOK(ST(i)) && strEQ(SvPV_nolen(ST(i)), "breaks")) { |
|
|
|
50
|
|
|
|
|
|
|
5420
|
5
|
|
|
|
|
|
n_bins = (size_t)SvIV(ST(i+1)); |
|
5421
|
5
|
|
|
|
|
|
break; |
|
5422
|
|
|
|
|
|
|
} |
|
5423
|
|
|
|
|
|
|
} |
|
5424
|
|
|
|
|
|
|
/* Fallback: if 'breaks' wasn't found but a positional number was given first */ |
|
5425
|
5
|
50
|
|
|
|
|
if (n_bins == 0 && looks_like_number(ST(1))) { |
|
|
|
0
|
|
|
|
|
|
|
5426
|
0
|
|
|
|
|
|
n_bins = (size_t)SvIV(ST(1)); |
|
5427
|
|
|
|
|
|
|
} |
|
5428
|
|
|
|
|
|
|
} |
|
5429
|
5
|
50
|
|
|
|
|
if (n_bins == 0) n_bins = calculate_sturges_bins(n); |
|
5430
|
|
|
|
|
|
|
// 4. Allocate Result Arrays |
|
5431
|
|
|
|
|
|
|
NV *restrict breaks, *restrict mids, *restrict density; |
|
5432
|
|
|
|
|
|
|
size_t *restrict counts; |
|
5433
|
5
|
50
|
|
|
|
|
Newx(breaks, n_bins + 1, double); |
|
5434
|
5
|
50
|
|
|
|
|
Newx(mids, n_bins, double); |
|
5435
|
5
|
50
|
|
|
|
|
Newx(density, n_bins, double); |
|
5436
|
5
|
50
|
|
|
|
|
Newx(counts, n_bins, size_t); |
|
5437
|
|
|
|
|
|
|
// Generate simple linear breaks |
|
5438
|
5
|
|
|
|
|
|
NV step = (max_val - min_val) / (double)n_bins; |
|
5439
|
28
|
100
|
|
|
|
|
for (size_t i = 0; i <= n_bins; i++) { |
|
5440
|
23
|
|
|
|
|
|
breaks[i] = min_val + (double)i * step; |
|
5441
|
|
|
|
|
|
|
} |
|
5442
|
|
|
|
|
|
|
// 5. Compute Statistics |
|
5443
|
5
|
|
|
|
|
|
compute_hist_logic(x, n, breaks, n_bins, counts, mids, density); |
|
5444
|
|
|
|
|
|
|
// 6. Build Return HashRef |
|
5445
|
5
|
|
|
|
|
|
HV*restrict res_hv = newHV(); |
|
5446
|
5
|
|
|
|
|
|
AV*restrict av_breaks = newAV(); |
|
5447
|
5
|
|
|
|
|
|
AV*restrict av_counts = newAV(); |
|
5448
|
5
|
|
|
|
|
|
AV*restrict av_mids = newAV(); |
|
5449
|
5
|
|
|
|
|
|
AV*restrict av_density = newAV(); |
|
5450
|
28
|
100
|
|
|
|
|
for (size_t i = 0; i <= n_bins; i++) { |
|
5451
|
23
|
|
|
|
|
|
av_push(av_breaks, newSVnv(breaks[i])); |
|
5452
|
23
|
100
|
|
|
|
|
if (i < n_bins) { |
|
5453
|
18
|
|
|
|
|
|
av_push(av_counts, newSViv(counts[i])); |
|
5454
|
18
|
|
|
|
|
|
av_push(av_mids, newSVnv(mids[i])); |
|
5455
|
18
|
|
|
|
|
|
av_push(av_density, newSVnv(density[i])); |
|
5456
|
|
|
|
|
|
|
} |
|
5457
|
|
|
|
|
|
|
} |
|
5458
|
5
|
|
|
|
|
|
hv_stores(res_hv, "breaks", newRV_noinc((SV*)av_breaks)); |
|
5459
|
5
|
|
|
|
|
|
hv_stores(res_hv, "counts", newRV_noinc((SV*)av_counts)); |
|
5460
|
5
|
|
|
|
|
|
hv_stores(res_hv, "mids", newRV_noinc((SV*)av_mids)); |
|
5461
|
5
|
|
|
|
|
|
hv_stores(res_hv, "density", newRV_noinc((SV*)av_density)); |
|
5462
|
|
|
|
|
|
|
// Clean |
|
5463
|
5
|
|
|
|
|
|
Safefree(x); Safefree(breaks); Safefree(mids); |
|
5464
|
5
|
|
|
|
|
|
Safefree(density); Safefree(counts); |
|
5465
|
5
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res_hv); |
|
5466
|
|
|
|
|
|
|
} |
|
5467
|
|
|
|
|
|
|
OUTPUT: |
|
5468
|
|
|
|
|
|
|
RETVAL |
|
5469
|
|
|
|
|
|
|
|
|
5470
|
|
|
|
|
|
|
SV* quantile(...) |
|
5471
|
|
|
|
|
|
|
CODE: |
|
5472
|
|
|
|
|
|
|
{ |
|
5473
|
11
|
|
|
|
|
|
SV *restrict x_sv = NULL; |
|
5474
|
11
|
|
|
|
|
|
SV *restrict probs_sv = NULL; |
|
5475
|
11
|
|
|
|
|
|
unsigned int arg_idx = 0; |
|
5476
|
|
|
|
|
|
|
// --- 1. Consume first positional arg as 'x' if it's an array ref |
|
5477
|
11
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5478
|
10
|
|
|
|
|
|
x_sv = ST(arg_idx); |
|
5479
|
10
|
|
|
|
|
|
arg_idx++; |
|
5480
|
|
|
|
|
|
|
} |
|
5481
|
|
|
|
|
|
|
// --- 2. Remaining args must be key-value pairs |
|
5482
|
11
|
50
|
|
|
|
|
if ((items - arg_idx) % 2 != 0) |
|
5483
|
0
|
|
|
|
|
|
croak("Usage: quantile(\\@data, probs => \\@probs) OR quantile(x => \\@data, probs => \\@probs)"); |
|
5484
|
|
|
|
|
|
|
|
|
5485
|
23
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
5486
|
12
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(arg_idx)); |
|
5487
|
12
|
|
|
|
|
|
SV *restrict val = ST(arg_idx + 1); |
|
5488
|
|
|
|
|
|
|
|
|
5489
|
12
|
100
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
5490
|
11
|
50
|
|
|
|
|
else if (strEQ(key, "probs")) probs_sv = val; |
|
5491
|
0
|
|
|
|
|
|
else croak("quantile: unknown argument '%s'", key); |
|
5492
|
|
|
|
|
|
|
} |
|
5493
|
11
|
50
|
|
|
|
|
if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5494
|
0
|
|
|
|
|
|
croak("quantile: 'x' must be an array reference"); |
|
5495
|
|
|
|
|
|
|
|
|
5496
|
11
|
|
|
|
|
|
AV *restrict x_av = (AV*)SvRV(x_sv); |
|
5497
|
11
|
|
|
|
|
|
size_t n_raw = av_len(x_av) + 1; |
|
5498
|
11
|
50
|
|
|
|
|
if (n_raw == 0) croak("quantile: 'x' is empty"); |
|
5499
|
|
|
|
|
|
|
// --- Extract valid numeric data & drop NAs (Upgraded to NV) |
|
5500
|
|
|
|
|
|
|
NV *restrict x; |
|
5501
|
11
|
50
|
|
|
|
|
Newx(x, n_raw, NV); |
|
5502
|
11
|
|
|
|
|
|
size_t n = 0; |
|
5503
|
458
|
100
|
|
|
|
|
for (size_t i = 0; i < n_raw; i++) { |
|
5504
|
447
|
|
|
|
|
|
SV **restrict tv = av_fetch(x_av, i, 0); |
|
5505
|
447
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
50
|
|
|
|
|
|
|
5506
|
447
|
|
|
|
|
|
x[n++] = SvNV(*tv); |
|
5507
|
|
|
|
|
|
|
} |
|
5508
|
|
|
|
|
|
|
} |
|
5509
|
11
|
50
|
|
|
|
|
if (n == 0) { |
|
5510
|
0
|
|
|
|
|
|
Safefree(x); |
|
5511
|
0
|
|
|
|
|
|
croak("quantile: 'x' contains no valid numbers"); |
|
5512
|
|
|
|
|
|
|
} |
|
5513
|
|
|
|
|
|
|
// --- Sort Data for Quantile Math --- |
|
5514
|
|
|
|
|
|
|
// Note: You must update `compare_doubles` to accept and compare `NV` types! |
|
5515
|
11
|
|
|
|
|
|
qsort(x, n, sizeof(NV), compare_NVs); |
|
5516
|
|
|
|
|
|
|
// --- Parse Probabilities (Upgraded to NV) --- |
|
5517
|
11
|
|
|
|
|
|
NV default_probs[] = {0.0, 0.25, 0.50, 0.75, 1.0}; |
|
5518
|
11
|
|
|
|
|
|
unsigned int n_probs = 5; |
|
5519
|
|
|
|
|
|
|
NV *restrict probs; |
|
5520
|
22
|
50
|
|
|
|
|
if (probs_sv && SvROK(probs_sv) && SvTYPE(SvRV(probs_sv)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5521
|
11
|
|
|
|
|
|
AV *restrict p_av = (AV*)SvRV(probs_sv); |
|
5522
|
11
|
|
|
|
|
|
n_probs = av_len(p_av) + 1; |
|
5523
|
11
|
|
|
|
|
|
Newx(probs, n_probs, NV); |
|
5524
|
34
|
100
|
|
|
|
|
for (unsigned int i = 0; i < n_probs; i++) { |
|
5525
|
23
|
|
|
|
|
|
SV **tv = av_fetch(p_av, i, 0); |
|
5526
|
23
|
50
|
|
|
|
|
probs[i] = (tv && SvOK(*tv)) ? SvNV(*tv) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
5527
|
23
|
50
|
|
|
|
|
if (probs[i] < 0.0 || probs[i] > 1.0) { |
|
|
|
50
|
|
|
|
|
|
|
5528
|
0
|
|
|
|
|
|
Safefree(x); Safefree(probs); |
|
5529
|
0
|
|
|
|
|
|
croak("quantile: probabilities must be between 0 and 1"); |
|
5530
|
|
|
|
|
|
|
} |
|
5531
|
|
|
|
|
|
|
} |
|
5532
|
|
|
|
|
|
|
} else { |
|
5533
|
0
|
|
|
|
|
|
Newx(probs, n_probs, NV); |
|
5534
|
0
|
0
|
|
|
|
|
for (unsigned int i = 0; i < n_probs; i++) probs[i] = default_probs[i]; |
|
5535
|
|
|
|
|
|
|
} |
|
5536
|
|
|
|
|
|
|
// --- Calculate Quantiles (R Type 7 Algorithm) --- |
|
5537
|
11
|
|
|
|
|
|
HV *restrict res_hv = newHV(); |
|
5538
|
34
|
100
|
|
|
|
|
for (size_t i = 0; i < n_probs; i++) { |
|
5539
|
23
|
|
|
|
|
|
NV p = probs[i]; |
|
5540
|
23
|
|
|
|
|
|
NV q = 0.0; |
|
5541
|
|
|
|
|
|
|
|
|
5542
|
23
|
100
|
|
|
|
|
if (n == 1) { |
|
5543
|
1
|
|
|
|
|
|
q = x[0]; |
|
5544
|
22
|
100
|
|
|
|
|
} else if (p == 1.0) { |
|
5545
|
1
|
|
|
|
|
|
q = x[n - 1]; |
|
5546
|
21
|
100
|
|
|
|
|
} else if (p == 0.0) { |
|
5547
|
1
|
|
|
|
|
|
q = x[0]; |
|
5548
|
|
|
|
|
|
|
} else { |
|
5549
|
20
|
|
|
|
|
|
NV h = (n - 1) * p; |
|
5550
|
20
|
|
|
|
|
|
unsigned int j = (unsigned int)h; |
|
5551
|
20
|
|
|
|
|
|
NV gamma = h - j; |
|
5552
|
20
|
|
|
|
|
|
q = (1.0 - gamma) * x[j] + gamma * x[j + 1]; |
|
5553
|
|
|
|
|
|
|
} |
|
5554
|
|
|
|
|
|
|
// --- Format hash key with Epsilon guarding --- |
|
5555
|
|
|
|
|
|
|
char key[32]; |
|
5556
|
23
|
|
|
|
|
|
double pct = (double)(p * 100.0); // Safe to cast to double just for formatting |
|
5557
|
23
|
|
|
|
|
|
double pct_rounded = floor(pct + 0.5); // C89 safe rounding |
|
5558
|
|
|
|
|
|
|
// Use 1e-9 epsilon check instead of strict integer equality |
|
5559
|
23
|
50
|
|
|
|
|
if (fabs(pct - pct_rounded) < 1e-9) { |
|
5560
|
23
|
|
|
|
|
|
snprintf(key, sizeof(key), "%.0f%%", pct_rounded); |
|
5561
|
|
|
|
|
|
|
} else { |
|
5562
|
0
|
|
|
|
|
|
snprintf(key, sizeof(key), "%.1f%%", pct); |
|
5563
|
|
|
|
|
|
|
} |
|
5564
|
|
|
|
|
|
|
|
|
5565
|
23
|
|
|
|
|
|
hv_store(res_hv, key, strlen(key), newSVnv(q), 0); |
|
5566
|
|
|
|
|
|
|
} |
|
5567
|
11
|
|
|
|
|
|
Safefree(x); Safefree(probs); |
|
5568
|
11
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res_hv); |
|
5569
|
|
|
|
|
|
|
} |
|
5570
|
|
|
|
|
|
|
OUTPUT: |
|
5571
|
|
|
|
|
|
|
RETVAL |
|
5572
|
|
|
|
|
|
|
|
|
5573
|
|
|
|
|
|
|
double mean(...) |
|
5574
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5575
|
|
|
|
|
|
|
INIT: |
|
5576
|
48
|
|
|
|
|
|
NV total = 0; |
|
5577
|
48
|
|
|
|
|
|
size_t count = 0; |
|
5578
|
|
|
|
|
|
|
CODE: |
|
5579
|
107
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
5580
|
61
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
5581
|
105
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5582
|
45
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
5583
|
45
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5584
|
20557
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5585
|
20513
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
5586
|
20513
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
5587
|
20512
|
|
|
|
|
|
total += SvNV(*tv); |
|
5588
|
20512
|
|
|
|
|
|
count++; |
|
5589
|
|
|
|
|
|
|
} else { |
|
5590
|
1
|
|
|
|
|
|
croak("mean: undefined value at array ref index %zu (argument %zu)", j, i); |
|
5591
|
|
|
|
|
|
|
} |
|
5592
|
|
|
|
|
|
|
} |
|
5593
|
16
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5594
|
15
|
|
|
|
|
|
total += SvNV(arg); |
|
5595
|
15
|
|
|
|
|
|
count++; |
|
5596
|
|
|
|
|
|
|
} else { |
|
5597
|
1
|
|
|
|
|
|
croak("mean: undefined value at argument index %zu", i); |
|
5598
|
|
|
|
|
|
|
} |
|
5599
|
|
|
|
|
|
|
} |
|
5600
|
46
|
100
|
|
|
|
|
if (count == 0) croak("mean needs >= 1 element"); |
|
5601
|
45
|
100
|
|
|
|
|
RETVAL = total / count; |
|
5602
|
|
|
|
|
|
|
OUTPUT: |
|
5603
|
|
|
|
|
|
|
RETVAL |
|
5604
|
|
|
|
|
|
|
|
|
5605
|
|
|
|
|
|
|
void mode(...) |
|
5606
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5607
|
|
|
|
|
|
|
PREINIT: |
|
5608
|
|
|
|
|
|
|
HV *restrict counts; |
|
5609
|
|
|
|
|
|
|
HV *restrict originals; |
|
5610
|
5
|
|
|
|
|
|
size_t max_count = 0, arg_count = 0; |
|
5611
|
|
|
|
|
|
|
HE *restrict he; |
|
5612
|
|
|
|
|
|
|
PPCODE: |
|
5613
|
|
|
|
|
|
|
/* counts: string(value) -> occurrence count */ |
|
5614
|
|
|
|
|
|
|
/* originals: string(value) -> SV* first-seen original */ |
|
5615
|
5
|
|
|
|
|
|
counts = (HV *)sv_2mortal((SV *)newHV()); |
|
5616
|
5
|
|
|
|
|
|
originals = (HV *)sv_2mortal((SV *)newHV()); |
|
5617
|
|
|
|
|
|
|
|
|
5618
|
16
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
5619
|
12
|
|
|
|
|
|
SV *restrict arg = ST(i); |
|
5620
|
13
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5621
|
1
|
|
|
|
|
|
AV *restrict av = (AV *)SvRV(arg); |
|
5622
|
1
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5623
|
5
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5624
|
4
|
|
|
|
|
|
SV **restrict tv = av_fetch(av, j, 0); |
|
5625
|
4
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
50
|
|
|
|
|
|
|
5626
|
|
|
|
|
|
|
STRLEN klen; |
|
5627
|
4
|
|
|
|
|
|
const char *restrict key = SvPV(*tv, klen); |
|
5628
|
4
|
|
|
|
|
|
SV **restrict slot = hv_fetch(counts, key, klen, 1); |
|
5629
|
4
|
50
|
|
|
|
|
if (!slot) croak("mode: internal hash error"); |
|
5630
|
4
|
100
|
|
|
|
|
size_t cnt = SvOK(*slot) ? SvIV(*slot) + 1 : 1; |
|
5631
|
4
|
|
|
|
|
|
sv_setiv(*slot, cnt); |
|
5632
|
4
|
100
|
|
|
|
|
if (cnt > max_count) max_count = cnt; |
|
5633
|
4
|
100
|
|
|
|
|
if (cnt == 1) |
|
5634
|
2
|
|
|
|
|
|
hv_store(originals, key, klen, newSVsv(*tv), 0); |
|
5635
|
4
|
|
|
|
|
|
arg_count++; |
|
5636
|
|
|
|
|
|
|
} else { |
|
5637
|
0
|
|
|
|
|
|
croak("mode: undefined value at array ref index %zu (argument %zu)", j, i); |
|
5638
|
|
|
|
|
|
|
} |
|
5639
|
|
|
|
|
|
|
} |
|
5640
|
11
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5641
|
|
|
|
|
|
|
STRLEN klen; |
|
5642
|
10
|
|
|
|
|
|
const char *restrict key = SvPV(arg, klen); |
|
5643
|
10
|
|
|
|
|
|
SV **restrict slot = hv_fetch(counts, key, klen, 1); |
|
5644
|
10
|
50
|
|
|
|
|
if (!slot) croak("mode: internal hash error"); |
|
5645
|
10
|
100
|
|
|
|
|
size_t cnt = SvOK(*slot) ? SvIV(*slot) + 1 : 1; |
|
5646
|
10
|
|
|
|
|
|
sv_setiv(*slot, cnt); |
|
5647
|
10
|
100
|
|
|
|
|
if (cnt > max_count) max_count = cnt; |
|
5648
|
10
|
100
|
|
|
|
|
if (cnt == 1) |
|
5649
|
6
|
|
|
|
|
|
hv_store(originals, key, klen, newSVsv(arg), 0); |
|
5650
|
10
|
|
|
|
|
|
arg_count++; |
|
5651
|
|
|
|
|
|
|
} else { |
|
5652
|
1
|
|
|
|
|
|
croak("mode: undefined value at argument index %zu", i); |
|
5653
|
|
|
|
|
|
|
} |
|
5654
|
|
|
|
|
|
|
} |
|
5655
|
|
|
|
|
|
|
|
|
5656
|
4
|
100
|
|
|
|
|
if (arg_count == 0) |
|
5657
|
1
|
|
|
|
|
|
croak("mode needs >= 1 element"); |
|
5658
|
|
|
|
|
|
|
|
|
5659
|
3
|
|
|
|
|
|
hv_iterinit(counts); |
|
5660
|
13
|
100
|
|
|
|
|
while ((he = hv_iternext(counts))) { |
|
5661
|
7
|
100
|
|
|
|
|
if (SvIV(hv_iterval(counts, he)) == max_count) { |
|
5662
|
|
|
|
|
|
|
STRLEN klen; |
|
5663
|
4
|
50
|
|
|
|
|
const char *restrict key = HePV(he, klen); |
|
5664
|
4
|
|
|
|
|
|
SV **restrict orig = hv_fetch(originals, key, klen, 0); |
|
5665
|
4
|
50
|
|
|
|
|
mXPUSHs(orig ? newSVsv(*orig) : newSVpvn(key, klen)); |
|
|
|
50
|
|
|
|
|
|
|
5666
|
|
|
|
|
|
|
} |
|
5667
|
|
|
|
|
|
|
} |
|
5668
|
|
|
|
|
|
|
|
|
5669
|
|
|
|
|
|
|
double sum(...) |
|
5670
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5671
|
|
|
|
|
|
|
INIT: |
|
5672
|
5
|
|
|
|
|
|
NV total = 0; |
|
5673
|
5
|
|
|
|
|
|
size_t count = 0; |
|
5674
|
|
|
|
|
|
|
CODE: |
|
5675
|
19
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
5676
|
16
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
5677
|
17
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5678
|
2
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
5679
|
2
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5680
|
11
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5681
|
10
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
5682
|
10
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
5683
|
9
|
|
|
|
|
|
total += SvNV(*tv); |
|
5684
|
9
|
|
|
|
|
|
count++; |
|
5685
|
|
|
|
|
|
|
} else { |
|
5686
|
1
|
|
|
|
|
|
croak("sum: undefined value at array ref index %zu (argument %zu)", j, i); |
|
5687
|
|
|
|
|
|
|
} |
|
5688
|
|
|
|
|
|
|
} |
|
5689
|
14
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5690
|
13
|
|
|
|
|
|
total += SvNV(arg); |
|
5691
|
13
|
|
|
|
|
|
count++; |
|
5692
|
|
|
|
|
|
|
} else { |
|
5693
|
1
|
|
|
|
|
|
croak("sum: undefined value at argument index %zu", i); |
|
5694
|
|
|
|
|
|
|
} |
|
5695
|
|
|
|
|
|
|
} |
|
5696
|
3
|
50
|
|
|
|
|
if (count == 0) croak("sum needs >= 1 element"); |
|
5697
|
3
|
100
|
|
|
|
|
RETVAL = total; |
|
5698
|
|
|
|
|
|
|
OUTPUT: |
|
5699
|
|
|
|
|
|
|
RETVAL |
|
5700
|
|
|
|
|
|
|
|
|
5701
|
|
|
|
|
|
|
double sd(...) |
|
5702
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5703
|
|
|
|
|
|
|
INIT: |
|
5704
|
23
|
|
|
|
|
|
NV mean = 0.0, M2 = 0.0; |
|
5705
|
23
|
|
|
|
|
|
size_t count = 0; |
|
5706
|
|
|
|
|
|
|
CODE: |
|
5707
|
|
|
|
|
|
|
/* Single Pass Standard Deviation via Welford's Algorithm */ |
|
5708
|
58
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
5709
|
37
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
5710
|
54
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5711
|
18
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
5712
|
18
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5713
|
10086
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5714
|
10069
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
5715
|
10069
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
5716
|
10068
|
|
|
|
|
|
count++; |
|
5717
|
10068
|
|
|
|
|
|
double val = SvNV(*tv); |
|
5718
|
10068
|
|
|
|
|
|
double delta = val - mean; |
|
5719
|
10068
|
|
|
|
|
|
mean += delta / count; |
|
5720
|
10068
|
|
|
|
|
|
M2 += delta * (val - mean); |
|
5721
|
|
|
|
|
|
|
} else { |
|
5722
|
1
|
|
|
|
|
|
croak("sd: undefined value at array ref index %zu (argument %zu)", j, i); |
|
5723
|
|
|
|
|
|
|
} |
|
5724
|
|
|
|
|
|
|
} |
|
5725
|
19
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5726
|
18
|
|
|
|
|
|
count++; |
|
5727
|
18
|
|
|
|
|
|
NV val = SvNV(arg); |
|
5728
|
18
|
|
|
|
|
|
NV delta = val - mean; |
|
5729
|
18
|
|
|
|
|
|
mean += delta / count; |
|
5730
|
18
|
|
|
|
|
|
M2 += delta * (val - mean); |
|
5731
|
|
|
|
|
|
|
} else { |
|
5732
|
1
|
|
|
|
|
|
croak("sd: undefined value at argument index %zu", i); |
|
5733
|
|
|
|
|
|
|
} |
|
5734
|
|
|
|
|
|
|
} |
|
5735
|
21
|
100
|
|
|
|
|
if (count < 2) croak("sd needs >= 2 elements"); |
|
5736
|
20
|
100
|
|
|
|
|
RETVAL = sqrt(M2 / (count - 1)); |
|
5737
|
|
|
|
|
|
|
OUTPUT: |
|
5738
|
|
|
|
|
|
|
RETVAL |
|
5739
|
|
|
|
|
|
|
|
|
5740
|
|
|
|
|
|
|
double var(...) |
|
5741
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
5742
|
|
|
|
|
|
|
INIT: |
|
5743
|
8
|
|
|
|
|
|
NV mean = 0.0, M2 = 0.0; |
|
5744
|
8
|
|
|
|
|
|
size_t count = 0; |
|
5745
|
|
|
|
|
|
|
CODE: |
|
5746
|
|
|
|
|
|
|
// Single Pass Variance via Welford's Algorithm |
|
5747
|
21
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
5748
|
15
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
5749
|
18
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5750
|
4
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
5751
|
4
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
5752
|
10015
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
5753
|
10012
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
5754
|
10012
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
5755
|
10011
|
|
|
|
|
|
count++; |
|
5756
|
10011
|
|
|
|
|
|
NV val = SvNV(*tv); |
|
5757
|
10011
|
|
|
|
|
|
NV delta = val - mean; |
|
5758
|
10011
|
|
|
|
|
|
mean += delta / count; |
|
5759
|
10011
|
|
|
|
|
|
M2 += delta * (val - mean); |
|
5760
|
|
|
|
|
|
|
} else { |
|
5761
|
1
|
|
|
|
|
|
croak("var: undefined value at array ref index %zu (argument %zu)", j, i); |
|
5762
|
|
|
|
|
|
|
} |
|
5763
|
|
|
|
|
|
|
} |
|
5764
|
11
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
5765
|
10
|
|
|
|
|
|
count++; |
|
5766
|
10
|
|
|
|
|
|
NV val = SvNV(arg); |
|
5767
|
10
|
|
|
|
|
|
NV delta = val - mean; |
|
5768
|
10
|
|
|
|
|
|
mean += delta / count; |
|
5769
|
10
|
|
|
|
|
|
M2 += delta * (val - mean); |
|
5770
|
|
|
|
|
|
|
} else { |
|
5771
|
1
|
|
|
|
|
|
croak("var: undefined value at argument index %zu", i); |
|
5772
|
|
|
|
|
|
|
} |
|
5773
|
|
|
|
|
|
|
} |
|
5774
|
6
|
100
|
|
|
|
|
if (count < 2) croak("var needs >= 2 elements"); |
|
5775
|
5
|
100
|
|
|
|
|
RETVAL = M2 / (count - 1); |
|
5776
|
|
|
|
|
|
|
OUTPUT: |
|
5777
|
|
|
|
|
|
|
RETVAL |
|
5778
|
|
|
|
|
|
|
|
|
5779
|
|
|
|
|
|
|
SV* t_test(...) |
|
5780
|
|
|
|
|
|
|
CODE: |
|
5781
|
|
|
|
|
|
|
{ |
|
5782
|
53
|
|
|
|
|
|
SV*restrict x_sv = NULL; |
|
5783
|
53
|
|
|
|
|
|
SV*restrict y_sv = NULL; |
|
5784
|
53
|
|
|
|
|
|
NV mu = 0.0, conf_level = 0.95; |
|
5785
|
53
|
|
|
|
|
|
bool paired = FALSE, var_equal = FALSE; |
|
5786
|
53
|
|
|
|
|
|
const char*restrict alternative = "two.sided"; |
|
5787
|
53
|
|
|
|
|
|
unsigned short int arg_idx = 0; |
|
5788
|
|
|
|
|
|
|
// 1. Shift first positional argument as 'x' if it's an array reference |
|
5789
|
53
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5790
|
27
|
|
|
|
|
|
x_sv = ST(arg_idx); |
|
5791
|
27
|
|
|
|
|
|
arg_idx++; |
|
5792
|
|
|
|
|
|
|
} |
|
5793
|
|
|
|
|
|
|
// 2. Shift second positional argument as 'y' if it's an array reference |
|
5794
|
53
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5795
|
10
|
|
|
|
|
|
y_sv = ST(arg_idx); |
|
5796
|
10
|
|
|
|
|
|
arg_idx++; |
|
5797
|
|
|
|
|
|
|
} |
|
5798
|
|
|
|
|
|
|
// Ensure the remaining arguments form complete key-value pairs |
|
5799
|
53
|
50
|
|
|
|
|
if ((items - arg_idx) % 2 != 0) { |
|
5800
|
0
|
|
|
|
|
|
croak("Usage: t_test(\\@x, [\\@y], key => value, ...)"); |
|
5801
|
|
|
|
|
|
|
} |
|
5802
|
|
|
|
|
|
|
// --- Parse named arguments from the remaining flat stack --- |
|
5803
|
129
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
5804
|
76
|
|
|
|
|
|
const char*restrict key = SvPV_nolen(ST(arg_idx)); |
|
5805
|
76
|
|
|
|
|
|
SV*restrict val = ST(arg_idx + 1); |
|
5806
|
|
|
|
|
|
|
|
|
5807
|
76
|
100
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
5808
|
51
|
100
|
|
|
|
|
else if (strEQ(key, "y")) y_sv = val; |
|
5809
|
46
|
100
|
|
|
|
|
else if (strEQ(key, "mu")) mu = SvNV(val); |
|
5810
|
11
|
100
|
|
|
|
|
else if (strEQ(key, "paired")) paired = SvTRUE(val); |
|
5811
|
7
|
100
|
|
|
|
|
else if (strEQ(key, "var_equal")) var_equal = SvTRUE(val); |
|
5812
|
4
|
100
|
|
|
|
|
else if (strEQ(key, "conf_level")) conf_level = SvNV(val); |
|
5813
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val); |
|
5814
|
0
|
|
|
|
|
|
else croak("t_test: unknown argument '%s'", key); |
|
5815
|
|
|
|
|
|
|
} |
|
5816
|
|
|
|
|
|
|
|
|
5817
|
|
|
|
|
|
|
// --- Validate required / types --- |
|
5818
|
53
|
100
|
|
|
|
|
if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5819
|
1
|
|
|
|
|
|
croak("t_test: 'x' is a required argument and must be an ARRAY reference"); |
|
5820
|
52
|
|
|
|
|
|
AV*restrict x_av = (AV*)SvRV(x_sv); |
|
5821
|
52
|
|
|
|
|
|
size_t nx = av_len(x_av) + 1; |
|
5822
|
52
|
50
|
|
|
|
|
if (nx < 2) croak("t_test: 'x' needs at least 2 elements"); |
|
5823
|
52
|
|
|
|
|
|
AV*restrict y_av = NULL; |
|
5824
|
52
|
100
|
|
|
|
|
if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
5825
|
14
|
|
|
|
|
|
y_av = (AV*)SvRV(y_sv); |
|
5826
|
52
|
50
|
|
|
|
|
if (conf_level <= 0.0 || conf_level >= 1.0) |
|
|
|
100
|
|
|
|
|
|
|
5827
|
1
|
|
|
|
|
|
croak("t_test: 'conf_level' must be between 0 and 1"); |
|
5828
|
|
|
|
|
|
|
// --- Computation via Welford's Algorithm --- */ |
|
5829
|
51
|
|
|
|
|
|
NV mean_x = 0.0, M2_x = 0.0, var_x, t_stat, df, p_val, std_err, cint_est; |
|
5830
|
51
|
|
|
|
|
|
HV*restrict results = newHV(); |
|
5831
|
447
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
5832
|
396
|
|
|
|
|
|
SV**restrict tv = av_fetch(x_av, i, 0); |
|
5833
|
396
|
50
|
|
|
|
|
NV val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0; |
|
|
|
50
|
|
|
|
|
|
|
5834
|
396
|
|
|
|
|
|
NV delta = val - mean_x; |
|
5835
|
396
|
|
|
|
|
|
mean_x += delta / (i + 1); |
|
5836
|
396
|
|
|
|
|
|
M2_x += delta * (val - mean_x); |
|
5837
|
|
|
|
|
|
|
} |
|
5838
|
51
|
|
|
|
|
|
var_x = M2_x / (nx - 1); |
|
5839
|
51
|
100
|
|
|
|
|
if (var_x == 0.0 && !y_av) croak("t_test: data are essentially constant"); |
|
|
|
50
|
|
|
|
|
|
|
5840
|
|
|
|
|
|
|
|
|
5841
|
63
|
100
|
|
|
|
|
if (paired || y_av) { |
|
|
|
100
|
|
|
|
|
|
|
5842
|
15
|
100
|
|
|
|
|
if (!y_av) croak("t_test: 'y' must be provided for paired or two-sample tests"); |
|
5843
|
14
|
|
|
|
|
|
size_t ny = av_len(y_av) + 1; |
|
5844
|
14
|
100
|
|
|
|
|
if (paired && ny != nx) croak("t_test: Paired arrays must be same length"); |
|
|
|
100
|
|
|
|
|
|
|
5845
|
13
|
|
|
|
|
|
double mean_y = 0.0, M2_y = 0.0, var_y; |
|
5846
|
140
|
100
|
|
|
|
|
for (size_t i = 0; i < ny; i++) { |
|
5847
|
127
|
|
|
|
|
|
SV**restrict tv = av_fetch(y_av, i, 0); |
|
5848
|
127
|
50
|
|
|
|
|
NV val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0; |
|
|
|
50
|
|
|
|
|
|
|
5849
|
127
|
|
|
|
|
|
NV delta = val - mean_y; |
|
5850
|
127
|
|
|
|
|
|
mean_y += delta / (i + 1); |
|
5851
|
127
|
|
|
|
|
|
M2_y += delta * (val - mean_y); |
|
5852
|
|
|
|
|
|
|
} |
|
5853
|
13
|
|
|
|
|
|
var_y = M2_y / (ny - 1); |
|
5854
|
13
|
100
|
|
|
|
|
if (paired) { |
|
5855
|
2
|
|
|
|
|
|
double mean_d = 0.0, M2_d = 0.0; |
|
5856
|
14
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
5857
|
12
|
|
|
|
|
|
SV**restrict dx_ptr = av_fetch(x_av, i, 0); |
|
5858
|
12
|
|
|
|
|
|
SV**restrict dy_ptr = av_fetch(y_av, i, 0); |
|
5859
|
12
|
50
|
|
|
|
|
double dx = (dx_ptr && SvOK(*dx_ptr)) ? SvNV(*dx_ptr) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
5860
|
12
|
50
|
|
|
|
|
double dy = (dy_ptr && SvOK(*dy_ptr)) ? SvNV(*dy_ptr) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
5861
|
12
|
|
|
|
|
|
double val = dx - dy; |
|
5862
|
12
|
|
|
|
|
|
double delta = val - mean_d; |
|
5863
|
12
|
|
|
|
|
|
mean_d += delta / (i + 1); |
|
5864
|
12
|
|
|
|
|
|
M2_d += delta * (val - mean_d); |
|
5865
|
|
|
|
|
|
|
} |
|
5866
|
2
|
|
|
|
|
|
double var_d = M2_d / (nx - 1); |
|
5867
|
2
|
50
|
|
|
|
|
if (var_d == 0.0) croak("t_test: data are essentially constant"); |
|
5868
|
2
|
|
|
|
|
|
cint_est = mean_d; |
|
5869
|
2
|
|
|
|
|
|
std_err = sqrt(var_d / nx); |
|
5870
|
2
|
|
|
|
|
|
t_stat = (cint_est - mu) / std_err; |
|
5871
|
2
|
|
|
|
|
|
df = nx - 1; |
|
5872
|
2
|
|
|
|
|
|
hv_store(results, "estimate", 8, newSVnv(mean_d), 0); |
|
5873
|
11
|
100
|
|
|
|
|
} else if (var_equal) { |
|
5874
|
2
|
50
|
|
|
|
|
if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant"); |
|
|
|
0
|
|
|
|
|
|
|
5875
|
2
|
|
|
|
|
|
double pooled_var = ((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2); |
|
5876
|
2
|
|
|
|
|
|
cint_est = mean_x - mean_y; |
|
5877
|
2
|
|
|
|
|
|
std_err = sqrt(pooled_var * (1.0 / nx + 1.0 / ny)); |
|
5878
|
2
|
|
|
|
|
|
t_stat = (cint_est - mu) / std_err; |
|
5879
|
2
|
|
|
|
|
|
df = nx + ny - 2; |
|
5880
|
2
|
|
|
|
|
|
hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0); |
|
5881
|
2
|
|
|
|
|
|
hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0); |
|
5882
|
|
|
|
|
|
|
} else { |
|
5883
|
9
|
50
|
|
|
|
|
if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant"); |
|
|
|
0
|
|
|
|
|
|
|
5884
|
9
|
|
|
|
|
|
cint_est = mean_x - mean_y; |
|
5885
|
9
|
|
|
|
|
|
double stderr_x2 = var_x / nx; |
|
5886
|
9
|
|
|
|
|
|
double stderr_y2 = var_y / ny; |
|
5887
|
9
|
|
|
|
|
|
std_err = sqrt(stderr_x2 + stderr_y2); |
|
5888
|
9
|
|
|
|
|
|
t_stat = (cint_est - mu) / std_err; |
|
5889
|
9
|
|
|
|
|
|
df = pow(stderr_x2 + stderr_y2, 2) / |
|
5890
|
9
|
|
|
|
|
|
(pow(stderr_x2, 2) / (nx - 1) + pow(stderr_y2, 2) / (ny - 1)); |
|
5891
|
9
|
|
|
|
|
|
hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0); |
|
5892
|
9
|
|
|
|
|
|
hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0); |
|
5893
|
|
|
|
|
|
|
} |
|
5894
|
|
|
|
|
|
|
} else { |
|
5895
|
35
|
|
|
|
|
|
cint_est = mean_x; |
|
5896
|
35
|
|
|
|
|
|
std_err = sqrt(var_x / nx); |
|
5897
|
35
|
|
|
|
|
|
t_stat = (cint_est - mu) / std_err; |
|
5898
|
35
|
|
|
|
|
|
df = nx - 1; |
|
5899
|
35
|
|
|
|
|
|
hv_store(results, "estimate", 8, newSVnv(mean_x), 0); |
|
5900
|
|
|
|
|
|
|
} |
|
5901
|
48
|
|
|
|
|
|
p_val = get_t_pvalue(t_stat, df, alternative); |
|
5902
|
48
|
|
|
|
|
|
double alpha = 1.0 - conf_level, t_crit, ci_lower, ci_upper; |
|
5903
|
48
|
100
|
|
|
|
|
if (strcmp(alternative, "less") == 0) { |
|
5904
|
1
|
|
|
|
|
|
t_crit = qt_tail(df, alpha); |
|
5905
|
1
|
|
|
|
|
|
ci_lower = -INFINITY; |
|
5906
|
1
|
|
|
|
|
|
ci_upper = cint_est + t_crit * std_err; |
|
5907
|
47
|
100
|
|
|
|
|
} else if (strcmp(alternative, "greater") == 0) { |
|
5908
|
1
|
|
|
|
|
|
t_crit = qt_tail(df, alpha); |
|
5909
|
1
|
|
|
|
|
|
ci_lower = cint_est - t_crit * std_err; |
|
5910
|
1
|
|
|
|
|
|
ci_upper = INFINITY; |
|
5911
|
|
|
|
|
|
|
} else { |
|
5912
|
46
|
|
|
|
|
|
t_crit = qt_tail(df, alpha / 2.0); |
|
5913
|
46
|
|
|
|
|
|
ci_lower = cint_est - t_crit * std_err; |
|
5914
|
46
|
|
|
|
|
|
ci_upper = cint_est + t_crit * std_err; |
|
5915
|
|
|
|
|
|
|
} |
|
5916
|
48
|
|
|
|
|
|
AV*restrict conf_int = newAV(); |
|
5917
|
48
|
|
|
|
|
|
av_push(conf_int, newSVnv(ci_lower)); |
|
5918
|
48
|
|
|
|
|
|
av_push(conf_int, newSVnv(ci_upper)); |
|
5919
|
48
|
|
|
|
|
|
hv_store(results, "statistic", 9, newSVnv(t_stat), 0); |
|
5920
|
48
|
|
|
|
|
|
hv_store(results, "df", 2, newSVnv(df), 0); |
|
5921
|
48
|
|
|
|
|
|
hv_store(results, "p_value", 7, newSVnv(p_val), 0); |
|
5922
|
48
|
|
|
|
|
|
hv_store(results, "conf_int", 8, newRV_noinc((SV*)conf_int), 0); |
|
5923
|
48
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)results); |
|
5924
|
|
|
|
|
|
|
} |
|
5925
|
|
|
|
|
|
|
OUTPUT: |
|
5926
|
|
|
|
|
|
|
RETVAL |
|
5927
|
|
|
|
|
|
|
|
|
5928
|
|
|
|
|
|
|
void p_adjust(SV* p_sv, const char* method = "holm") |
|
5929
|
|
|
|
|
|
|
INIT: |
|
5930
|
15
|
100
|
|
|
|
|
if (!SvROK(p_sv) || SvTYPE(SvRV(p_sv)) != SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
5931
|
1
|
|
|
|
|
|
croak("p_adjust: first argument must be an ARRAY reference of p-values"); |
|
5932
|
|
|
|
|
|
|
} |
|
5933
|
14
|
|
|
|
|
|
AV *restrict p_av = (AV*)SvRV(p_sv); |
|
5934
|
14
|
|
|
|
|
|
size_t n = av_len(p_av) + 1; |
|
5935
|
|
|
|
|
|
|
// Handle empty input |
|
5936
|
14
|
100
|
|
|
|
|
if (n == 0) { |
|
5937
|
1
|
|
|
|
|
|
XSRETURN_EMPTY; |
|
5938
|
|
|
|
|
|
|
} |
|
5939
|
|
|
|
|
|
|
// Normalize method string |
|
5940
|
|
|
|
|
|
|
char meth[64]; |
|
5941
|
13
|
|
|
|
|
|
strncpy(meth, method, 63); meth[63] = '\0'; |
|
5942
|
157
|
100
|
|
|
|
|
for(unsigned short int i = 0; meth[i]; i++) meth[i] = tolower(meth[i]); |
|
5943
|
|
|
|
|
|
|
// Resolve aliases |
|
5944
|
13
|
100
|
|
|
|
|
if (strstr(meth, "benjamini") && strstr(meth, "hochberg")) strcpy(meth, "bh"); |
|
|
|
100
|
|
|
|
|
|
|
5945
|
13
|
100
|
|
|
|
|
if (strstr(meth, "benjamini") && strstr(meth, "yekutieli")) strcpy(meth, "by"); |
|
|
|
50
|
|
|
|
|
|
|
5946
|
13
|
50
|
|
|
|
|
if (strcmp(meth, "fdr") == 0) strcpy(meth, "bh"); |
|
5947
|
|
|
|
|
|
|
// Allocate C memory |
|
5948
|
|
|
|
|
|
|
PVal *restrict arr; |
|
5949
|
|
|
|
|
|
|
double *restrict adj; |
|
5950
|
13
|
50
|
|
|
|
|
Newx(arr, n, PVal); |
|
5951
|
13
|
50
|
|
|
|
|
Newx(adj, n, double); |
|
5952
|
|
|
|
|
|
|
|
|
5953
|
369
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
5954
|
356
|
|
|
|
|
|
SV**restrict tv = av_fetch(p_av, i, 0); |
|
5955
|
356
|
50
|
|
|
|
|
arr[i].p = (tv && SvOK(*tv)) ? SvNV(*tv) : 1.0; |
|
|
|
50
|
|
|
|
|
|
|
5956
|
356
|
|
|
|
|
|
arr[i].orig_idx = i; |
|
5957
|
|
|
|
|
|
|
} |
|
5958
|
|
|
|
|
|
|
// Sort ascending (Stable sort using original index) |
|
5959
|
13
|
|
|
|
|
|
qsort(arr, n, sizeof(PVal), cmp_pval); |
|
5960
|
|
|
|
|
|
|
PPCODE: |
|
5961
|
13
|
100
|
|
|
|
|
if (strcmp(meth, "bonferroni") == 0) { |
|
5962
|
53
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
5963
|
51
|
|
|
|
|
|
double v = arr[i].p * n; |
|
5964
|
51
|
100
|
|
|
|
|
adj[arr[i].orig_idx] = (v < 1.0) ? v : 1.0; |
|
5965
|
|
|
|
|
|
|
} |
|
5966
|
11
|
100
|
|
|
|
|
} else if (strcmp(meth, "holm") == 0) { |
|
5967
|
2
|
|
|
|
|
|
NV cummax = 0.0; |
|
5968
|
53
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
5969
|
51
|
|
|
|
|
|
double v = arr[i].p * (n - i); |
|
5970
|
51
|
100
|
|
|
|
|
if (v > cummax) cummax = v; |
|
5971
|
51
|
100
|
|
|
|
|
adj[arr[i].orig_idx] = (cummax < 1.0) ? cummax : 1.0; |
|
5972
|
|
|
|
|
|
|
} |
|
5973
|
9
|
100
|
|
|
|
|
} else if (strcmp(meth, "hochberg") == 0) { |
|
5974
|
2
|
|
|
|
|
|
NV cummin = 1.0; |
|
5975
|
53
|
100
|
|
|
|
|
for (ssize_t i = n - 1; i >= 0; i--) { |
|
5976
|
51
|
|
|
|
|
|
double v = arr[i].p * (n - i); |
|
5977
|
51
|
100
|
|
|
|
|
if (v < cummin) cummin = v; |
|
5978
|
51
|
50
|
|
|
|
|
adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0; |
|
5979
|
|
|
|
|
|
|
} |
|
5980
|
7
|
100
|
|
|
|
|
} else if (strcmp(meth, "bh") == 0) { |
|
5981
|
2
|
|
|
|
|
|
NV cummin = 1.0; |
|
5982
|
53
|
100
|
|
|
|
|
for (ssize_t i = n - 1; i >= 0; i--) { |
|
5983
|
51
|
|
|
|
|
|
double v = arr[i].p * n / (i + 1.0); |
|
5984
|
51
|
100
|
|
|
|
|
if (v < cummin) cummin = v; |
|
5985
|
51
|
50
|
|
|
|
|
adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0; |
|
5986
|
|
|
|
|
|
|
} |
|
5987
|
5
|
100
|
|
|
|
|
} else if (strcmp(meth, "by") == 0) { |
|
5988
|
2
|
|
|
|
|
|
NV q = 0.0; |
|
5989
|
53
|
100
|
|
|
|
|
for (size_t i = 1; i <= n; i++) q += 1.0 / i; |
|
5990
|
2
|
|
|
|
|
|
NV cummin = 1.0; |
|
5991
|
53
|
100
|
|
|
|
|
for (ssize_t i = n - 1; i >= 0; i--) { |
|
5992
|
51
|
|
|
|
|
|
double v = arr[i].p * n / (i + 1.0) * q; |
|
5993
|
51
|
100
|
|
|
|
|
if (v < cummin) cummin = v; |
|
5994
|
51
|
100
|
|
|
|
|
adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0; |
|
5995
|
|
|
|
|
|
|
} |
|
5996
|
3
|
100
|
|
|
|
|
} else if (strcmp(meth, "hommel") == 0) { |
|
5997
|
|
|
|
|
|
|
NV *restrict pa, *restrict q_arr; |
|
5998
|
2
|
50
|
|
|
|
|
Newx(pa, n, double); |
|
5999
|
2
|
50
|
|
|
|
|
Newx(q_arr, n, double); |
|
6000
|
|
|
|
|
|
|
// Initial: min(n * p[i] / (i + 1)) |
|
6001
|
2
|
|
|
|
|
|
double min_val = n * arr[0].p; |
|
6002
|
51
|
100
|
|
|
|
|
for (size_t i = 1; i < n; i++) { |
|
6003
|
49
|
|
|
|
|
|
double temp = (n * arr[i].p) / (i + 1.0); |
|
6004
|
49
|
50
|
|
|
|
|
if (temp < min_val) { |
|
6005
|
0
|
|
|
|
|
|
min_val = temp; |
|
6006
|
|
|
|
|
|
|
} |
|
6007
|
|
|
|
|
|
|
} |
|
6008
|
|
|
|
|
|
|
// pa <- q <- rep(min, n) |
|
6009
|
53
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6010
|
51
|
|
|
|
|
|
pa[i] = min_val; |
|
6011
|
51
|
|
|
|
|
|
q_arr[i] = min_val; |
|
6012
|
|
|
|
|
|
|
} |
|
6013
|
50
|
100
|
|
|
|
|
for (size_t j = n - 1; j >= 2; j--) { |
|
6014
|
48
|
|
|
|
|
|
ssize_t n_mj = n - j; // Max index for 'ij'. Length is n_mj + 1 |
|
6015
|
48
|
|
|
|
|
|
ssize_t i2_len = j - 1; // Length of 'i2 |
|
6016
|
|
|
|
|
|
|
// Calculate q1 = min(j * p[i2] / (2:j)) |
|
6017
|
48
|
|
|
|
|
|
double q1 = (j * arr[n_mj + 1].p) / 2.0; |
|
6018
|
1176
|
100
|
|
|
|
|
for (size_t k = 1; k < i2_len; k++) { |
|
6019
|
1128
|
|
|
|
|
|
double temp_q1 = (j * arr[n_mj + 1 + k].p) / (2.0 + k); |
|
6020
|
1128
|
100
|
|
|
|
|
if (temp_q1 < q1) { |
|
6021
|
266
|
|
|
|
|
|
q1 = temp_q1; |
|
6022
|
|
|
|
|
|
|
} |
|
6023
|
|
|
|
|
|
|
} |
|
6024
|
|
|
|
|
|
|
// q[ij] <- pmin(j * p[ij], q1) |
|
6025
|
1272
|
100
|
|
|
|
|
for (size_t i = 0; i <= n_mj; i++) { |
|
6026
|
1224
|
|
|
|
|
|
double v = j * arr[i].p; |
|
6027
|
1224
|
100
|
|
|
|
|
q_arr[i] = (v < q1) ? v : q1; |
|
6028
|
|
|
|
|
|
|
} |
|
6029
|
|
|
|
|
|
|
// q[i2] <- q[n - j] |
|
6030
|
1224
|
100
|
|
|
|
|
for (size_t i = 0; i < i2_len; i++) { |
|
6031
|
1176
|
|
|
|
|
|
q_arr[n_mj + 1 + i] = q_arr[n_mj]; |
|
6032
|
|
|
|
|
|
|
} |
|
6033
|
|
|
|
|
|
|
// pa <- pmax(pa, q) |
|
6034
|
2448
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6035
|
2400
|
100
|
|
|
|
|
if (pa[i] < q_arr[i]) { |
|
6036
|
1401
|
|
|
|
|
|
pa[i] = q_arr[i]; |
|
6037
|
|
|
|
|
|
|
} |
|
6038
|
|
|
|
|
|
|
} |
|
6039
|
|
|
|
|
|
|
} |
|
6040
|
|
|
|
|
|
|
// pmin(1, pmax(pa, p))[ro] — map sorted results back to original indices |
|
6041
|
53
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6042
|
51
|
100
|
|
|
|
|
NV v = (pa[i] > arr[i].p) ? pa[i] : arr[i].p; |
|
6043
|
51
|
50
|
|
|
|
|
if (v > 1.0) v = 1.0; |
|
6044
|
51
|
|
|
|
|
|
adj[arr[i].orig_idx] = v; |
|
6045
|
|
|
|
|
|
|
} |
|
6046
|
2
|
|
|
|
|
|
Safefree(pa); Safefree(q_arr); |
|
6047
|
1
|
50
|
|
|
|
|
} else if (strcmp(meth, "none") == 0) { |
|
6048
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6049
|
0
|
|
|
|
|
|
adj[arr[i].orig_idx] = arr[i].p; |
|
6050
|
|
|
|
|
|
|
} |
|
6051
|
|
|
|
|
|
|
} else { |
|
6052
|
1
|
|
|
|
|
|
Safefree(arr); Safefree(adj); |
|
6053
|
1
|
|
|
|
|
|
croak("Unknown p-value adjustment method: %s", method); |
|
6054
|
|
|
|
|
|
|
} |
|
6055
|
|
|
|
|
|
|
// Push values onto the Perl stack as a flat list |
|
6056
|
12
|
50
|
|
|
|
|
EXTEND(SP, n); |
|
6057
|
318
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6058
|
306
|
|
|
|
|
|
PUSHs(sv_2mortal(newSVnv(adj[i]))); |
|
6059
|
|
|
|
|
|
|
} |
|
6060
|
12
|
|
|
|
|
|
Safefree(arr); arr = NULL; |
|
6061
|
12
|
|
|
|
|
|
Safefree(adj); adj = NULL; |
|
6062
|
|
|
|
|
|
|
|
|
6063
|
|
|
|
|
|
|
double median(...) |
|
6064
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
6065
|
|
|
|
|
|
|
INIT: |
|
6066
|
15
|
|
|
|
|
|
size_t total_count = 0, k = 0; |
|
6067
|
|
|
|
|
|
|
NV* restrict nums; |
|
6068
|
15
|
|
|
|
|
|
NV median_val = 0.0; |
|
6069
|
|
|
|
|
|
|
CODE: |
|
6070
|
|
|
|
|
|
|
// Pass 1: Count valid elements — die immediately on any undef |
|
6071
|
32
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
6072
|
19
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
6073
|
30
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
6074
|
12
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
6075
|
12
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
6076
|
295
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
6077
|
284
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
6078
|
284
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
100
|
|
|
|
|
|
|
6079
|
283
|
|
|
|
|
|
total_count++; |
|
6080
|
|
|
|
|
|
|
} else { |
|
6081
|
1
|
|
|
|
|
|
croak("median: undefined value at array ref index %zu (argument %zu)", j, i); |
|
6082
|
|
|
|
|
|
|
} |
|
6083
|
|
|
|
|
|
|
} |
|
6084
|
7
|
100
|
|
|
|
|
} else if (SvOK(arg)) { |
|
6085
|
6
|
|
|
|
|
|
total_count++; |
|
6086
|
|
|
|
|
|
|
} else { |
|
6087
|
1
|
|
|
|
|
|
croak("median: undefined value at argument index %zu", i); |
|
6088
|
|
|
|
|
|
|
} |
|
6089
|
|
|
|
|
|
|
} |
|
6090
|
13
|
100
|
|
|
|
|
if (total_count == 0) croak("median needs >= 1 element"); |
|
6091
|
|
|
|
|
|
|
|
|
6092
|
|
|
|
|
|
|
/* Allocate C array now that we know the exact size */ |
|
6093
|
12
|
50
|
|
|
|
|
Newx(nums, total_count, double); |
|
6094
|
|
|
|
|
|
|
|
|
6095
|
|
|
|
|
|
|
/* Pass 2: Populate the C array — Safefree before any croak */ |
|
6096
|
27
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i++) { |
|
6097
|
15
|
|
|
|
|
|
SV* restrict arg = ST(i); |
|
6098
|
26
|
100
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
6099
|
11
|
|
|
|
|
|
AV* restrict av = (AV*)SvRV(arg); |
|
6100
|
11
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
6101
|
293
|
100
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
6102
|
282
|
|
|
|
|
|
SV** restrict tv = av_fetch(av, j, 0); |
|
6103
|
282
|
50
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
50
|
|
|
|
|
|
|
6104
|
282
|
|
|
|
|
|
nums[k++] = SvNV(*tv); |
|
6105
|
|
|
|
|
|
|
} else { |
|
6106
|
0
|
|
|
|
|
|
Safefree(nums); |
|
6107
|
0
|
|
|
|
|
|
croak("median: undefined value at array ref index %zu (argument %zu)", j, i); |
|
6108
|
|
|
|
|
|
|
} |
|
6109
|
|
|
|
|
|
|
} |
|
6110
|
4
|
50
|
|
|
|
|
} else if (SvOK(arg)) { |
|
6111
|
4
|
|
|
|
|
|
nums[k++] = SvNV(arg); |
|
6112
|
|
|
|
|
|
|
} else { |
|
6113
|
0
|
|
|
|
|
|
Safefree(nums); |
|
6114
|
0
|
|
|
|
|
|
croak("median: undefined value at argument index %zu", i); |
|
6115
|
|
|
|
|
|
|
} |
|
6116
|
|
|
|
|
|
|
} |
|
6117
|
|
|
|
|
|
|
/* Sort and calculate median */ |
|
6118
|
12
|
|
|
|
|
|
qsort(nums, total_count, sizeof(double), compare_doubles); |
|
6119
|
12
|
100
|
|
|
|
|
if (total_count % 2 == 0) { |
|
6120
|
4
|
|
|
|
|
|
median_val = (nums[total_count / 2 - 1] + nums[total_count / 2]) / 2.0; |
|
6121
|
|
|
|
|
|
|
} else { |
|
6122
|
8
|
|
|
|
|
|
median_val = nums[total_count / 2]; |
|
6123
|
|
|
|
|
|
|
} |
|
6124
|
12
|
|
|
|
|
|
Safefree(nums); |
|
6125
|
12
|
|
|
|
|
|
nums = NULL; |
|
6126
|
12
|
100
|
|
|
|
|
RETVAL = median_val; |
|
6127
|
|
|
|
|
|
|
OUTPUT: |
|
6128
|
|
|
|
|
|
|
RETVAL |
|
6129
|
|
|
|
|
|
|
|
|
6130
|
|
|
|
|
|
|
SV* cor(SV* x_sv, SV* y_sv = &PL_sv_undef, const char* method = "pearson") |
|
6131
|
|
|
|
|
|
|
INIT: |
|
6132
|
|
|
|
|
|
|
// --- validate method ------------------------------------------- |
|
6133
|
70
|
100
|
|
|
|
|
if (strcmp(method, "pearson") != 0 && |
|
6134
|
11
|
100
|
|
|
|
|
strcmp(method, "spearman") != 0 && |
|
6135
|
5
|
100
|
|
|
|
|
strcmp(method, "kendall") != 0) |
|
6136
|
1
|
|
|
|
|
|
croak("cor: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", |
|
6137
|
|
|
|
|
|
|
method); |
|
6138
|
|
|
|
|
|
|
|
|
6139
|
|
|
|
|
|
|
// --- validate x ------------------------------------------------ |
|
6140
|
69
|
50
|
|
|
|
|
if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
6141
|
0
|
|
|
|
|
|
croak("cor: x must be an ARRAY reference"); |
|
6142
|
|
|
|
|
|
|
|
|
6143
|
69
|
|
|
|
|
|
AV*restrict x_av = (AV*)SvRV(x_sv); |
|
6144
|
69
|
|
|
|
|
|
size_t nx = av_len(x_av) + 1; |
|
6145
|
69
|
50
|
|
|
|
|
if (nx == 0) croak("cor: x is empty"); |
|
6146
|
|
|
|
|
|
|
|
|
6147
|
|
|
|
|
|
|
// --- detect whether x is a flat vector or a matrix (AoA) ------- |
|
6148
|
69
|
|
|
|
|
|
bool x_is_matrix = 0; |
|
6149
|
|
|
|
|
|
|
{ |
|
6150
|
69
|
|
|
|
|
|
SV**restrict fp = av_fetch(x_av, 0, 0); |
|
6151
|
69
|
50
|
|
|
|
|
if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV) |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6152
|
1
|
|
|
|
|
|
x_is_matrix = 1; |
|
6153
|
|
|
|
|
|
|
} |
|
6154
|
|
|
|
|
|
|
|
|
6155
|
|
|
|
|
|
|
// --- detect y ---------------------------- |
|
6156
|
138
|
50
|
|
|
|
|
bool has_y = (SvOK(y_sv) && SvROK(y_sv) && |
|
|
|
50
|
|
|
|
|
|
|
6157
|
69
|
50
|
|
|
|
|
SvTYPE(SvRV(y_sv)) == SVt_PVAV); |
|
6158
|
|
|
|
|
|
|
|
|
6159
|
69
|
50
|
|
|
|
|
AV*restrict y_av = has_y ? (AV*)SvRV(y_sv) : NULL; |
|
6160
|
69
|
50
|
|
|
|
|
size_t ny = has_y ? av_len(y_av) + 1 : 0; |
|
6161
|
|
|
|
|
|
|
|
|
6162
|
69
|
|
|
|
|
|
bool y_is_matrix = 0; |
|
6163
|
69
|
50
|
|
|
|
|
if (has_y && ny > 0) { |
|
|
|
50
|
|
|
|
|
|
|
6164
|
69
|
|
|
|
|
|
SV**restrict fp = av_fetch(y_av, 0, 0); |
|
6165
|
69
|
50
|
|
|
|
|
if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV) |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6166
|
1
|
|
|
|
|
|
y_is_matrix = 1; |
|
6167
|
|
|
|
|
|
|
} |
|
6168
|
|
|
|
|
|
|
|
|
6169
|
|
|
|
|
|
|
CODE: |
|
6170
|
|
|
|
|
|
|
// Branch 1: both inputs are flat vectors → scalar result |
|
6171
|
69
|
100
|
|
|
|
|
if (!x_is_matrix && !y_is_matrix) { |
|
|
|
50
|
|
|
|
|
|
|
6172
|
68
|
50
|
|
|
|
|
if (!has_y) { |
|
6173
|
|
|
|
|
|
|
/* cor(vector) == 1 by definition */ |
|
6174
|
0
|
|
|
|
|
|
RETVAL = newSVnv(1.0); |
|
6175
|
|
|
|
|
|
|
} else { |
|
6176
|
68
|
100
|
|
|
|
|
if (nx != ny) |
|
6177
|
1
|
|
|
|
|
|
croak("cor: x and y must have the same length (%lu vs %lu)", |
|
6178
|
|
|
|
|
|
|
nx, ny); |
|
6179
|
67
|
50
|
|
|
|
|
if (nx < 2) |
|
6180
|
0
|
|
|
|
|
|
croak("cor: need at least 2 observations"); |
|
6181
|
|
|
|
|
|
|
double *restrict xd, *restrict yd; |
|
6182
|
67
|
50
|
|
|
|
|
Newx(xd, nx, double); |
|
6183
|
67
|
50
|
|
|
|
|
Newx(yd, ny, double); |
|
6184
|
67
|
|
|
|
|
|
bool x_sd0 = 1, y_sd0 = 1; |
|
6185
|
67
|
|
|
|
|
|
double x_first = NAN, y_first = NAN; |
|
6186
|
385
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
6187
|
318
|
|
|
|
|
|
SV**restrict tv = av_fetch(x_av, i, 0); |
|
6188
|
318
|
50
|
|
|
|
|
double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6189
|
318
|
|
|
|
|
|
xd[i] = val; |
|
6190
|
318
|
50
|
|
|
|
|
if (!isnan(val)) { |
|
6191
|
318
|
100
|
|
|
|
|
if (isnan(x_first)) x_first = val; |
|
6192
|
251
|
100
|
|
|
|
|
else if (val != x_first) x_sd0 = 0; |
|
6193
|
|
|
|
|
|
|
} |
|
6194
|
|
|
|
|
|
|
} |
|
6195
|
385
|
100
|
|
|
|
|
for (size_t i = 0; i < ny; i++) { |
|
6196
|
318
|
|
|
|
|
|
SV**restrict tv = av_fetch(y_av, i, 0); |
|
6197
|
318
|
50
|
|
|
|
|
double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6198
|
318
|
|
|
|
|
|
yd[i] = val; |
|
6199
|
318
|
50
|
|
|
|
|
if (!isnan(val)) { |
|
6200
|
318
|
100
|
|
|
|
|
if (isnan(y_first)) y_first = val; |
|
6201
|
251
|
100
|
|
|
|
|
else if (val != y_first) y_sd0 = 0; |
|
6202
|
|
|
|
|
|
|
} |
|
6203
|
|
|
|
|
|
|
} |
|
6204
|
67
|
100
|
|
|
|
|
if (x_sd0 || y_sd0) { |
|
|
|
50
|
|
|
|
|
|
|
6205
|
9
|
|
|
|
|
|
Safefree(xd); Safefree(yd); |
|
6206
|
9
|
50
|
|
|
|
|
if (x_sd0) croak("cor: standard deviation of x is 0"); |
|
6207
|
0
|
|
|
|
|
|
croak("cor: standard deviation of y is 0"); |
|
6208
|
|
|
|
|
|
|
} |
|
6209
|
58
|
|
|
|
|
|
double r = compute_cor(xd, yd, nx, method); |
|
6210
|
58
|
|
|
|
|
|
Safefree(xd); Safefree(yd); |
|
6211
|
58
|
|
|
|
|
|
RETVAL = newSVnv(r); |
|
6212
|
|
|
|
|
|
|
} |
|
6213
|
|
|
|
|
|
|
} else {//Branch 2: x is a matrix (or y is a matrix) → AoA result |
|
6214
|
|
|
|
|
|
|
// -- resolve x matrix dimensions |
|
6215
|
1
|
50
|
|
|
|
|
if (!x_is_matrix) |
|
6216
|
0
|
|
|
|
|
|
croak("cor: x must be a matrix (array ref of array refs) " |
|
6217
|
|
|
|
|
|
|
"when y is a matrix"); |
|
6218
|
|
|
|
|
|
|
|
|
6219
|
1
|
|
|
|
|
|
SV**restrict xr0 = av_fetch(x_av, 0, 0); |
|
6220
|
1
|
50
|
|
|
|
|
if (!xr0 || !SvROK(*xr0) || SvTYPE(SvRV(*xr0)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6221
|
0
|
|
|
|
|
|
croak("cor: each row of x must be an ARRAY reference"); |
|
6222
|
|
|
|
|
|
|
|
|
6223
|
1
|
|
|
|
|
|
size_t ncols_x = av_len((AV*)SvRV(*xr0)) + 1; |
|
6224
|
1
|
50
|
|
|
|
|
if (ncols_x == 0) croak("cor: x matrix has zero columns"); |
|
6225
|
|
|
|
|
|
|
|
|
6226
|
1
|
|
|
|
|
|
size_t nrows = nx; /* observations */ |
|
6227
|
|
|
|
|
|
|
|
|
6228
|
|
|
|
|
|
|
// PRE-VALIDATION PASS: Ensure all rows are arrays to prevent memory leaks on croak |
|
6229
|
4
|
100
|
|
|
|
|
for (size_t i = 0; i < nrows; i++) { |
|
6230
|
3
|
|
|
|
|
|
SV**restrict rv = av_fetch(x_av, i, 0); |
|
6231
|
3
|
50
|
|
|
|
|
if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6232
|
0
|
|
|
|
|
|
croak("cor: x row %lu is not an array ref", i); |
|
6233
|
|
|
|
|
|
|
} |
|
6234
|
|
|
|
|
|
|
|
|
6235
|
1
|
50
|
|
|
|
|
if (has_y && y_is_matrix) { |
|
|
|
50
|
|
|
|
|
|
|
6236
|
1
|
50
|
|
|
|
|
if (ny != nrows) croak("cor: x and y must have the same number of rows (%lu vs %lu)", nrows, ny); |
|
6237
|
4
|
100
|
|
|
|
|
for (size_t i = 0; i < nrows; i++) { |
|
6238
|
3
|
|
|
|
|
|
SV**restrict rv = av_fetch(y_av, i, 0); |
|
6239
|
3
|
50
|
|
|
|
|
if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6240
|
0
|
|
|
|
|
|
croak("cor: y row %lu is not an array ref", i); |
|
6241
|
|
|
|
|
|
|
} |
|
6242
|
|
|
|
|
|
|
} |
|
6243
|
|
|
|
|
|
|
// -- extract x columns |
|
6244
|
|
|
|
|
|
|
NV **restrict col_x; |
|
6245
|
1
|
50
|
|
|
|
|
Newx(col_x, ncols_x, NV*); |
|
6246
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < ncols_x; j++) { |
|
6247
|
2
|
50
|
|
|
|
|
Newx(col_x[j], nrows, NV); |
|
6248
|
2
|
|
|
|
|
|
bool sd0 = 1; |
|
6249
|
2
|
|
|
|
|
|
NV first = NAN; |
|
6250
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < nrows; i++) { |
|
6251
|
6
|
|
|
|
|
|
SV**restrict rv = av_fetch(x_av, i, 0); |
|
6252
|
6
|
|
|
|
|
|
AV*restrict row = (AV*)SvRV(*rv); |
|
6253
|
6
|
|
|
|
|
|
SV**restrict cv = av_fetch(row, j, 0); |
|
6254
|
6
|
50
|
|
|
|
|
NV val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6255
|
6
|
|
|
|
|
|
col_x[j][i] = val; |
|
6256
|
6
|
50
|
|
|
|
|
if (!isnan(val)) { |
|
6257
|
6
|
100
|
|
|
|
|
if (isnan(first)) first = val; |
|
6258
|
4
|
50
|
|
|
|
|
else if (val != first) sd0 = 0; |
|
6259
|
|
|
|
|
|
|
} |
|
6260
|
|
|
|
|
|
|
} |
|
6261
|
2
|
50
|
|
|
|
|
if (sd0) { |
|
6262
|
0
|
0
|
|
|
|
|
for (size_t k = 0; k <= j; k++) Safefree(col_x[k]); |
|
6263
|
0
|
|
|
|
|
|
Safefree(col_x); |
|
6264
|
0
|
|
|
|
|
|
croak("cor: standard deviation is 0 in x column %lu", j); |
|
6265
|
|
|
|
|
|
|
} |
|
6266
|
|
|
|
|
|
|
} |
|
6267
|
|
|
|
|
|
|
// -- resolve y: separate matrix or re-use x (symmetric) |
|
6268
|
|
|
|
|
|
|
size_t ncols_y; |
|
6269
|
1
|
|
|
|
|
|
NV **restrict col_y = NULL; |
|
6270
|
1
|
|
|
|
|
|
bool symmetric = 0; |
|
6271
|
|
|
|
|
|
|
// 1 = cor(X) — result is symmetric |
|
6272
|
2
|
50
|
|
|
|
|
if (has_y && y_is_matrix) { |
|
|
|
50
|
|
|
|
|
|
|
6273
|
|
|
|
|
|
|
// cross-correlation: X (nrows × p) vs Y (nrows × q) |
|
6274
|
1
|
|
|
|
|
|
SV**restrict yr0 = av_fetch(y_av, 0, 0); |
|
6275
|
1
|
|
|
|
|
|
ncols_y = av_len((AV*)SvRV(*yr0)) + 1; |
|
6276
|
1
|
50
|
|
|
|
|
if (ncols_y == 0) croak("cor: y matrix has zero columns"); |
|
6277
|
|
|
|
|
|
|
|
|
6278
|
1
|
50
|
|
|
|
|
Newx(col_y, ncols_y, NV*); |
|
6279
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < ncols_y; j++) { |
|
6280
|
2
|
50
|
|
|
|
|
Newx(col_y[j], nrows, NV); |
|
6281
|
2
|
|
|
|
|
|
bool sd0 = 1; |
|
6282
|
2
|
|
|
|
|
|
NV first = NAN; |
|
6283
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < nrows; i++) { |
|
6284
|
6
|
|
|
|
|
|
SV**restrict rv = av_fetch(y_av, i, 0); |
|
6285
|
6
|
|
|
|
|
|
AV*restrict row = (AV*)SvRV(*rv); |
|
6286
|
6
|
|
|
|
|
|
SV**restrict cv = av_fetch(row, j, 0); |
|
6287
|
6
|
50
|
|
|
|
|
NV val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6288
|
6
|
|
|
|
|
|
col_y[j][i] = val; |
|
6289
|
6
|
50
|
|
|
|
|
if (!isnan(val)) { |
|
6290
|
6
|
100
|
|
|
|
|
if (isnan(first)) first = val; |
|
6291
|
4
|
50
|
|
|
|
|
else if (val != first) sd0 = 0; |
|
6292
|
|
|
|
|
|
|
} |
|
6293
|
|
|
|
|
|
|
} |
|
6294
|
2
|
50
|
|
|
|
|
if (sd0) { |
|
6295
|
0
|
0
|
|
|
|
|
for (size_t k = 0; k < ncols_x; k++) Safefree(col_x[k]); |
|
6296
|
0
|
|
|
|
|
|
Safefree(col_x); |
|
6297
|
0
|
0
|
|
|
|
|
for (size_t k = 0; k <= j; k++) Safefree(col_y[k]); |
|
6298
|
0
|
|
|
|
|
|
Safefree(col_y); |
|
6299
|
0
|
|
|
|
|
|
croak("cor: standard deviation is 0 in y column %lu", j); |
|
6300
|
|
|
|
|
|
|
} |
|
6301
|
|
|
|
|
|
|
} |
|
6302
|
|
|
|
|
|
|
} else { // cor(X) — symmetric p×p result; share column arrays |
|
6303
|
0
|
|
|
|
|
|
ncols_y = ncols_x; |
|
6304
|
0
|
|
|
|
|
|
col_y = col_x; |
|
6305
|
0
|
|
|
|
|
|
symmetric = 1; |
|
6306
|
|
|
|
|
|
|
} |
|
6307
|
1
|
50
|
|
|
|
|
if (nrows < 2) |
|
6308
|
0
|
|
|
|
|
|
croak("cor: need at least 2 observations (got %lu)", nrows); |
|
6309
|
|
|
|
|
|
|
// -- build cache for symmetric case: compute upper triangle, store results, mirror to lower triangle |
|
6310
|
1
|
|
|
|
|
|
AV*restrict result_av = newAV(); |
|
6311
|
1
|
|
|
|
|
|
av_extend(result_av, ncols_x - 1); |
|
6312
|
|
|
|
|
|
|
// Allocate per-row AVs up front so we can fill them in order |
|
6313
|
|
|
|
|
|
|
AV **restrict rows_out; |
|
6314
|
1
|
50
|
|
|
|
|
Newx(rows_out, ncols_x, AV*); |
|
6315
|
3
|
100
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) { |
|
6316
|
2
|
|
|
|
|
|
rows_out[i] = newAV(); |
|
6317
|
2
|
|
|
|
|
|
av_extend(rows_out[i], ncols_y - 1); |
|
6318
|
|
|
|
|
|
|
} |
|
6319
|
1
|
50
|
|
|
|
|
if (symmetric) { |
|
6320
|
|
|
|
|
|
|
/* Upper triangle + diagonal, then mirror. r_cache[i][j] (j >= i) holds the computed value. */ |
|
6321
|
|
|
|
|
|
|
NV **restrict r_cache; |
|
6322
|
0
|
0
|
|
|
|
|
Newx(r_cache, ncols_x, NV*); |
|
6323
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) |
|
6324
|
0
|
0
|
|
|
|
|
Newx(r_cache[i], ncols_x, NV); |
|
6325
|
|
|
|
|
|
|
|
|
6326
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) { |
|
6327
|
0
|
|
|
|
|
|
r_cache[i][i] = 1.0; // diagonal |
|
6328
|
0
|
0
|
|
|
|
|
for (size_t j = i + 1; j < ncols_x; j++) { |
|
6329
|
0
|
|
|
|
|
|
NV r = compute_cor(col_x[i], col_x[j], nrows, method); |
|
6330
|
0
|
|
|
|
|
|
r_cache[i][j] = r; |
|
6331
|
0
|
|
|
|
|
|
r_cache[j][i] = r; // symmetry |
|
6332
|
|
|
|
|
|
|
} |
|
6333
|
|
|
|
|
|
|
} |
|
6334
|
|
|
|
|
|
|
// fill output AoA from cache |
|
6335
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) |
|
6336
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < ncols_x; j++) |
|
6337
|
0
|
|
|
|
|
|
av_store(rows_out[i], j, newSVnv(r_cache[i][j])); |
|
6338
|
|
|
|
|
|
|
|
|
6339
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) Safefree(r_cache[i]); |
|
6340
|
0
|
|
|
|
|
|
Safefree(r_cache); r_cache = NULL; |
|
6341
|
|
|
|
|
|
|
} else { |
|
6342
|
|
|
|
|
|
|
// cross-correlation: every (i,j) pair is independent |
|
6343
|
3
|
100
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) |
|
6344
|
6
|
100
|
|
|
|
|
for (size_t j = 0; j < ncols_y; j++) |
|
6345
|
4
|
|
|
|
|
|
av_store(rows_out[i], j, newSVnv(compute_cor(col_x[i], col_y[j], nrows, method))); |
|
6346
|
|
|
|
|
|
|
} |
|
6347
|
|
|
|
|
|
|
// push row AVs into result |
|
6348
|
3
|
100
|
|
|
|
|
for (size_t i = 0; i < ncols_x; i++) |
|
6349
|
2
|
|
|
|
|
|
av_store(result_av, i, newRV_noinc((SV*)rows_out[i])); |
|
6350
|
1
|
|
|
|
|
|
Safefree(rows_out); rows_out = NULL; |
|
6351
|
|
|
|
|
|
|
// -- free column arrays ------------------------------------- |
|
6352
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < ncols_x; j++) Safefree(col_x[j]); |
|
6353
|
1
|
|
|
|
|
|
Safefree(col_x); col_x = NULL; |
|
6354
|
1
|
50
|
|
|
|
|
if (!symmetric) { |
|
6355
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < ncols_y; j++) Safefree(col_y[j]); |
|
6356
|
1
|
|
|
|
|
|
Safefree(col_y); |
|
6357
|
|
|
|
|
|
|
} |
|
6358
|
1
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)result_av); |
|
6359
|
|
|
|
|
|
|
} |
|
6360
|
|
|
|
|
|
|
OUTPUT: |
|
6361
|
|
|
|
|
|
|
RETVAL |
|
6362
|
|
|
|
|
|
|
|
|
6363
|
|
|
|
|
|
|
void scale(...) |
|
6364
|
|
|
|
|
|
|
PROTOTYPE: @ |
|
6365
|
|
|
|
|
|
|
PPCODE: |
|
6366
|
|
|
|
|
|
|
{ |
|
6367
|
5
|
|
|
|
|
|
bool do_center_mean = TRUE, do_scale_sd = TRUE; |
|
6368
|
5
|
|
|
|
|
|
NV center_val = 0.0, scale_val = 1.0; |
|
6369
|
5
|
|
|
|
|
|
size_t data_items = items; |
|
6370
|
|
|
|
|
|
|
// 1. Parse Options Hash (if it exists as the last argument) |
|
6371
|
5
|
50
|
|
|
|
|
if (items > 0) { |
|
6372
|
5
|
|
|
|
|
|
SV*restrict last_arg = ST(items - 1); |
|
6373
|
5
|
100
|
|
|
|
|
if (SvROK(last_arg) && SvTYPE(SvRV(last_arg)) == SVt_PVHV) { |
|
|
|
100
|
|
|
|
|
|
|
6374
|
2
|
|
|
|
|
|
data_items = items - 1; // Exclude hash from data processing |
|
6375
|
2
|
|
|
|
|
|
HV*restrict opt_hv = (HV*)SvRV(last_arg); |
|
6376
|
|
|
|
|
|
|
// --- Parse 'center' |
|
6377
|
2
|
|
|
|
|
|
SV**restrict center_sv = hv_fetch(opt_hv, "center", 6, 0); |
|
6378
|
2
|
50
|
|
|
|
|
if (center_sv) { |
|
6379
|
2
|
|
|
|
|
|
SV*restrict val_sv = *center_sv; |
|
6380
|
2
|
50
|
|
|
|
|
if (!SvOK(val_sv)) { |
|
6381
|
0
|
|
|
|
|
|
do_center_mean = FALSE; center_val = 0.0; |
|
6382
|
|
|
|
|
|
|
} else { |
|
6383
|
2
|
|
|
|
|
|
char *restrict str = SvPV_nolen(val_sv); |
|
6384
|
|
|
|
|
|
|
/* Trap booleans and empty strings before numeric checks */ |
|
6385
|
2
|
50
|
|
|
|
|
if (strcasecmp(str, "mean") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
6386
|
1
|
|
|
|
|
|
do_center_mean = TRUE; |
|
6387
|
1
|
50
|
|
|
|
|
} else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6388
|
1
|
|
|
|
|
|
do_center_mean = FALSE; center_val = 0.0; |
|
6389
|
0
|
0
|
|
|
|
|
} else if (looks_like_number(val_sv)) { |
|
6390
|
0
|
|
|
|
|
|
do_center_mean = FALSE; center_val = SvNV(val_sv); |
|
6391
|
0
|
0
|
|
|
|
|
} else if (SvTRUE(val_sv)) { |
|
6392
|
0
|
|
|
|
|
|
do_center_mean = TRUE; |
|
6393
|
|
|
|
|
|
|
} else { |
|
6394
|
0
|
|
|
|
|
|
do_center_mean = FALSE; center_val = 0.0; |
|
6395
|
|
|
|
|
|
|
} |
|
6396
|
|
|
|
|
|
|
} |
|
6397
|
|
|
|
|
|
|
} |
|
6398
|
|
|
|
|
|
|
// --- Parse 'scale' --- |
|
6399
|
2
|
|
|
|
|
|
SV**restrict scale_sv = hv_fetch(opt_hv, "scale", 5, 0); |
|
6400
|
2
|
100
|
|
|
|
|
if (scale_sv) { |
|
6401
|
1
|
|
|
|
|
|
SV*restrict val_sv = *scale_sv; |
|
6402
|
1
|
50
|
|
|
|
|
if (!SvOK(val_sv)) { |
|
6403
|
0
|
|
|
|
|
|
do_scale_sd = FALSE; scale_val = 1.0; |
|
6404
|
|
|
|
|
|
|
} else { |
|
6405
|
1
|
|
|
|
|
|
char *restrict str = SvPV_nolen(val_sv); |
|
6406
|
1
|
50
|
|
|
|
|
if (strcasecmp(str, "sd") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6407
|
0
|
|
|
|
|
|
do_scale_sd = TRUE; |
|
6408
|
1
|
50
|
|
|
|
|
} else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6409
|
1
|
|
|
|
|
|
do_scale_sd = FALSE; scale_val = 1.0; |
|
6410
|
0
|
0
|
|
|
|
|
} else if (looks_like_number(val_sv)) { |
|
6411
|
0
|
|
|
|
|
|
do_scale_sd = FALSE; scale_val = SvNV(val_sv); |
|
6412
|
0
|
0
|
|
|
|
|
if (scale_val == 0.0) scale_val = 1.0; /* Prevent Division By Zero */ |
|
6413
|
0
|
0
|
|
|
|
|
} else if (SvTRUE(val_sv)) { |
|
6414
|
0
|
|
|
|
|
|
do_scale_sd = TRUE; |
|
6415
|
|
|
|
|
|
|
} else { |
|
6416
|
0
|
|
|
|
|
|
do_scale_sd = FALSE; scale_val = 1.0; |
|
6417
|
|
|
|
|
|
|
} |
|
6418
|
|
|
|
|
|
|
} |
|
6419
|
|
|
|
|
|
|
} |
|
6420
|
|
|
|
|
|
|
} |
|
6421
|
|
|
|
|
|
|
} |
|
6422
|
|
|
|
|
|
|
// 2. Detect if the input is a Matrix (Array of Arrays) |
|
6423
|
5
|
|
|
|
|
|
bool is_matrix = FALSE; |
|
6424
|
5
|
100
|
|
|
|
|
if (data_items == 1) { |
|
6425
|
2
|
|
|
|
|
|
SV*restrict first_arg = ST(0); |
|
6426
|
2
|
100
|
|
|
|
|
if (SvROK(first_arg) && SvTYPE(SvRV(first_arg)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
6427
|
1
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(first_arg); |
|
6428
|
1
|
50
|
|
|
|
|
if (av_len(av) >= 0) { |
|
6429
|
1
|
|
|
|
|
|
SV**restrict first_elem = av_fetch(av, 0, 0); |
|
6430
|
1
|
50
|
|
|
|
|
if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6431
|
1
|
|
|
|
|
|
is_matrix = TRUE; |
|
6432
|
|
|
|
|
|
|
} |
|
6433
|
|
|
|
|
|
|
} |
|
6434
|
|
|
|
|
|
|
} |
|
6435
|
|
|
|
|
|
|
} |
|
6436
|
5
|
100
|
|
|
|
|
if (is_matrix) { |
|
6437
|
|
|
|
|
|
|
// MATRIX MODE: Scale columns independently (Just like R) |
|
6438
|
1
|
|
|
|
|
|
AV*restrict mat_av = (AV*)SvRV(ST(0)); |
|
6439
|
1
|
|
|
|
|
|
size_t nrow = av_len(mat_av) + 1, ncol = 0; |
|
6440
|
1
|
|
|
|
|
|
SV**restrict first_row = av_fetch(mat_av, 0, 0); |
|
6441
|
1
|
|
|
|
|
|
ncol = av_len((AV*)SvRV(*first_row)) + 1; |
|
6442
|
1
|
50
|
|
|
|
|
if (nrow == 0 || ncol == 0) croak("scale requires non-empty matrix"); |
|
|
|
50
|
|
|
|
|
|
|
6443
|
|
|
|
|
|
|
// Create a new matrix for the scaled output |
|
6444
|
1
|
|
|
|
|
|
AV*restrict result_av = newAV(); |
|
6445
|
1
|
|
|
|
|
|
av_extend(result_av, nrow - 1); |
|
6446
|
1
|
|
|
|
|
|
AV**restrict row_ptrs = (AV**)safemalloc(nrow * sizeof(AV*)); |
|
6447
|
4
|
100
|
|
|
|
|
for (size_t r = 0; r < nrow; r++) { |
|
6448
|
3
|
|
|
|
|
|
row_ptrs[r] = newAV(); |
|
6449
|
3
|
|
|
|
|
|
av_extend(row_ptrs[r], ncol - 1); |
|
6450
|
3
|
|
|
|
|
|
av_push(result_av, newRV_noinc((SV*)row_ptrs[r])); |
|
6451
|
|
|
|
|
|
|
} |
|
6452
|
|
|
|
|
|
|
// Calculate and apply scale per column |
|
6453
|
3
|
100
|
|
|
|
|
for (size_t c = 0; c < ncol; c++) { |
|
6454
|
2
|
|
|
|
|
|
NV col_sum = 0.0; |
|
6455
|
|
|
|
|
|
|
NV *restrict col_data; |
|
6456
|
2
|
50
|
|
|
|
|
Newx(col_data, nrow, NV); |
|
6457
|
|
|
|
|
|
|
// Extract the column data |
|
6458
|
8
|
100
|
|
|
|
|
for (size_t r = 0; r < nrow; r++) { |
|
6459
|
6
|
|
|
|
|
|
SV**restrict row_sv = av_fetch(mat_av, r, 0); |
|
6460
|
6
|
50
|
|
|
|
|
if (row_sv && SvROK(*row_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
6461
|
6
|
|
|
|
|
|
AV*restrict row_av = (AV*)SvRV(*row_sv); |
|
6462
|
6
|
|
|
|
|
|
SV**restrict cell_sv = av_fetch(row_av, c, 0); |
|
6463
|
6
|
50
|
|
|
|
|
col_data[r] = (cell_sv && SvOK(*cell_sv)) ? SvNV(*cell_sv) : 0.0; |
|
|
|
50
|
|
|
|
|
|
|
6464
|
|
|
|
|
|
|
} else { |
|
6465
|
0
|
|
|
|
|
|
col_data[r] = 0.0; |
|
6466
|
|
|
|
|
|
|
} |
|
6467
|
6
|
|
|
|
|
|
col_sum += col_data[r]; |
|
6468
|
|
|
|
|
|
|
} |
|
6469
|
|
|
|
|
|
|
|
|
6470
|
2
|
50
|
|
|
|
|
NV col_center = do_center_mean ? (col_sum / nrow) : center_val; |
|
6471
|
2
|
|
|
|
|
|
NV col_scale = scale_val; |
|
6472
|
|
|
|
|
|
|
// Calculate Standard Deviation for this specific column if needed |
|
6473
|
2
|
50
|
|
|
|
|
if (do_scale_sd) { |
|
6474
|
2
|
50
|
|
|
|
|
if (nrow <= 1) { |
|
6475
|
0
|
|
|
|
|
|
Safefree(col_data); |
|
6476
|
0
|
|
|
|
|
|
safefree(row_ptrs); |
|
6477
|
0
|
|
|
|
|
|
croak("scale needs >= 2 rows to calculate standard deviation for a matrix column"); |
|
6478
|
|
|
|
|
|
|
} |
|
6479
|
2
|
|
|
|
|
|
NV sum_sq = 0.0; |
|
6480
|
8
|
100
|
|
|
|
|
for (size_t r = 0; r < nrow; r++) { |
|
6481
|
6
|
|
|
|
|
|
NV diff = col_data[r] - col_center; |
|
6482
|
6
|
|
|
|
|
|
sum_sq += diff * diff; |
|
6483
|
|
|
|
|
|
|
} |
|
6484
|
2
|
|
|
|
|
|
col_scale = sqrt(sum_sq / (nrow - 1)); |
|
6485
|
|
|
|
|
|
|
} |
|
6486
|
|
|
|
|
|
|
// Store scaled values back into the new matrix rows |
|
6487
|
8
|
100
|
|
|
|
|
for (size_t r = 0; r < nrow; r++) { |
|
6488
|
6
|
|
|
|
|
|
NV centered = col_data[r] - col_center; |
|
6489
|
6
|
50
|
|
|
|
|
NV final_val = (col_scale == 0.0) ? (0.0 / 0.0) : (centered / col_scale); |
|
6490
|
6
|
|
|
|
|
|
av_store(row_ptrs[r], c, newSVnv(final_val)); |
|
6491
|
|
|
|
|
|
|
} |
|
6492
|
2
|
|
|
|
|
|
Safefree(col_data); |
|
6493
|
|
|
|
|
|
|
} |
|
6494
|
1
|
|
|
|
|
|
safefree(row_ptrs); |
|
6495
|
|
|
|
|
|
|
// Push the resulting matrix as a single Reference onto the Perl stack |
|
6496
|
1
|
50
|
|
|
|
|
EXTEND(SP, 1); |
|
6497
|
1
|
|
|
|
|
|
PUSHs(sv_2mortal(newRV_noinc((SV*)result_av))); |
|
6498
|
|
|
|
|
|
|
} else { |
|
6499
|
|
|
|
|
|
|
// FLAT LIST MODE: Original functionality |
|
6500
|
4
|
|
|
|
|
|
size_t total_count = 0, k = 0; |
|
6501
|
|
|
|
|
|
|
NV *restrict nums; |
|
6502
|
4
|
|
|
|
|
|
NV sum = 0.0; |
|
6503
|
20
|
100
|
|
|
|
|
for (size_t i = 0; i < data_items; i++) { |
|
6504
|
16
|
|
|
|
|
|
SV*restrict arg = ST(i); |
|
6505
|
16
|
50
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
0
|
|
|
|
|
|
|
6506
|
0
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(arg); |
|
6507
|
0
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
6508
|
0
|
0
|
|
|
|
|
for (unsigned int j = 0; j < len; j++) { |
|
6509
|
0
|
|
|
|
|
|
SV**restrict tv = av_fetch(av, j, 0); |
|
6510
|
0
|
0
|
|
|
|
|
if (tv && SvOK(*tv)) { total_count++; } |
|
|
|
0
|
|
|
|
|
|
|
6511
|
|
|
|
|
|
|
} |
|
6512
|
16
|
50
|
|
|
|
|
} else if (SvOK(arg)) { |
|
6513
|
16
|
|
|
|
|
|
total_count++; |
|
6514
|
|
|
|
|
|
|
} |
|
6515
|
|
|
|
|
|
|
} |
|
6516
|
4
|
50
|
|
|
|
|
if (total_count == 0) croak("scale requires at least 1 numeric element"); |
|
6517
|
4
|
50
|
|
|
|
|
Newx(nums, total_count, NV); |
|
6518
|
20
|
100
|
|
|
|
|
for (size_t i = 0; i < data_items; i++) { |
|
6519
|
16
|
|
|
|
|
|
SV*restrict arg = ST(i); |
|
6520
|
16
|
50
|
|
|
|
|
if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) { |
|
|
|
0
|
|
|
|
|
|
|
6521
|
0
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(arg); |
|
6522
|
0
|
|
|
|
|
|
size_t len = av_len(av) + 1; |
|
6523
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < len; j++) { |
|
6524
|
0
|
|
|
|
|
|
SV**restrict tv = av_fetch(av, j, 0); |
|
6525
|
0
|
0
|
|
|
|
|
if (tv && SvOK(*tv)) { |
|
|
|
0
|
|
|
|
|
|
|
6526
|
0
|
|
|
|
|
|
double val = SvNV(*tv); |
|
6527
|
0
|
|
|
|
|
|
nums[k++] = val; sum += val; |
|
6528
|
|
|
|
|
|
|
} |
|
6529
|
|
|
|
|
|
|
} |
|
6530
|
16
|
50
|
|
|
|
|
} else if (SvOK(arg)) { |
|
6531
|
16
|
|
|
|
|
|
NV val = SvNV(arg); |
|
6532
|
16
|
|
|
|
|
|
nums[k++] = val; sum += val; |
|
6533
|
|
|
|
|
|
|
} |
|
6534
|
|
|
|
|
|
|
} |
|
6535
|
4
|
100
|
|
|
|
|
if (do_center_mean) center_val = sum / total_count; |
|
6536
|
4
|
100
|
|
|
|
|
if (do_scale_sd) { |
|
6537
|
3
|
100
|
|
|
|
|
if (total_count <= 1) { |
|
6538
|
1
|
|
|
|
|
|
Safefree(nums); |
|
6539
|
1
|
|
|
|
|
|
croak("scale needs >= 2 elements to calculate SD"); |
|
6540
|
|
|
|
|
|
|
} |
|
6541
|
2
|
|
|
|
|
|
NV sum_sq = 0.0; |
|
6542
|
12
|
100
|
|
|
|
|
for (size_t i = 0; i < total_count; i++) { |
|
6543
|
10
|
|
|
|
|
|
NV diff = nums[i] - center_val; |
|
6544
|
10
|
|
|
|
|
|
sum_sq += diff * diff; |
|
6545
|
|
|
|
|
|
|
} |
|
6546
|
2
|
|
|
|
|
|
scale_val = sqrt(sum_sq / (total_count - 1)); |
|
6547
|
|
|
|
|
|
|
} |
|
6548
|
3
|
50
|
|
|
|
|
EXTEND(SP, total_count); |
|
6549
|
18
|
100
|
|
|
|
|
for (size_t i = 0; i < total_count; i++) { |
|
6550
|
15
|
|
|
|
|
|
NV centered = nums[i] - center_val; |
|
6551
|
15
|
50
|
|
|
|
|
NV final_val = (scale_val == 0.0) ? (0.0 / 0.0) : (centered / scale_val); |
|
6552
|
15
|
|
|
|
|
|
PUSHs(sv_2mortal(newSVnv(final_val))); |
|
6553
|
|
|
|
|
|
|
} |
|
6554
|
3
|
|
|
|
|
|
Safefree(nums); nums = NULL; |
|
6555
|
|
|
|
|
|
|
} |
|
6556
|
|
|
|
|
|
|
} |
|
6557
|
|
|
|
|
|
|
|
|
6558
|
|
|
|
|
|
|
SV* matrix(...) |
|
6559
|
|
|
|
|
|
|
CODE: |
|
6560
|
6
|
|
|
|
|
|
SV*restrict data_sv = NULL; |
|
6561
|
6
|
|
|
|
|
|
size_t nrow = 0, ncol = 0; |
|
6562
|
6
|
|
|
|
|
|
bool byrow = FALSE, nrow_set = FALSE, ncol_set = FALSE; |
|
6563
|
|
|
|
|
|
|
|
|
6564
|
|
|
|
|
|
|
/* Hybrid Argument Parser */ |
|
6565
|
6
|
50
|
|
|
|
|
if (items > 0 && SvROK(ST(0)) && SvTYPE(SvRV(ST(0))) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6566
|
|
|
|
|
|
|
/* POSITIONAL: matrix($data_ref, $nrow, $ncol, $byrow) */ |
|
6567
|
1
|
|
|
|
|
|
data_sv = ST(0); |
|
6568
|
1
|
50
|
|
|
|
|
if (items > 1 && SvOK(ST(1))) { |
|
|
|
50
|
|
|
|
|
|
|
6569
|
1
|
|
|
|
|
|
nrow = (size_t)SvUV(ST(1)); |
|
6570
|
1
|
|
|
|
|
|
nrow_set = TRUE; |
|
6571
|
|
|
|
|
|
|
} |
|
6572
|
1
|
50
|
|
|
|
|
if (items > 2 && SvOK(ST(2))) { |
|
|
|
0
|
|
|
|
|
|
|
6573
|
0
|
|
|
|
|
|
ncol = (size_t)SvUV(ST(2)); |
|
6574
|
0
|
|
|
|
|
|
ncol_set = TRUE; |
|
6575
|
|
|
|
|
|
|
} |
|
6576
|
1
|
50
|
|
|
|
|
if (items > 3 && SvOK(ST(3))) { |
|
|
|
0
|
|
|
|
|
|
|
6577
|
0
|
|
|
|
|
|
byrow = SvTRUE(ST(3)); |
|
6578
|
|
|
|
|
|
|
} |
|
6579
|
5
|
50
|
|
|
|
|
} else if (items % 2 == 0) { |
|
6580
|
|
|
|
|
|
|
/* NAMED: matrix(data => [...], nrow => $n, ncol => $m) */ |
|
6581
|
16
|
100
|
|
|
|
|
for (size_t i = 0; i < items; i += 2) { |
|
6582
|
11
|
|
|
|
|
|
char*restrict key = SvPV_nolen(ST(i)); |
|
6583
|
11
|
|
|
|
|
|
SV*restrict val = ST(i + 1); |
|
6584
|
11
|
100
|
|
|
|
|
if (strEQ(key, "data")) { |
|
6585
|
5
|
|
|
|
|
|
data_sv = val; |
|
6586
|
6
|
100
|
|
|
|
|
} else if (strEQ(key, "nrow")) { |
|
6587
|
4
|
50
|
|
|
|
|
if (SvOK(val)) { nrow = (size_t)SvUV(val); nrow_set = TRUE; } |
|
6588
|
2
|
100
|
|
|
|
|
} else if (strEQ(key, "ncol")) { |
|
6589
|
1
|
50
|
|
|
|
|
if (SvOK(val)) { ncol = (size_t)SvUV(val); ncol_set = TRUE; } |
|
6590
|
1
|
50
|
|
|
|
|
} else if (strEQ(key, "byrow")) { |
|
6591
|
1
|
|
|
|
|
|
byrow = SvTRUE(val); |
|
6592
|
|
|
|
|
|
|
} else { |
|
6593
|
0
|
|
|
|
|
|
croak("Unknown option: %s", key); |
|
6594
|
|
|
|
|
|
|
} |
|
6595
|
|
|
|
|
|
|
} |
|
6596
|
|
|
|
|
|
|
} else { |
|
6597
|
0
|
|
|
|
|
|
croak("Usage: matrix($data_ref, $nrow, $ncol, $byrow) OR matrix(data => $data_ref, ...)"); |
|
6598
|
|
|
|
|
|
|
} |
|
6599
|
|
|
|
|
|
|
// Validate data input |
|
6600
|
6
|
50
|
|
|
|
|
if (!data_sv || !SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6601
|
1
|
|
|
|
|
|
croak("The 'data' option must be an array reference (e.g. [1..6] or rnorm(6))"); |
|
6602
|
|
|
|
|
|
|
} |
|
6603
|
5
|
|
|
|
|
|
AV*restrict data_av = (AV*)SvRV(data_sv); |
|
6604
|
5
|
50
|
|
|
|
|
size_t data_len = (UV)(av_top_index(data_av) + 1); |
|
6605
|
5
|
100
|
|
|
|
|
if (data_len == 0) { |
|
6606
|
1
|
|
|
|
|
|
croak("Data array cannot be empty"); |
|
6607
|
|
|
|
|
|
|
} |
|
6608
|
|
|
|
|
|
|
// R-style dimension inference |
|
6609
|
4
|
50
|
|
|
|
|
if (!nrow_set && !ncol_set) { |
|
|
|
0
|
|
|
|
|
|
|
6610
|
0
|
|
|
|
|
|
nrow = data_len; |
|
6611
|
0
|
|
|
|
|
|
ncol = 1; |
|
6612
|
4
|
50
|
|
|
|
|
} else if (nrow_set && !ncol_set) { |
|
|
|
100
|
|
|
|
|
|
|
6613
|
3
|
|
|
|
|
|
ncol = (data_len + nrow - 1) / nrow; |
|
6614
|
1
|
50
|
|
|
|
|
} else if (!nrow_set && ncol_set) { |
|
|
|
0
|
|
|
|
|
|
|
6615
|
0
|
|
|
|
|
|
nrow = (data_len + ncol - 1) / ncol; |
|
6616
|
|
|
|
|
|
|
} |
|
6617
|
|
|
|
|
|
|
// Final safety check for dimensions |
|
6618
|
4
|
100
|
|
|
|
|
if (nrow == 0 || ncol == 0) { |
|
|
|
50
|
|
|
|
|
|
|
6619
|
1
|
|
|
|
|
|
croak("Dimensions must be greater than 0"); |
|
6620
|
|
|
|
|
|
|
} |
|
6621
|
|
|
|
|
|
|
// Create the matrix (Array of Arrays) |
|
6622
|
3
|
|
|
|
|
|
AV*restrict result_av = newAV(); |
|
6623
|
3
|
|
|
|
|
|
av_extend(result_av, nrow - 1); |
|
6624
|
|
|
|
|
|
|
size_t r, c; // Use unsigned types for counters to prevent negative indexing |
|
6625
|
3
|
|
|
|
|
|
AV**restrict row_ptrs = (AV**restrict)safemalloc(nrow * sizeof(AV*)); /* Pre-allocate row pointers */ |
|
6626
|
9
|
100
|
|
|
|
|
for (r = 0; r < nrow; r++) { |
|
6627
|
6
|
|
|
|
|
|
row_ptrs[r] = newAV(); |
|
6628
|
6
|
|
|
|
|
|
av_extend(row_ptrs[r], ncol - 1); |
|
6629
|
6
|
|
|
|
|
|
av_push(result_av, newRV_noinc((SV*)row_ptrs[r])); |
|
6630
|
|
|
|
|
|
|
} |
|
6631
|
|
|
|
|
|
|
// Fill the matrix |
|
6632
|
3
|
|
|
|
|
|
size_t total_cells = nrow * ncol; |
|
6633
|
21
|
100
|
|
|
|
|
for (size_t i = 0; i < total_cells; i++) { |
|
6634
|
|
|
|
|
|
|
// Vector recycling logic |
|
6635
|
18
|
|
|
|
|
|
SV**restrict fetched = av_fetch(data_av, i % data_len, 0); |
|
6636
|
18
|
50
|
|
|
|
|
SV*restrict val = fetched ? newSVsv(*fetched) : newSV(0); |
|
6637
|
18
|
100
|
|
|
|
|
if (byrow) { |
|
6638
|
6
|
|
|
|
|
|
r = i / ncol; |
|
6639
|
6
|
|
|
|
|
|
c = i % ncol; |
|
6640
|
|
|
|
|
|
|
} else { |
|
6641
|
12
|
|
|
|
|
|
r = i % nrow; |
|
6642
|
12
|
|
|
|
|
|
c = i / nrow; |
|
6643
|
|
|
|
|
|
|
} |
|
6644
|
18
|
|
|
|
|
|
av_store(row_ptrs[r], c, val); |
|
6645
|
|
|
|
|
|
|
} |
|
6646
|
3
|
|
|
|
|
|
safefree(row_ptrs); |
|
6647
|
3
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)result_av); |
|
6648
|
|
|
|
|
|
|
OUTPUT: |
|
6649
|
|
|
|
|
|
|
RETVAL |
|
6650
|
|
|
|
|
|
|
|
|
6651
|
|
|
|
|
|
|
SV* lm(...) |
|
6652
|
|
|
|
|
|
|
CODE: |
|
6653
|
|
|
|
|
|
|
{ |
|
6654
|
22
|
|
|
|
|
|
const char *restrict formula = NULL; |
|
6655
|
22
|
|
|
|
|
|
SV *restrict data_sv = NULL; |
|
6656
|
|
|
|
|
|
|
char f_cpy[512]; |
|
6657
|
|
|
|
|
|
|
char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk; |
|
6658
|
22
|
|
|
|
|
|
char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL; |
|
6659
|
22
|
|
|
|
|
|
bool *restrict is_dummy = NULL; |
|
6660
|
22
|
|
|
|
|
|
char **restrict dummy_base = NULL, **restrict dummy_level = NULL; |
|
6661
|
22
|
|
|
|
|
|
unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0; |
|
6662
|
22
|
|
|
|
|
|
size_t n = 0, valid_n = 0, i, j, k, l, l1, l2; |
|
6663
|
22
|
|
|
|
|
|
bool has_intercept = TRUE; |
|
6664
|
22
|
|
|
|
|
|
char **restrict row_names = NULL, **restrict valid_row_names = NULL; |
|
6665
|
22
|
|
|
|
|
|
HV **restrict row_hashes = NULL; |
|
6666
|
22
|
|
|
|
|
|
HV *restrict data_hoa = NULL; |
|
6667
|
22
|
|
|
|
|
|
SV *restrict ref = NULL; |
|
6668
|
22
|
|
|
|
|
|
double *restrict X = NULL, *restrict Y = NULL, *restrict XtX = NULL, *restrict XtY = NULL; |
|
6669
|
22
|
|
|
|
|
|
bool *restrict aliased = NULL; |
|
6670
|
22
|
|
|
|
|
|
double *restrict beta = NULL; |
|
6671
|
22
|
|
|
|
|
|
int final_rank = 0, df_res = 0; |
|
6672
|
|
|
|
|
|
|
HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv; |
|
6673
|
|
|
|
|
|
|
AV *restrict terms_av; |
|
6674
|
22
|
|
|
|
|
|
double rss = 0.0, rse_sq = 0.0; |
|
6675
|
|
|
|
|
|
|
HE *restrict entry; |
|
6676
|
|
|
|
|
|
|
|
|
6677
|
22
|
50
|
|
|
|
|
if (items % 2 != 0) croak("Usage: lm(formula => 'mpg ~ wt * hp', data => \\%%mtcars)"); |
|
6678
|
|
|
|
|
|
|
|
|
6679
|
64
|
100
|
|
|
|
|
for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) { |
|
6680
|
42
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(i_arg)); |
|
6681
|
42
|
|
|
|
|
|
SV *restrict val = ST(i_arg + 1); |
|
6682
|
42
|
100
|
|
|
|
|
if (strEQ(key, "formula")) formula = SvPV_nolen(val); |
|
6683
|
21
|
50
|
|
|
|
|
else if (strEQ(key, "data")) data_sv = val; |
|
6684
|
0
|
|
|
|
|
|
else croak("lm: unknown argument '%s'", key); |
|
6685
|
|
|
|
|
|
|
} |
|
6686
|
22
|
100
|
|
|
|
|
if (!formula) croak("lm: formula is required"); |
|
6687
|
21
|
100
|
|
|
|
|
if (!data_sv || !SvROK(data_sv)) croak("lm: data is required and must be a reference"); |
|
|
|
100
|
|
|
|
|
|
|
6688
|
|
|
|
|
|
|
|
|
6689
|
|
|
|
|
|
|
/* PHASE 1: Data Extraction */ |
|
6690
|
19
|
|
|
|
|
|
ref = SvRV(data_sv); |
|
6691
|
19
|
50
|
|
|
|
|
if (SvTYPE(ref) == SVt_PVHV) { |
|
6692
|
19
|
|
|
|
|
|
HV *restrict hv = (HV*)ref; |
|
6693
|
19
|
50
|
|
|
|
|
if (hv_iterinit(hv) == 0) croak("lm: Data hash is empty"); |
|
6694
|
19
|
|
|
|
|
|
entry = hv_iternext(hv); |
|
6695
|
19
|
50
|
|
|
|
|
if (entry) { |
|
6696
|
19
|
|
|
|
|
|
SV *restrict val = hv_iterval(hv, entry); |
|
6697
|
19
|
50
|
|
|
|
|
if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
6698
|
12
|
|
|
|
|
|
data_hoa = hv; |
|
6699
|
12
|
|
|
|
|
|
n = av_len((AV*)SvRV(val)) + 1; |
|
6700
|
12
|
50
|
|
|
|
|
Newx(row_names, n, char*); |
|
6701
|
82
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6702
|
|
|
|
|
|
|
char buf[32]; |
|
6703
|
70
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1)); |
|
6704
|
70
|
|
|
|
|
|
row_names[i] = savepv(buf); |
|
6705
|
|
|
|
|
|
|
} |
|
6706
|
7
|
50
|
|
|
|
|
} else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
6707
|
7
|
|
|
|
|
|
n = hv_iterinit(hv); |
|
6708
|
7
|
50
|
|
|
|
|
Newx(row_names, n, char*); Newx(row_hashes, n, HV*); |
|
|
|
50
|
|
|
|
|
|
|
6709
|
7
|
|
|
|
|
|
i = 0; |
|
6710
|
231
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
6711
|
|
|
|
|
|
|
I32 len; |
|
6712
|
224
|
|
|
|
|
|
row_names[i] = savepv(hv_iterkey(entry, &len)); |
|
6713
|
224
|
|
|
|
|
|
row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry)); |
|
6714
|
224
|
|
|
|
|
|
i++; |
|
6715
|
|
|
|
|
|
|
} |
|
6716
|
0
|
|
|
|
|
|
} else croak("lm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)"); |
|
6717
|
|
|
|
|
|
|
} |
|
6718
|
0
|
0
|
|
|
|
|
} else if (SvTYPE(ref) == SVt_PVAV) { |
|
6719
|
0
|
|
|
|
|
|
AV *restrict av = (AV*)ref; n = av_len(av) + 1; |
|
6720
|
0
|
0
|
|
|
|
|
Newx(row_names, n, char*); |
|
6721
|
0
|
0
|
|
|
|
|
Newx(row_hashes, n, HV*); |
|
6722
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
6723
|
0
|
|
|
|
|
|
SV **restrict val = av_fetch(av, i, 0); |
|
6724
|
0
|
0
|
|
|
|
|
if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6725
|
0
|
|
|
|
|
|
row_hashes[i] = (HV*)SvRV(*val); |
|
6726
|
0
|
|
|
|
|
|
char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1)); |
|
6727
|
0
|
|
|
|
|
|
row_names[i] = savepv(buf); |
|
6728
|
|
|
|
|
|
|
} else { |
|
6729
|
0
|
0
|
|
|
|
|
for (k = 0; k < i; k++) Safefree(row_names[k]); |
|
6730
|
0
|
|
|
|
|
|
Safefree(row_names); Safefree(row_hashes); |
|
6731
|
0
|
|
|
|
|
|
croak("lm: Array values must be HashRefs (AoH)"); |
|
6732
|
|
|
|
|
|
|
} |
|
6733
|
|
|
|
|
|
|
} |
|
6734
|
0
|
|
|
|
|
|
} else croak("lm: Data must be an Array or Hash reference"); |
|
6735
|
|
|
|
|
|
|
/* PHASE 2: Formula Parsing & `.` Expansion */ |
|
6736
|
19
|
|
|
|
|
|
src = (char*)formula; dst = f_cpy; |
|
6737
|
215
|
100
|
|
|
|
|
while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; } |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
6738
|
19
|
|
|
|
|
|
*dst = '\0'; |
|
6739
|
|
|
|
|
|
|
|
|
6740
|
19
|
|
|
|
|
|
tilde = strchr(f_cpy, '~'); |
|
6741
|
19
|
100
|
|
|
|
|
if (!tilde) { |
|
6742
|
3
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) Safefree(row_names[i]); |
|
6743
|
1
|
50
|
|
|
|
|
Safefree(row_names); if (row_hashes) Safefree(row_hashes); |
|
6744
|
1
|
|
|
|
|
|
croak("lm: invalid formula, missing '~'"); |
|
6745
|
|
|
|
|
|
|
} |
|
6746
|
18
|
|
|
|
|
|
*tilde = '\0'; |
|
6747
|
18
|
|
|
|
|
|
lhs = f_cpy; |
|
6748
|
18
|
|
|
|
|
|
rhs = tilde + 1; |
|
6749
|
|
|
|
|
|
|
|
|
6750
|
|
|
|
|
|
|
// Remove intercept-suppression markers from RHS. |
|
6751
|
|
|
|
|
|
|
// IMPORTANT: skip tokens that appear inside I(...) wrappers so that |
|
6752
|
|
|
|
|
|
|
// expressions like I(x^-1) are never mistakenly treated as "-1". |
|
6753
|
|
|
|
|
|
|
{ |
|
6754
|
18
|
|
|
|
|
|
char *restrict p_idx = rhs; |
|
6755
|
89
|
100
|
|
|
|
|
while (*p_idx) { |
|
6756
|
|
|
|
|
|
|
// Skip over I(...) sub-expressions entirely |
|
6757
|
71
|
50
|
|
|
|
|
if (p_idx[0] == 'I' && p_idx[1] == '(') { |
|
|
|
0
|
|
|
|
|
|
|
6758
|
0
|
|
|
|
|
|
int depth = 0; |
|
6759
|
0
|
0
|
|
|
|
|
while (*p_idx) { if (*p_idx == '(') depth++; else if (*p_idx == ')') { depth--; if (depth == 0) { p_idx++; break; } } p_idx++; } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6760
|
0
|
|
|
|
|
|
continue; |
|
6761
|
|
|
|
|
|
|
} |
|
6762
|
|
|
|
|
|
|
// Match bare -1 |
|
6763
|
71
|
100
|
|
|
|
|
if (p_idx[0] == '-' && p_idx[1] == '1' && |
|
|
|
50
|
|
|
|
|
|
|
6764
|
1
|
50
|
|
|
|
|
(p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6765
|
1
|
|
|
|
|
|
has_intercept = FALSE; |
|
6766
|
1
|
|
|
|
|
|
memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); |
|
6767
|
1
|
|
|
|
|
|
continue; // re-examine same position |
|
6768
|
|
|
|
|
|
|
} |
|
6769
|
|
|
|
|
|
|
// Match +0 |
|
6770
|
70
|
100
|
|
|
|
|
if (p_idx[0] == '+' && p_idx[1] == '0' && |
|
|
|
50
|
|
|
|
|
|
|
6771
|
0
|
0
|
|
|
|
|
(p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6772
|
0
|
|
|
|
|
|
has_intercept = FALSE; |
|
6773
|
0
|
|
|
|
|
|
memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); |
|
6774
|
0
|
|
|
|
|
|
continue; |
|
6775
|
|
|
|
|
|
|
} |
|
6776
|
|
|
|
|
|
|
// Match leading 0+ |
|
6777
|
70
|
100
|
|
|
|
|
if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '+') { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6778
|
0
|
|
|
|
|
|
has_intercept = FALSE; |
|
6779
|
0
|
|
|
|
|
|
memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); |
|
6780
|
0
|
|
|
|
|
|
continue; |
|
6781
|
|
|
|
|
|
|
} |
|
6782
|
|
|
|
|
|
|
// Match bare 0 (entire rhs) |
|
6783
|
70
|
100
|
|
|
|
|
if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '\0') { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6784
|
0
|
|
|
|
|
|
has_intercept = FALSE; p_idx[0] = '\0'; break; |
|
6785
|
|
|
|
|
|
|
} |
|
6786
|
|
|
|
|
|
|
// Strip redundant +1 (keep intercept, just remove marker) |
|
6787
|
70
|
100
|
|
|
|
|
if (p_idx[0] == '+' && p_idx[1] == '1' && |
|
|
|
50
|
|
|
|
|
|
|
6788
|
0
|
0
|
|
|
|
|
(p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6789
|
0
|
|
|
|
|
|
memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); |
|
6790
|
0
|
|
|
|
|
|
continue; |
|
6791
|
|
|
|
|
|
|
} |
|
6792
|
|
|
|
|
|
|
// Strip leading bare 1 or 1+ |
|
6793
|
70
|
100
|
|
|
|
|
if (p_idx == rhs) { |
|
6794
|
18
|
50
|
|
|
|
|
if (p_idx[0] == '1' && p_idx[1] == '\0') { p_idx[0] = '\0'; break; } |
|
|
|
0
|
|
|
|
|
|
|
6795
|
18
|
50
|
|
|
|
|
if (p_idx[0] == '1' && p_idx[1] == '+') { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); continue; } |
|
|
|
0
|
|
|
|
|
|
|
6796
|
|
|
|
|
|
|
} |
|
6797
|
70
|
|
|
|
|
|
p_idx++; |
|
6798
|
|
|
|
|
|
|
} |
|
6799
|
|
|
|
|
|
|
} |
|
6800
|
|
|
|
|
|
|
// Clean up stray `++`, leading `+`, trailing `+` |
|
6801
|
|
|
|
|
|
|
{ |
|
6802
|
|
|
|
|
|
|
char *restrict p_idx; |
|
6803
|
18
|
50
|
|
|
|
|
while ((p_idx = strstr(rhs, "++")) != NULL) |
|
6804
|
0
|
|
|
|
|
|
memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1); |
|
6805
|
18
|
50
|
|
|
|
|
if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1); |
|
6806
|
18
|
|
|
|
|
|
size_t len_rhs = strlen(rhs); |
|
6807
|
18
|
50
|
|
|
|
|
if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0'; |
|
|
|
50
|
|
|
|
|
|
|
6808
|
|
|
|
|
|
|
} |
|
6809
|
|
|
|
|
|
|
|
|
6810
|
|
|
|
|
|
|
// Expand `.` Operator |
|
6811
|
18
|
|
|
|
|
|
char rhs_expanded[2048] = ""; |
|
6812
|
18
|
|
|
|
|
|
size_t rhs_len = 0; |
|
6813
|
18
|
|
|
|
|
|
chunk = strtok(rhs, "+"); |
|
6814
|
44
|
100
|
|
|
|
|
while (chunk != NULL) { |
|
6815
|
26
|
100
|
|
|
|
|
if (strcmp(chunk, ".") == 0) { |
|
6816
|
1
|
|
|
|
|
|
AV *restrict cols = get_all_columns(aTHX_ data_hoa, row_hashes, n); |
|
6817
|
4
|
100
|
|
|
|
|
for (size_t c = 0; c <= (size_t)av_len(cols); c++) { |
|
6818
|
3
|
|
|
|
|
|
SV **restrict col_sv = av_fetch(cols, c, 0); |
|
6819
|
3
|
50
|
|
|
|
|
if (col_sv && SvOK(*col_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
6820
|
3
|
|
|
|
|
|
const char *restrict col_name = SvPV_nolen(*col_sv); |
|
6821
|
3
|
100
|
|
|
|
|
if (strcmp(col_name, lhs) != 0) { |
|
6822
|
2
|
|
|
|
|
|
size_t slen = strlen(col_name); |
|
6823
|
2
|
50
|
|
|
|
|
if (rhs_len + slen + 2 < sizeof(rhs_expanded)) { |
|
6824
|
2
|
100
|
|
|
|
|
if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; } |
|
6825
|
2
|
|
|
|
|
|
strcat(rhs_expanded, col_name); |
|
6826
|
2
|
|
|
|
|
|
rhs_len += slen; |
|
6827
|
|
|
|
|
|
|
} |
|
6828
|
|
|
|
|
|
|
} |
|
6829
|
|
|
|
|
|
|
} |
|
6830
|
|
|
|
|
|
|
} |
|
6831
|
1
|
|
|
|
|
|
SvREFCNT_dec(cols); |
|
6832
|
|
|
|
|
|
|
} else { |
|
6833
|
25
|
|
|
|
|
|
size_t slen = strlen(chunk); |
|
6834
|
25
|
50
|
|
|
|
|
if (rhs_len + slen + 2 < sizeof(rhs_expanded)) { |
|
6835
|
25
|
100
|
|
|
|
|
if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; } |
|
6836
|
25
|
|
|
|
|
|
strcat(rhs_expanded, chunk); |
|
6837
|
25
|
|
|
|
|
|
rhs_len += slen; |
|
6838
|
|
|
|
|
|
|
} |
|
6839
|
|
|
|
|
|
|
} |
|
6840
|
26
|
|
|
|
|
|
chunk = strtok(NULL, "+"); |
|
6841
|
|
|
|
|
|
|
} |
|
6842
|
|
|
|
|
|
|
|
|
6843
|
18
|
|
|
|
|
|
Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*); |
|
6844
|
18
|
|
|
|
|
|
Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool); |
|
6845
|
18
|
|
|
|
|
|
Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*); |
|
6846
|
|
|
|
|
|
|
|
|
6847
|
18
|
100
|
|
|
|
|
if (has_intercept) { terms[num_terms++] = savepv("Intercept"); } |
|
6848
|
|
|
|
|
|
|
|
|
6849
|
18
|
50
|
|
|
|
|
if (strlen(rhs_expanded) > 0) { |
|
6850
|
18
|
|
|
|
|
|
chunk = strtok(rhs_expanded, "+"); |
|
6851
|
45
|
100
|
|
|
|
|
while (chunk != NULL) { |
|
6852
|
27
|
50
|
|
|
|
|
if (num_terms >= term_cap - 3) { |
|
6853
|
0
|
|
|
|
|
|
term_cap *= 2; |
|
6854
|
0
|
|
|
|
|
|
Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*); |
|
6855
|
|
|
|
|
|
|
} |
|
6856
|
27
|
|
|
|
|
|
char *restrict star = strchr(chunk, '*'); |
|
6857
|
27
|
100
|
|
|
|
|
if (star) { |
|
6858
|
1
|
|
|
|
|
|
*star = '\0'; |
|
6859
|
1
|
|
|
|
|
|
char *restrict left = chunk; |
|
6860
|
1
|
|
|
|
|
|
char *restrict right = star + 1; |
|
6861
|
1
|
|
|
|
|
|
char *restrict c_l = strchr(left, '^'); |
|
6862
|
1
|
50
|
|
|
|
|
if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
6863
|
1
|
|
|
|
|
|
char *restrict c_r = strchr(right, '^'); |
|
6864
|
1
|
50
|
|
|
|
|
if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0'; |
|
|
|
50
|
|
|
|
|
|
|
6865
|
1
|
|
|
|
|
|
terms[num_terms++] = savepv(left); |
|
6866
|
1
|
|
|
|
|
|
terms[num_terms++] = savepv(right); |
|
6867
|
1
|
|
|
|
|
|
size_t inter_len = strlen(left) + strlen(right) + 2; |
|
6868
|
1
|
|
|
|
|
|
terms[num_terms] = (char*)safemalloc(inter_len); |
|
6869
|
1
|
|
|
|
|
|
snprintf(terms[num_terms++], inter_len, "%s:%s", left, right); |
|
6870
|
|
|
|
|
|
|
} else { |
|
6871
|
26
|
|
|
|
|
|
char *restrict c_chunk = strchr(chunk, '^'); |
|
6872
|
26
|
50
|
|
|
|
|
if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
6873
|
26
|
|
|
|
|
|
terms[num_terms++] = savepv(chunk); |
|
6874
|
|
|
|
|
|
|
} |
|
6875
|
27
|
|
|
|
|
|
chunk = strtok(NULL, "+"); |
|
6876
|
|
|
|
|
|
|
} |
|
6877
|
|
|
|
|
|
|
} |
|
6878
|
|
|
|
|
|
|
|
|
6879
|
64
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) { |
|
6880
|
46
|
|
|
|
|
|
bool found = FALSE; |
|
6881
|
86
|
50
|
|
|
|
|
for (j = 0; j < num_uniq; j++) { if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; } } |
|
|
|
100
|
|
|
|
|
|
|
6882
|
46
|
50
|
|
|
|
|
if (!found) uniq_terms[num_uniq++] = savepv(terms[i]); |
|
6883
|
|
|
|
|
|
|
} |
|
6884
|
18
|
|
|
|
|
|
p = num_uniq; |
|
6885
|
|
|
|
|
|
|
/* PHASE 3: Categorical Expansion*/ |
|
6886
|
64
|
100
|
|
|
|
|
for (j = 0; j < p; j++) { |
|
6887
|
46
|
50
|
|
|
|
|
if (p_exp + 32 >= exp_cap) { |
|
6888
|
0
|
|
|
|
|
|
exp_cap *= 2; |
|
6889
|
0
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool); |
|
6890
|
0
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
6891
|
|
|
|
|
|
|
} |
|
6892
|
46
|
100
|
|
|
|
|
if (strcmp(uniq_terms[j], "Intercept") == 0) { |
|
6893
|
17
|
|
|
|
|
|
exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue; |
|
6894
|
|
|
|
|
|
|
} |
|
6895
|
29
|
100
|
|
|
|
|
if (is_column_categorical(aTHX_ data_hoa, row_hashes, n, uniq_terms[j])) { |
|
6896
|
5
|
|
|
|
|
|
char **restrict levels = NULL; |
|
6897
|
5
|
|
|
|
|
|
unsigned int num_levels = 0, levels_cap = 8; |
|
6898
|
5
|
|
|
|
|
|
Newx(levels, levels_cap, char*); |
|
6899
|
47
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
6900
|
42
|
|
|
|
|
|
char *restrict str_val = get_data_string_alloc(aTHX_ data_hoa, row_hashes, i, uniq_terms[j]); |
|
6901
|
42
|
50
|
|
|
|
|
if (str_val) { |
|
6902
|
42
|
|
|
|
|
|
bool found = FALSE; |
|
6903
|
81
|
100
|
|
|
|
|
for (l = 0; l < num_levels; l++) { if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; } } |
|
|
|
100
|
|
|
|
|
|
|
6904
|
42
|
100
|
|
|
|
|
if (!found) { |
|
6905
|
14
|
50
|
|
|
|
|
if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); } |
|
6906
|
14
|
|
|
|
|
|
levels[num_levels++] = savepv(str_val); |
|
6907
|
|
|
|
|
|
|
} |
|
6908
|
42
|
|
|
|
|
|
Safefree(str_val); |
|
6909
|
|
|
|
|
|
|
} |
|
6910
|
|
|
|
|
|
|
} |
|
6911
|
5
|
50
|
|
|
|
|
if (num_levels > 0) { |
|
6912
|
14
|
100
|
|
|
|
|
for (l1 = 0; l1 < num_levels - 1; l1++) |
|
6913
|
22
|
100
|
|
|
|
|
for (l2 = l1 + 1; l2 < num_levels; l2++) |
|
6914
|
13
|
100
|
|
|
|
|
if (strcmp(levels[l1], levels[l2]) > 0) { char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; } |
|
6915
|
14
|
100
|
|
|
|
|
for (l = 1; l < num_levels; l++) { |
|
6916
|
9
|
50
|
|
|
|
|
if (p_exp >= exp_cap) { |
|
6917
|
0
|
|
|
|
|
|
exp_cap *= 2; |
|
6918
|
0
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool); |
|
6919
|
0
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
6920
|
|
|
|
|
|
|
} |
|
6921
|
9
|
|
|
|
|
|
size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1; |
|
6922
|
9
|
|
|
|
|
|
exp_terms[p_exp] = (char*)safemalloc(t_len); |
|
6923
|
9
|
|
|
|
|
|
snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]); |
|
6924
|
9
|
|
|
|
|
|
is_dummy[p_exp] = TRUE; |
|
6925
|
9
|
|
|
|
|
|
dummy_base[p_exp] = savepv(uniq_terms[j]); |
|
6926
|
9
|
|
|
|
|
|
dummy_level[p_exp] = savepv(levels[l]); |
|
6927
|
9
|
|
|
|
|
|
p_exp++; |
|
6928
|
|
|
|
|
|
|
} |
|
6929
|
19
|
100
|
|
|
|
|
for (l = 0; l < num_levels; l++) Safefree(levels[l]); |
|
6930
|
5
|
|
|
|
|
|
Safefree(levels); |
|
6931
|
|
|
|
|
|
|
} else { |
|
6932
|
0
|
|
|
|
|
|
Safefree(levels); |
|
6933
|
0
|
|
|
|
|
|
exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++; |
|
6934
|
|
|
|
|
|
|
} |
|
6935
|
|
|
|
|
|
|
} else { |
|
6936
|
24
|
|
|
|
|
|
exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++; |
|
6937
|
|
|
|
|
|
|
} |
|
6938
|
|
|
|
|
|
|
} |
|
6939
|
18
|
|
|
|
|
|
p = p_exp; |
|
6940
|
18
|
50
|
|
|
|
|
Newx(X, n * p, NV); Newx(Y, n, NV); |
|
|
|
50
|
|
|
|
|
|
|
6941
|
18
|
50
|
|
|
|
|
Newx(valid_row_names, n, char*); |
|
6942
|
|
|
|
|
|
|
// |
|
6943
|
|
|
|
|
|
|
// PHASE 4: Matrix Construction & Listwise Deletion |
|
6944
|
|
|
|
|
|
|
// |
|
6945
|
310
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
6946
|
292
|
|
|
|
|
|
NV y_val = evaluate_term(aTHX_ data_hoa, row_hashes, i, lhs); |
|
6947
|
292
|
100
|
|
|
|
|
if (isnan(y_val)) { Safefree(row_names[i]); continue; } |
|
6948
|
|
|
|
|
|
|
|
|
6949
|
289
|
|
|
|
|
|
bool row_ok = TRUE; |
|
6950
|
289
|
|
|
|
|
|
NV *restrict row_x = (NV*)safemalloc(p * sizeof(NV)); |
|
6951
|
1112
|
100
|
|
|
|
|
for (j = 0; j < p; j++) { |
|
6952
|
823
|
100
|
|
|
|
|
if (strcmp(exp_terms[j], "Intercept") == 0) { |
|
6953
|
257
|
|
|
|
|
|
row_x[j] = 1.0; |
|
6954
|
566
|
100
|
|
|
|
|
} else if (is_dummy[j]) { |
|
6955
|
78
|
|
|
|
|
|
char *restrict str_val = get_data_string_alloc(aTHX_ data_hoa, row_hashes, i, dummy_base[j]); |
|
6956
|
78
|
50
|
|
|
|
|
if (str_val) { |
|
6957
|
78
|
100
|
|
|
|
|
row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0; |
|
6958
|
78
|
|
|
|
|
|
Safefree(str_val); |
|
6959
|
0
|
|
|
|
|
|
} else { row_ok = FALSE; break; } |
|
6960
|
|
|
|
|
|
|
} else { |
|
6961
|
488
|
|
|
|
|
|
row_x[j] = evaluate_term(aTHX_ data_hoa, row_hashes, i, exp_terms[j]); |
|
6962
|
488
|
50
|
|
|
|
|
if (isnan(row_x[j])) { row_ok = FALSE; break; } |
|
6963
|
|
|
|
|
|
|
} |
|
6964
|
|
|
|
|
|
|
} |
|
6965
|
289
|
50
|
|
|
|
|
if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; } |
|
6966
|
289
|
|
|
|
|
|
Y[valid_n] = y_val; |
|
6967
|
1112
|
100
|
|
|
|
|
for (j = 0; j < p; j++) X[valid_n * p + j] = row_x[j]; |
|
6968
|
289
|
|
|
|
|
|
valid_row_names[valid_n] = row_names[i]; |
|
6969
|
289
|
|
|
|
|
|
valid_n++; |
|
6970
|
289
|
|
|
|
|
|
Safefree(row_x); |
|
6971
|
|
|
|
|
|
|
} |
|
6972
|
18
|
|
|
|
|
|
Safefree(row_names); |
|
6973
|
18
|
100
|
|
|
|
|
if (valid_n <= p) { |
|
6974
|
7
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms); |
|
6975
|
7
|
100
|
|
|
|
|
for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms); |
|
6976
|
7
|
100
|
|
|
|
|
for (j = 0; j < p_exp; j++) { |
|
6977
|
5
|
|
|
|
|
|
Safefree(exp_terms[j]); |
|
6978
|
5
|
50
|
|
|
|
|
if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); } |
|
6979
|
|
|
|
|
|
|
} |
|
6980
|
2
|
|
|
|
|
|
Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level); |
|
6981
|
2
|
|
|
|
|
|
Safefree(X); Safefree(Y); Safefree(valid_row_names); |
|
6982
|
2
|
50
|
|
|
|
|
if (row_hashes) Safefree(row_hashes); |
|
6983
|
2
|
|
|
|
|
|
croak("lm: 0 degrees of freedom (too many NAs or parameters > observations)"); |
|
6984
|
|
|
|
|
|
|
} |
|
6985
|
|
|
|
|
|
|
// PHASE 5: OLS Math |
|
6986
|
16
|
|
|
|
|
|
Newxz(XtX, p * p, NV); |
|
6987
|
61
|
100
|
|
|
|
|
for (i = 0; i < p; i++) |
|
6988
|
178
|
100
|
|
|
|
|
for (j = 0; j < p; j++) { |
|
6989
|
133
|
|
|
|
|
|
NV sum = 0.0; |
|
6990
|
2620
|
100
|
|
|
|
|
for (k = 0; k < valid_n; k++) sum += X[k * p + i] * X[k * p + j]; |
|
6991
|
133
|
|
|
|
|
|
XtX[i * p + j] = sum; |
|
6992
|
|
|
|
|
|
|
} |
|
6993
|
16
|
|
|
|
|
|
Newxz(XtY, p, NV); |
|
6994
|
61
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { |
|
6995
|
45
|
|
|
|
|
|
NV sum = 0.0; |
|
6996
|
860
|
100
|
|
|
|
|
for (k = 0; k < valid_n; k++) sum += X[k * p + i] * Y[k]; |
|
6997
|
45
|
|
|
|
|
|
XtY[i] = sum; |
|
6998
|
|
|
|
|
|
|
} |
|
6999
|
16
|
|
|
|
|
|
Newx(aliased, p, bool); |
|
7000
|
16
|
|
|
|
|
|
final_rank = sweep_matrix_ols(XtX, p, aliased); |
|
7001
|
16
|
|
|
|
|
|
Newxz(beta, p, NV); |
|
7002
|
61
|
100
|
|
|
|
|
for (i = 0; i < p; i++) { |
|
7003
|
45
|
100
|
|
|
|
|
if (aliased[i]) { beta[i] = NAN; } |
|
7004
|
|
|
|
|
|
|
else { |
|
7005
|
44
|
|
|
|
|
|
NV sum = 0.0; |
|
7006
|
174
|
100
|
|
|
|
|
for (j = 0; j < p; j++) if (!aliased[j]) sum += XtX[i * p + j] * XtY[j]; |
|
|
|
100
|
|
|
|
|
|
|
7007
|
44
|
|
|
|
|
|
beta[i] = sum; |
|
7008
|
|
|
|
|
|
|
} |
|
7009
|
|
|
|
|
|
|
} |
|
7010
|
|
|
|
|
|
|
// PHASE 6: Metrics & Cleanup |
|
7011
|
16
|
|
|
|
|
|
res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV(); |
|
7012
|
16
|
|
|
|
|
|
summary_hv = newHV(); terms_av = newAV(); |
|
7013
|
16
|
|
|
|
|
|
df_res = (int)valid_n - final_rank; |
|
7014
|
|
|
|
|
|
|
// rss / mss accumulated here — rse_sq computed AFTER this loop (not before) |
|
7015
|
16
|
|
|
|
|
|
NV sum_y = 0.0, mss = 0.0; |
|
7016
|
302
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) sum_y += Y[i]; |
|
7017
|
16
|
|
|
|
|
|
NV mean_y = sum_y / (NV)valid_n; |
|
7018
|
302
|
100
|
|
|
|
|
for (i = 0; i < valid_n; i++) { |
|
7019
|
286
|
|
|
|
|
|
NV y_hat = 0.0; |
|
7020
|
1101
|
100
|
|
|
|
|
for (j = 0; j < p; j++) if (!aliased[j]) y_hat += X[i * p + j] * beta[j]; |
|
|
|
100
|
|
|
|
|
|
|
7021
|
286
|
|
|
|
|
|
NV res = Y[i] - y_hat; |
|
7022
|
286
|
|
|
|
|
|
rss += res * res; |
|
7023
|
286
|
100
|
|
|
|
|
NV diff_m = has_intercept ? (y_hat - mean_y) : y_hat; |
|
7024
|
286
|
|
|
|
|
|
mss += diff_m * diff_m; |
|
7025
|
286
|
|
|
|
|
|
hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(y_hat), 0); |
|
7026
|
286
|
|
|
|
|
|
hv_store(resid_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0); |
|
7027
|
286
|
|
|
|
|
|
Safefree(valid_row_names[i]); |
|
7028
|
|
|
|
|
|
|
} |
|
7029
|
16
|
|
|
|
|
|
Safefree(valid_row_names); |
|
7030
|
|
|
|
|
|
|
// Single, authoritative rse_sq calculation |
|
7031
|
16
|
50
|
|
|
|
|
rse_sq = (df_res > 0) ? (rss / (NV)df_res) : NAN; |
|
7032
|
|
|
|
|
|
|
|
|
7033
|
16
|
|
|
|
|
|
int df_int = has_intercept ? 1 : 0; |
|
7034
|
16
|
|
|
|
|
|
NV r_squared = 0.0, adj_r_squared = 0.0, f_stat = NAN, f_pvalue = NAN; |
|
7035
|
16
|
|
|
|
|
|
int numdf = final_rank - df_int; |
|
7036
|
|
|
|
|
|
|
|
|
7037
|
16
|
50
|
|
|
|
|
if (final_rank != df_int && (mss + rss) > 0.0) { |
|
|
|
50
|
|
|
|
|
|
|
7038
|
16
|
|
|
|
|
|
r_squared = mss / (mss + rss); |
|
7039
|
16
|
|
|
|
|
|
adj_r_squared = 1.0 - (1.0 - r_squared) * ((valid_n - df_int) / (NV)df_res); |
|
7040
|
16
|
50
|
|
|
|
|
if (rse_sq > 0.0 && numdf > 0) { |
|
|
|
50
|
|
|
|
|
|
|
7041
|
16
|
|
|
|
|
|
f_stat = (mss / (NV)numdf) / rse_sq; |
|
7042
|
16
|
|
|
|
|
|
f_pvalue = 1.0 - pf(f_stat, (NV)numdf, (NV)df_res); |
|
7043
|
0
|
0
|
|
|
|
|
} else if (rse_sq == 0.0) { |
|
7044
|
0
|
|
|
|
|
|
f_stat = INFINITY; |
|
7045
|
0
|
|
|
|
|
|
f_pvalue = 0.0; |
|
7046
|
|
|
|
|
|
|
} |
|
7047
|
0
|
0
|
|
|
|
|
} else if (final_rank == df_int) { |
|
7048
|
0
|
|
|
|
|
|
r_squared = 0.0; adj_r_squared = 0.0; |
|
7049
|
|
|
|
|
|
|
} |
|
7050
|
61
|
100
|
|
|
|
|
for (j = 0; j < p; j++) { |
|
7051
|
45
|
|
|
|
|
|
hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0); |
|
7052
|
45
|
|
|
|
|
|
av_push(terms_av, newSVpv(exp_terms[j], 0)); |
|
7053
|
45
|
|
|
|
|
|
HV *restrict row_hv = newHV(); |
|
7054
|
45
|
100
|
|
|
|
|
if (aliased[j]) { |
|
7055
|
1
|
|
|
|
|
|
hv_store(row_hv, "Estimate", 8, newSVpv("NaN", 0), 0); |
|
7056
|
1
|
|
|
|
|
|
hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0); |
|
7057
|
1
|
|
|
|
|
|
hv_store(row_hv, "t value", 7, newSVpv("NaN", 0), 0); |
|
7058
|
1
|
|
|
|
|
|
hv_store(row_hv, "Pr(>|t|)", 8, newSVpv("NaN", 0), 0); |
|
7059
|
|
|
|
|
|
|
} else { |
|
7060
|
44
|
|
|
|
|
|
NV se = sqrt(rse_sq * XtX[j * p + j]); |
|
7061
|
44
|
50
|
|
|
|
|
NV t_val = (se > 0.0) ? (beta[j] / se) : (INFINITY * (beta[j] >= 0.0 ? 1.0 : -1.0)); |
|
|
|
0
|
|
|
|
|
|
|
7062
|
44
|
|
|
|
|
|
NV p_val = get_t_pvalue(t_val, df_res, "two.sided"); |
|
7063
|
44
|
|
|
|
|
|
hv_store(row_hv, "Estimate", 8, newSVnv(beta[j]), 0); |
|
7064
|
44
|
|
|
|
|
|
hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0); |
|
7065
|
44
|
|
|
|
|
|
hv_store(row_hv, "t value", 7, newSVnv(t_val), 0); |
|
7066
|
44
|
|
|
|
|
|
hv_store(row_hv, "Pr(>|t|)", 8, newSVnv(p_val), 0); |
|
7067
|
|
|
|
|
|
|
} |
|
7068
|
45
|
|
|
|
|
|
hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0); |
|
7069
|
|
|
|
|
|
|
} |
|
7070
|
16
|
|
|
|
|
|
hv_store(res_hv, "coefficients", 12, newRV_noinc((SV*)coef_hv), 0); |
|
7071
|
16
|
|
|
|
|
|
hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0); |
|
7072
|
16
|
|
|
|
|
|
hv_store(res_hv, "residuals", 9, newRV_noinc((SV*)resid_hv), 0); |
|
7073
|
16
|
|
|
|
|
|
hv_store(res_hv, "df.residual", 11, newSVuv(df_res), 0); |
|
7074
|
16
|
|
|
|
|
|
hv_store(res_hv, "rank", 4, newSVuv(final_rank), 0); |
|
7075
|
16
|
|
|
|
|
|
hv_store(res_hv, "rss", 3, newSVnv(rss), 0); |
|
7076
|
16
|
|
|
|
|
|
hv_store(res_hv, "summary", 7, newRV_noinc((SV*)summary_hv),0); |
|
7077
|
16
|
|
|
|
|
|
hv_store(res_hv, "terms", 5, newRV_noinc((SV*)terms_av), 0); |
|
7078
|
16
|
|
|
|
|
|
hv_store(res_hv, "r.squared", 9, newSVnv(r_squared), 0); |
|
7079
|
16
|
|
|
|
|
|
hv_store(res_hv, "adj.r.squared", 13, newSVnv(adj_r_squared), 0); |
|
7080
|
16
|
50
|
|
|
|
|
if (!isnan(f_stat)) { |
|
7081
|
16
|
|
|
|
|
|
AV *fstat_av = newAV(); |
|
7082
|
16
|
|
|
|
|
|
av_push(fstat_av, newSVnv(f_stat)); |
|
7083
|
16
|
|
|
|
|
|
av_push(fstat_av, newSViv(numdf)); |
|
7084
|
16
|
|
|
|
|
|
av_push(fstat_av, newSViv(df_res)); |
|
7085
|
16
|
|
|
|
|
|
hv_store(res_hv, "fstatistic", 10, newRV_noinc((SV*)fstat_av), 0); |
|
7086
|
16
|
|
|
|
|
|
hv_store(res_hv, "f.pvalue", 8, newSVnv(f_pvalue), 0); |
|
7087
|
|
|
|
|
|
|
} |
|
7088
|
|
|
|
|
|
|
// Deep Cleanup |
|
7089
|
57
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms); |
|
7090
|
57
|
100
|
|
|
|
|
for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms); |
|
7091
|
61
|
100
|
|
|
|
|
for (j = 0; j < p_exp; j++) { |
|
7092
|
45
|
|
|
|
|
|
Safefree(exp_terms[j]); |
|
7093
|
45
|
100
|
|
|
|
|
if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); } |
|
7094
|
|
|
|
|
|
|
} |
|
7095
|
16
|
|
|
|
|
|
Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level); |
|
7096
|
16
|
|
|
|
|
|
Safefree(X); Safefree(Y); Safefree(XtX); Safefree(XtY); |
|
7097
|
16
|
|
|
|
|
|
Safefree(beta); Safefree(aliased); |
|
7098
|
16
|
100
|
|
|
|
|
if (row_hashes) Safefree(row_hashes); |
|
7099
|
|
|
|
|
|
|
|
|
7100
|
16
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res_hv); |
|
7101
|
|
|
|
|
|
|
} |
|
7102
|
|
|
|
|
|
|
OUTPUT: |
|
7103
|
|
|
|
|
|
|
RETVAL |
|
7104
|
|
|
|
|
|
|
|
|
7105
|
|
|
|
|
|
|
void seq(from, to, by = 1.0) |
|
7106
|
|
|
|
|
|
|
NV from |
|
7107
|
|
|
|
|
|
|
NV to |
|
7108
|
|
|
|
|
|
|
NV by |
|
7109
|
|
|
|
|
|
|
PPCODE: |
|
7110
|
|
|
|
|
|
|
{ |
|
7111
|
|
|
|
|
|
|
//Handle the zero 'by' case |
|
7112
|
6
|
50
|
|
|
|
|
if (by == 0.0) { |
|
7113
|
0
|
0
|
|
|
|
|
if (from == to) { |
|
7114
|
0
|
0
|
|
|
|
|
EXTEND(SP, 1); |
|
7115
|
0
|
|
|
|
|
|
mPUSHn(from); |
|
7116
|
0
|
|
|
|
|
|
XSRETURN(1); |
|
7117
|
|
|
|
|
|
|
} else { |
|
7118
|
0
|
|
|
|
|
|
croak("invalid 'by' argument: cannot be zero when from != to"); |
|
7119
|
|
|
|
|
|
|
} |
|
7120
|
|
|
|
|
|
|
} |
|
7121
|
|
|
|
|
|
|
// Check for wrong direction / infinite loop |
|
7122
|
6
|
100
|
|
|
|
|
if ((from < to && by < 0.0) || (from > to && by > 0.0)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7123
|
0
|
|
|
|
|
|
croak("wrong sign in 'by' argument"); |
|
7124
|
|
|
|
|
|
|
} |
|
7125
|
|
|
|
|
|
|
/* * Calculate number of elements. |
|
7126
|
|
|
|
|
|
|
* R uses a small epsilon (like 1e-10) to avoid dropping the last |
|
7127
|
|
|
|
|
|
|
* element due to floating point inaccuracies. |
|
7128
|
|
|
|
|
|
|
*/ |
|
7129
|
6
|
|
|
|
|
|
NV n_elements_d = (to - from) / by; |
|
7130
|
6
|
50
|
|
|
|
|
if (n_elements_d < 0.0) n_elements_d = 0.0; |
|
7131
|
6
|
|
|
|
|
|
size_t n_elements = (n_elements_d + 1e-10) + 1; |
|
7132
|
|
|
|
|
|
|
// Pre-extend the stack to avoid reallocating inside the loop |
|
7133
|
6
|
50
|
|
|
|
|
EXTEND(SP, n_elements); |
|
7134
|
3033
|
100
|
|
|
|
|
for (size_t i = 0; i < n_elements; i++) { |
|
7135
|
3027
|
|
|
|
|
|
mPUSHn(from + i * by); |
|
7136
|
|
|
|
|
|
|
} |
|
7137
|
6
|
|
|
|
|
|
XSRETURN(n_elements); |
|
7138
|
|
|
|
|
|
|
} |
|
7139
|
|
|
|
|
|
|
|
|
7140
|
|
|
|
|
|
|
SV* rnorm(...) |
|
7141
|
|
|
|
|
|
|
CODE: |
|
7142
|
|
|
|
|
|
|
{ |
|
7143
|
|
|
|
|
|
|
// Auto-seed the PRNG if the Perl script hasn't done so yet |
|
7144
|
2
|
100
|
|
|
|
|
AUTO_SEED_PRNG(); |
|
7145
|
2
|
|
|
|
|
|
size_t n = 0; |
|
7146
|
2
|
|
|
|
|
|
NV mean = 0.0, sd = 1.0; |
|
7147
|
2
|
|
|
|
|
|
int arg_start = 0; |
|
7148
|
|
|
|
|
|
|
// Check if the first argument is a simple integer (rnorm(33)) |
|
7149
|
2
|
50
|
|
|
|
|
if (items > 0 && SvIOK(ST(0)) && (items == 1 || items % 2 != 0)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7150
|
0
|
|
|
|
|
|
n = (unsigned int)SvUV(ST(0)); |
|
7151
|
0
|
|
|
|
|
|
arg_start = 1; // Start parsing named arguments from the second element |
|
7152
|
|
|
|
|
|
|
} |
|
7153
|
|
|
|
|
|
|
|
|
7154
|
|
|
|
|
|
|
// --- Parse remaining named arguments from the flat stack --- |
|
7155
|
2
|
50
|
|
|
|
|
if ((items - arg_start) % 2 != 0) { |
|
7156
|
0
|
|
|
|
|
|
croak("Usage: rnorm(n), rnorm(n => 10, mean => 0, sd => 1), or rnorm(33, mean => 0)"); |
|
7157
|
|
|
|
|
|
|
} |
|
7158
|
|
|
|
|
|
|
|
|
7159
|
7
|
100
|
|
|
|
|
for (int i = arg_start; i < items; i += 2) { |
|
7160
|
5
|
|
|
|
|
|
const char* restrict key = SvPV_nolen(ST(i)); |
|
7161
|
5
|
|
|
|
|
|
SV* restrict val = ST(i + 1); |
|
7162
|
|
|
|
|
|
|
|
|
7163
|
5
|
100
|
|
|
|
|
if (strEQ(key, "n")) n = (unsigned int)SvUV(val); |
|
7164
|
3
|
100
|
|
|
|
|
else if (strEQ(key, "mean")) mean = SvNV(val); |
|
7165
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "sd")) sd = SvNV(val); |
|
7166
|
0
|
|
|
|
|
|
else croak("rnorm: unknown argument '%s'", key); |
|
7167
|
|
|
|
|
|
|
} |
|
7168
|
2
|
100
|
|
|
|
|
if (sd < 0.0) croak("rnorm: standard deviation must be non-negative"); |
|
7169
|
1
|
|
|
|
|
|
AV *restrict result_av = newAV(); |
|
7170
|
1
|
50
|
|
|
|
|
if (n > 0) { |
|
7171
|
1
|
|
|
|
|
|
av_extend(result_av, n - 1); |
|
7172
|
|
|
|
|
|
|
// Generate random normals using the Box-Muller transform |
|
7173
|
5002
|
100
|
|
|
|
|
for (size_t i = 0; i < n; ) { |
|
7174
|
|
|
|
|
|
|
NV u, v, s; |
|
7175
|
|
|
|
|
|
|
do { |
|
7176
|
|
|
|
|
|
|
// Drand01() hooks into Perl's internal PRNG, respecting Perl's srand() |
|
7177
|
6357
|
|
|
|
|
|
u = 2.0 * Drand01() - 1.0; |
|
7178
|
6357
|
|
|
|
|
|
v = 2.0 * Drand01() - 1.0; |
|
7179
|
6357
|
|
|
|
|
|
s = u * u + v * v; |
|
7180
|
6357
|
100
|
|
|
|
|
} while (s >= 1.0 || s == 0.0); |
|
|
|
50
|
|
|
|
|
|
|
7181
|
5000
|
|
|
|
|
|
NV mul = sqrt(-2.0 * log(s) / s); |
|
7182
|
|
|
|
|
|
|
// Box-Muller generates two independent values per iteration |
|
7183
|
5000
|
|
|
|
|
|
av_store(result_av, i++, newSVnv(mean + sd * u * mul)); |
|
7184
|
5000
|
100
|
|
|
|
|
if (i < n) { |
|
7185
|
4999
|
|
|
|
|
|
av_store(result_av, i++, newSVnv(mean + sd * v * mul)); |
|
7186
|
|
|
|
|
|
|
} |
|
7187
|
|
|
|
|
|
|
} |
|
7188
|
|
|
|
|
|
|
} |
|
7189
|
1
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)result_av); |
|
7190
|
|
|
|
|
|
|
} |
|
7191
|
|
|
|
|
|
|
OUTPUT: |
|
7192
|
|
|
|
|
|
|
RETVAL |
|
7193
|
|
|
|
|
|
|
|
|
7194
|
|
|
|
|
|
|
SV* aov(data_sv, formula_sv = &PL_sv_undef) |
|
7195
|
|
|
|
|
|
|
SV* data_sv |
|
7196
|
|
|
|
|
|
|
SV* formula_sv |
|
7197
|
|
|
|
|
|
|
CODE: |
|
7198
|
|
|
|
|
|
|
{ |
|
7199
|
|
|
|
|
|
|
const char *restrict formula; |
|
7200
|
10
|
|
|
|
|
|
SV *restrict orig_data_sv = data_sv; |
|
7201
|
10
|
|
|
|
|
|
bool is_stacked = FALSE; |
|
7202
|
|
|
|
|
|
|
// |
|
7203
|
|
|
|
|
|
|
// PHASE 0: R-style stack() for missing formula |
|
7204
|
|
|
|
|
|
|
// |
|
7205
|
10
|
50
|
|
|
|
|
if (!formula_sv || !SvOK(formula_sv) || SvCUR(formula_sv) == 0) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7206
|
1
|
50
|
|
|
|
|
if (!SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
7207
|
0
|
|
|
|
|
|
croak("aov: Without a formula, data must be a HashRef of ArrayRefs (mimicking R's named list)"); |
|
7208
|
|
|
|
|
|
|
} |
|
7209
|
|
|
|
|
|
|
|
|
7210
|
1
|
|
|
|
|
|
is_stacked = TRUE; |
|
7211
|
1
|
|
|
|
|
|
HV *restrict input_hv = (HV*)SvRV(data_sv); |
|
7212
|
1
|
|
|
|
|
|
HV *restrict stacked_hv = newHV(); |
|
7213
|
1
|
|
|
|
|
|
AV *restrict val_av = newAV(); |
|
7214
|
1
|
|
|
|
|
|
AV *restrict grp_av = newAV(); |
|
7215
|
1
|
|
|
|
|
|
hv_iterinit(input_hv); |
|
7216
|
|
|
|
|
|
|
HE *restrict entry; |
|
7217
|
3
|
100
|
|
|
|
|
while ((entry = hv_iternext(input_hv))) { |
|
7218
|
2
|
|
|
|
|
|
SV *restrict grp_name_sv = hv_iterkeysv(entry); |
|
7219
|
2
|
|
|
|
|
|
SV *restrict arr_ref = hv_iterval(input_hv, entry); |
|
7220
|
4
|
50
|
|
|
|
|
if (SvROK(arr_ref) && SvTYPE(SvRV(arr_ref)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
7221
|
2
|
|
|
|
|
|
AV *restrict arr = (AV*)SvRV(arr_ref); |
|
7222
|
2
|
|
|
|
|
|
size_t len = av_len(arr); |
|
7223
|
14
|
100
|
|
|
|
|
for (size_t k = 0; k <= len; k++) { |
|
7224
|
12
|
|
|
|
|
|
SV **restrict v = av_fetch(arr, k, 0); |
|
7225
|
12
|
50
|
|
|
|
|
if (v && *v && SvOK(*v)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7226
|
12
|
|
|
|
|
|
av_push(val_av, newSVsv(*v)); |
|
7227
|
12
|
|
|
|
|
|
av_push(grp_av, newSVsv(grp_name_sv)); |
|
7228
|
|
|
|
|
|
|
} |
|
7229
|
|
|
|
|
|
|
} |
|
7230
|
|
|
|
|
|
|
} else { |
|
7231
|
0
|
|
|
|
|
|
SvREFCNT_dec(val_av); SvREFCNT_dec(grp_av); SvREFCNT_dec(stacked_hv); |
|
7232
|
0
|
|
|
|
|
|
croak("aov: Hash values must be ArrayRefs when no formula is provided"); |
|
7233
|
|
|
|
|
|
|
} |
|
7234
|
|
|
|
|
|
|
} |
|
7235
|
1
|
|
|
|
|
|
hv_stores(stacked_hv, "Value", newRV_noinc((SV*)val_av)); |
|
7236
|
1
|
|
|
|
|
|
hv_stores(stacked_hv, "Group", newRV_noinc((SV*)grp_av)); |
|
7237
|
|
|
|
|
|
|
// sv_2mortal ensures memory is freed automatically on return or croak |
|
7238
|
1
|
|
|
|
|
|
data_sv = sv_2mortal(newRV_noinc((SV*)stacked_hv)); |
|
7239
|
1
|
|
|
|
|
|
formula = "Value~Group"; |
|
7240
|
|
|
|
|
|
|
} else { |
|
7241
|
9
|
|
|
|
|
|
formula = SvPV_nolen(formula_sv); |
|
7242
|
|
|
|
|
|
|
} |
|
7243
|
|
|
|
|
|
|
char f_cpy[512]; |
|
7244
|
|
|
|
|
|
|
char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk; |
|
7245
|
10
|
|
|
|
|
|
char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL, **restrict parent_term = NULL; |
|
7246
|
10
|
|
|
|
|
|
bool *restrict is_dummy = NULL, *is_interact = NULL; |
|
7247
|
10
|
|
|
|
|
|
char **restrict dummy_base = NULL, **restrict dummy_level = NULL; |
|
7248
|
10
|
|
|
|
|
|
int *restrict term_map = NULL, *restrict left_idx = NULL, *restrict right_idx = NULL; |
|
7249
|
10
|
|
|
|
|
|
unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0; |
|
7250
|
10
|
|
|
|
|
|
size_t n = 0, valid_n = 0, i, j; |
|
7251
|
10
|
|
|
|
|
|
bool has_intercept = TRUE; |
|
7252
|
10
|
|
|
|
|
|
char **restrict row_names = NULL; |
|
7253
|
10
|
|
|
|
|
|
HV **restrict row_hashes = NULL; |
|
7254
|
10
|
|
|
|
|
|
HV *restrict data_hoa = NULL; |
|
7255
|
10
|
|
|
|
|
|
SV *restrict ref = NULL; |
|
7256
|
|
|
|
|
|
|
HE *restrict entry; |
|
7257
|
10
|
|
|
|
|
|
NV **restrict X_mat = NULL; |
|
7258
|
10
|
|
|
|
|
|
NV *restrict Y = NULL; |
|
7259
|
10
|
|
|
|
|
|
char **restrict term_base_level = NULL; /* reference level for each uniq_term (NULL if not categorical) */ |
|
7260
|
10
|
50
|
|
|
|
|
if (!SvROK(data_sv)) croak("aov: data is required and must be a reference"); |
|
7261
|
|
|
|
|
|
|
// |
|
7262
|
|
|
|
|
|
|
// PHASE 1: Data Extraction |
|
7263
|
|
|
|
|
|
|
// |
|
7264
|
10
|
|
|
|
|
|
ref = SvRV(data_sv); |
|
7265
|
10
|
50
|
|
|
|
|
if (SvTYPE(ref) == SVt_PVHV) { |
|
7266
|
10
|
|
|
|
|
|
HV*restrict hv = (HV*)ref; |
|
7267
|
10
|
50
|
|
|
|
|
if (hv_iterinit(hv) == 0) croak("aov: Data hash is empty"); |
|
7268
|
10
|
|
|
|
|
|
entry = hv_iternext(hv); |
|
7269
|
10
|
50
|
|
|
|
|
if (entry) { |
|
7270
|
10
|
|
|
|
|
|
SV*restrict val = hv_iterval(hv, entry); |
|
7271
|
10
|
50
|
|
|
|
|
if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
7272
|
10
|
|
|
|
|
|
data_hoa = hv; |
|
7273
|
10
|
|
|
|
|
|
n = av_len((AV*)SvRV(val)) + 1; |
|
7274
|
10
|
50
|
|
|
|
|
Newx(row_names, n, char*); |
|
7275
|
80
|
100
|
|
|
|
|
for(i = 0; i < n; i++) { |
|
7276
|
70
|
|
|
|
|
|
char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i+1)); |
|
7277
|
70
|
|
|
|
|
|
row_names[i] = savepv(buf); |
|
7278
|
|
|
|
|
|
|
} |
|
7279
|
0
|
0
|
|
|
|
|
} else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) { |
|
|
|
0
|
|
|
|
|
|
|
7280
|
0
|
|
|
|
|
|
n = hv_iterinit(hv); |
|
7281
|
0
|
0
|
|
|
|
|
Newx(row_names, n, char*); Newx(row_hashes, n, HV*); |
|
|
|
0
|
|
|
|
|
|
|
7282
|
0
|
|
|
|
|
|
i = 0; |
|
7283
|
0
|
0
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
7284
|
|
|
|
|
|
|
I32 len; |
|
7285
|
0
|
|
|
|
|
|
row_names[i] = savepv(hv_iterkey(entry, &len)); |
|
7286
|
0
|
|
|
|
|
|
row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry)); |
|
7287
|
0
|
|
|
|
|
|
i++; |
|
7288
|
|
|
|
|
|
|
} |
|
7289
|
0
|
|
|
|
|
|
} else croak("aov: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)"); |
|
7290
|
|
|
|
|
|
|
} |
|
7291
|
0
|
0
|
|
|
|
|
} else if (SvTYPE(ref) == SVt_PVAV) { |
|
7292
|
0
|
|
|
|
|
|
AV*restrict av = (AV*)ref; |
|
7293
|
0
|
|
|
|
|
|
n = av_len(av) + 1; |
|
7294
|
0
|
0
|
|
|
|
|
Newx(row_names, n, char*); |
|
7295
|
0
|
0
|
|
|
|
|
Newx(row_hashes, n, HV*); |
|
7296
|
0
|
0
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
7297
|
0
|
|
|
|
|
|
SV**restrict val = av_fetch(av, i, 0); |
|
7298
|
0
|
0
|
|
|
|
|
if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7299
|
0
|
|
|
|
|
|
row_hashes[i] = (HV*)SvRV(*val); |
|
7300
|
|
|
|
|
|
|
char buf[32]; |
|
7301
|
0
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1)); |
|
7302
|
0
|
|
|
|
|
|
row_names[i] = savepv(buf); |
|
7303
|
|
|
|
|
|
|
} else { |
|
7304
|
0
|
0
|
|
|
|
|
for (size_t k = 0; k < i; k++) Safefree(row_names[k]); |
|
7305
|
0
|
|
|
|
|
|
Safefree(row_names); Safefree(row_hashes); |
|
7306
|
0
|
|
|
|
|
|
croak("aov: Array values must be HashRefs (AoH)"); |
|
7307
|
|
|
|
|
|
|
} |
|
7308
|
|
|
|
|
|
|
} |
|
7309
|
0
|
|
|
|
|
|
} else croak("aov: Data must be an Array or Hash reference"); |
|
7310
|
|
|
|
|
|
|
// |
|
7311
|
|
|
|
|
|
|
// PHASE 2: Formula Parsing & `.` Expansion |
|
7312
|
|
|
|
|
|
|
// |
|
7313
|
10
|
|
|
|
|
|
src = (char*)formula; dst = f_cpy; |
|
7314
|
123
|
100
|
|
|
|
|
while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; } |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7315
|
10
|
|
|
|
|
|
*dst = '\0'; |
|
7316
|
10
|
|
|
|
|
|
tilde = strchr(f_cpy, '~'); |
|
7317
|
10
|
100
|
|
|
|
|
if (!tilde) { |
|
7318
|
3
|
100
|
|
|
|
|
for (i = 0; i < n; i++) Safefree(row_names[i]); |
|
7319
|
1
|
50
|
|
|
|
|
Safefree(row_names); if (row_hashes) Safefree(row_hashes); |
|
7320
|
1
|
|
|
|
|
|
croak("aov: invalid formula, missing '~'"); |
|
7321
|
|
|
|
|
|
|
} |
|
7322
|
9
|
|
|
|
|
|
*tilde = '\0'; |
|
7323
|
9
|
|
|
|
|
|
lhs = f_cpy; |
|
7324
|
9
|
|
|
|
|
|
rhs = tilde + 1; |
|
7325
|
|
|
|
|
|
|
char *restrict p_idx; |
|
7326
|
9
|
50
|
|
|
|
|
while ((p_idx = strstr(rhs, "-1")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); } |
|
7327
|
9
|
50
|
|
|
|
|
while ((p_idx = strstr(rhs, "+0")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); } |
|
7328
|
9
|
50
|
|
|
|
|
while ((p_idx = strstr(rhs, "0+")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); } |
|
7329
|
9
|
50
|
|
|
|
|
if (rhs[0] == '0' && rhs[1] == '\0') { has_intercept = FALSE; rhs[0] = '\0'; } |
|
|
|
0
|
|
|
|
|
|
|
7330
|
9
|
50
|
|
|
|
|
while ((p_idx = strstr(rhs, "+1")) != NULL) { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); } |
|
7331
|
9
|
50
|
|
|
|
|
if (rhs[0] == '1' && rhs[1] == '\0') { rhs[0] = '\0'; } |
|
|
|
0
|
|
|
|
|
|
|
7332
|
9
|
50
|
|
|
|
|
else if (rhs[0] == '1' && rhs[1] == '+') { memmove(rhs, rhs + 2, strlen(rhs + 2) + 1); } |
|
|
|
0
|
|
|
|
|
|
|
7333
|
|
|
|
|
|
|
|
|
7334
|
9
|
50
|
|
|
|
|
while ((p_idx = strstr(rhs, "++")) != NULL) memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1); |
|
7335
|
9
|
50
|
|
|
|
|
if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1); |
|
7336
|
9
|
|
|
|
|
|
size_t len_rhs = strlen(rhs); |
|
7337
|
9
|
50
|
|
|
|
|
if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0'; |
|
|
|
50
|
|
|
|
|
|
|
7338
|
9
|
|
|
|
|
|
char rhs_expanded[2048] = ""; |
|
7339
|
9
|
|
|
|
|
|
size_t rhs_len = 0; |
|
7340
|
9
|
|
|
|
|
|
chunk = strtok(rhs, "+"); |
|
7341
|
21
|
100
|
|
|
|
|
while (chunk != NULL) { |
|
7342
|
12
|
100
|
|
|
|
|
if (strcmp(chunk, ".") == 0) { |
|
7343
|
1
|
|
|
|
|
|
AV *restrict cols = get_all_columns(aTHX_ data_hoa, row_hashes, n); |
|
7344
|
4
|
100
|
|
|
|
|
for (size_t c = 0; c <= av_len(cols); c++) { |
|
7345
|
3
|
|
|
|
|
|
SV **restrict col_sv = av_fetch(cols, c, 0); |
|
7346
|
3
|
50
|
|
|
|
|
if (col_sv && SvOK(*col_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
7347
|
3
|
|
|
|
|
|
const char *restrict col_name = SvPV_nolen(*col_sv); |
|
7348
|
3
|
100
|
|
|
|
|
if (strcmp(col_name, lhs) != 0) { |
|
7349
|
2
|
|
|
|
|
|
size_t slen = strlen(col_name); |
|
7350
|
2
|
50
|
|
|
|
|
if (rhs_len + slen + 2 < sizeof(rhs_expanded)) { |
|
7351
|
2
|
100
|
|
|
|
|
if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; } |
|
7352
|
2
|
|
|
|
|
|
strcat(rhs_expanded, col_name); |
|
7353
|
2
|
|
|
|
|
|
rhs_len += slen; |
|
7354
|
|
|
|
|
|
|
} |
|
7355
|
|
|
|
|
|
|
} |
|
7356
|
|
|
|
|
|
|
} |
|
7357
|
|
|
|
|
|
|
} |
|
7358
|
1
|
|
|
|
|
|
SvREFCNT_dec(cols); |
|
7359
|
|
|
|
|
|
|
} else { |
|
7360
|
11
|
|
|
|
|
|
size_t slen = strlen(chunk); |
|
7361
|
11
|
50
|
|
|
|
|
if (rhs_len + slen + 2 < sizeof(rhs_expanded)) { |
|
7362
|
11
|
100
|
|
|
|
|
if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; } |
|
7363
|
11
|
|
|
|
|
|
strcat(rhs_expanded, chunk); |
|
7364
|
11
|
|
|
|
|
|
rhs_len += slen; |
|
7365
|
|
|
|
|
|
|
} |
|
7366
|
|
|
|
|
|
|
} |
|
7367
|
12
|
|
|
|
|
|
chunk = strtok(NULL, "+"); |
|
7368
|
|
|
|
|
|
|
} |
|
7369
|
|
|
|
|
|
|
// Setup arrays safely |
|
7370
|
9
|
|
|
|
|
|
Newx(terms, term_cap, char*); |
|
7371
|
9
|
|
|
|
|
|
Newx(uniq_terms, term_cap, char*); |
|
7372
|
9
|
|
|
|
|
|
Newx(exp_terms, exp_cap, char*); Newx(parent_term, exp_cap, char*); |
|
7373
|
9
|
|
|
|
|
|
Newx(is_dummy, exp_cap, bool); Newx(is_interact, exp_cap, bool); |
|
7374
|
9
|
|
|
|
|
|
Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*); |
|
7375
|
9
|
|
|
|
|
|
Newx(term_map, exp_cap, int); Newx(left_idx, exp_cap, int); Newx(right_idx, exp_cap, int); |
|
7376
|
9
|
50
|
|
|
|
|
if (has_intercept) { terms[num_terms++] = savepv("Intercept"); } |
|
7377
|
9
|
50
|
|
|
|
|
if (strlen(rhs_expanded) > 0) { |
|
7378
|
9
|
|
|
|
|
|
chunk = strtok(rhs_expanded, "+"); |
|
7379
|
22
|
100
|
|
|
|
|
while (chunk != NULL) { |
|
7380
|
13
|
50
|
|
|
|
|
if (num_terms >= term_cap - 3) { |
|
7381
|
0
|
|
|
|
|
|
term_cap *= 2; |
|
7382
|
0
|
|
|
|
|
|
Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*); |
|
7383
|
|
|
|
|
|
|
} |
|
7384
|
13
|
|
|
|
|
|
char *restrict star = strchr(chunk, '*'); |
|
7385
|
13
|
100
|
|
|
|
|
if (star) { |
|
7386
|
1
|
|
|
|
|
|
*star = '\0'; |
|
7387
|
1
|
|
|
|
|
|
char *restrict left = chunk; |
|
7388
|
1
|
|
|
|
|
|
char *restrict right = star + 1; |
|
7389
|
1
|
|
|
|
|
|
char *restrict c_l = strchr(left, '^'); |
|
7390
|
1
|
50
|
|
|
|
|
if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
7391
|
1
|
50
|
|
|
|
|
char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
7392
|
1
|
|
|
|
|
|
terms[num_terms++] = savepv(left); |
|
7393
|
1
|
|
|
|
|
|
terms[num_terms++] = savepv(right); |
|
7394
|
1
|
|
|
|
|
|
size_t inter_len = strlen(left) + strlen(right) + 2; |
|
7395
|
1
|
|
|
|
|
|
terms[num_terms] = (char*)safemalloc(inter_len); |
|
7396
|
1
|
|
|
|
|
|
snprintf(terms[num_terms++], inter_len, "%s:%s", left, right); |
|
7397
|
|
|
|
|
|
|
} else { |
|
7398
|
12
|
|
|
|
|
|
char *restrict c_chunk = strchr(chunk, '^'); |
|
7399
|
12
|
50
|
|
|
|
|
if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0'; |
|
|
|
0
|
|
|
|
|
|
|
7400
|
12
|
|
|
|
|
|
terms[num_terms++] = savepv(chunk); |
|
7401
|
|
|
|
|
|
|
} |
|
7402
|
13
|
|
|
|
|
|
chunk = strtok(NULL, "+"); |
|
7403
|
|
|
|
|
|
|
} |
|
7404
|
|
|
|
|
|
|
} |
|
7405
|
|
|
|
|
|
|
|
|
7406
|
33
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) { |
|
7407
|
24
|
|
|
|
|
|
bool found = FALSE; |
|
7408
|
46
|
100
|
|
|
|
|
for (size_t k = 0; k < num_uniq; k++) { |
|
7409
|
22
|
50
|
|
|
|
|
if (strcmp(terms[i], uniq_terms[k]) == 0) { found = TRUE; break; } |
|
7410
|
|
|
|
|
|
|
} |
|
7411
|
24
|
50
|
|
|
|
|
if (!found) uniq_terms[num_uniq++] = savepv(terms[i]); |
|
7412
|
|
|
|
|
|
|
} |
|
7413
|
9
|
|
|
|
|
|
p = num_uniq; |
|
7414
|
|
|
|
|
|
|
|
|
7415
|
9
|
|
|
|
|
|
Newxz(term_base_level, num_uniq, char*); |
|
7416
|
|
|
|
|
|
|
|
|
7417
|
|
|
|
|
|
|
/* PHASE 3: Categorical & Interaction Expansion */ |
|
7418
|
32
|
100
|
|
|
|
|
for (j = 0; j < p; j++) { |
|
7419
|
24
|
100
|
|
|
|
|
if (p_exp + 64 >= exp_cap) { |
|
7420
|
9
|
|
|
|
|
|
exp_cap *= 2; |
|
7421
|
9
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*); |
|
7422
|
9
|
|
|
|
|
|
Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool); |
|
7423
|
9
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
7424
|
9
|
|
|
|
|
|
Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int); |
|
7425
|
|
|
|
|
|
|
} |
|
7426
|
|
|
|
|
|
|
|
|
7427
|
24
|
100
|
|
|
|
|
if (strcmp(uniq_terms[j], "Intercept") == 0) { |
|
7428
|
9
|
|
|
|
|
|
exp_terms[p_exp] = savepv("Intercept"); |
|
7429
|
9
|
|
|
|
|
|
parent_term[p_exp] = savepv("Intercept"); |
|
7430
|
9
|
|
|
|
|
|
is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE; |
|
7431
|
9
|
|
|
|
|
|
term_map[p_exp] = j; |
|
7432
|
9
|
|
|
|
|
|
p_exp++; |
|
7433
|
9
|
|
|
|
|
|
continue; |
|
7434
|
|
|
|
|
|
|
} |
|
7435
|
|
|
|
|
|
|
|
|
7436
|
15
|
|
|
|
|
|
char *restrict colon = strchr(uniq_terms[j], ':'); |
|
7437
|
15
|
100
|
|
|
|
|
if (colon) { |
|
7438
|
|
|
|
|
|
|
char left[256], right[256]; |
|
7439
|
2
|
|
|
|
|
|
strncpy(left, uniq_terms[j], colon - uniq_terms[j]); |
|
7440
|
2
|
|
|
|
|
|
left[colon - uniq_terms[j]] = '\0'; |
|
7441
|
2
|
|
|
|
|
|
strcpy(right, colon + 1); |
|
7442
|
|
|
|
|
|
|
|
|
7443
|
2
|
|
|
|
|
|
int *restrict l_indices = (int*)safemalloc(p_exp * sizeof(int)); int l_count = 0; |
|
7444
|
2
|
|
|
|
|
|
int *restrict r_indices = (int*)safemalloc(p_exp * sizeof(int)); int r_count = 0; |
|
7445
|
6
|
100
|
|
|
|
|
for (size_t e = 0; e < p_exp; e++) { |
|
7446
|
4
|
100
|
|
|
|
|
if (strcmp(parent_term[e], left) == 0) l_indices[l_count++] = e; |
|
7447
|
4
|
100
|
|
|
|
|
if (strcmp(parent_term[e], right) == 0) r_indices[r_count++] = e; |
|
7448
|
|
|
|
|
|
|
} |
|
7449
|
|
|
|
|
|
|
|
|
7450
|
2
|
100
|
|
|
|
|
if (l_count == 0 || r_count == 0) { |
|
|
|
50
|
|
|
|
|
|
|
7451
|
1
|
|
|
|
|
|
Safefree(l_indices); Safefree(r_indices); |
|
7452
|
1
|
|
|
|
|
|
croak("aov: Interaction term '%s' requires its main effects to be explicitly included in the formula", uniq_terms[j]); |
|
7453
|
|
|
|
|
|
|
} else { |
|
7454
|
2
|
100
|
|
|
|
|
for (unsigned int li = 0; li < l_count; li++) { |
|
7455
|
2
|
100
|
|
|
|
|
for (unsigned int ri = 0; ri < r_count; ri++) { |
|
7456
|
1
|
50
|
|
|
|
|
if (p_exp >= exp_cap) { |
|
7457
|
0
|
|
|
|
|
|
exp_cap *= 2; |
|
7458
|
0
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*); |
|
7459
|
0
|
|
|
|
|
|
Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool); |
|
7460
|
0
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
7461
|
0
|
|
|
|
|
|
Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int); |
|
7462
|
|
|
|
|
|
|
} |
|
7463
|
1
|
|
|
|
|
|
size_t t_len = strlen(exp_terms[l_indices[li]]) + strlen(exp_terms[r_indices[ri]]) + 2; |
|
7464
|
1
|
|
|
|
|
|
exp_terms[p_exp] = (char*)safemalloc(t_len); |
|
7465
|
1
|
|
|
|
|
|
snprintf(exp_terms[p_exp], t_len, "%s:%s", exp_terms[l_indices[li]], exp_terms[r_indices[ri]]); |
|
7466
|
1
|
|
|
|
|
|
parent_term[p_exp] = savepv(uniq_terms[j]); |
|
7467
|
1
|
|
|
|
|
|
is_dummy[p_exp] = FALSE; is_interact[p_exp] = TRUE; |
|
7468
|
1
|
|
|
|
|
|
left_idx[p_exp] = l_indices[li]; |
|
7469
|
1
|
|
|
|
|
|
right_idx[p_exp] = r_indices[ri]; |
|
7470
|
1
|
|
|
|
|
|
term_map[p_exp] = j; |
|
7471
|
1
|
|
|
|
|
|
p_exp++; |
|
7472
|
|
|
|
|
|
|
} |
|
7473
|
|
|
|
|
|
|
} |
|
7474
|
|
|
|
|
|
|
} |
|
7475
|
1
|
|
|
|
|
|
Safefree(l_indices); Safefree(r_indices); |
|
7476
|
|
|
|
|
|
|
} else { |
|
7477
|
13
|
100
|
|
|
|
|
if (is_column_categorical(aTHX_ data_hoa, row_hashes, n, uniq_terms[j])) { |
|
7478
|
4
|
|
|
|
|
|
char **restrict levels = NULL; |
|
7479
|
4
|
|
|
|
|
|
unsigned int num_levels = 0, levels_cap = 8; |
|
7480
|
4
|
|
|
|
|
|
Newx(levels, levels_cap, char*); |
|
7481
|
65
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
7482
|
61
|
|
|
|
|
|
char*restrict str_val = get_data_string_alloc(aTHX_ data_hoa, row_hashes, i, uniq_terms[j]); |
|
7483
|
61
|
50
|
|
|
|
|
if (str_val) { |
|
7484
|
61
|
|
|
|
|
|
bool found = FALSE; |
|
7485
|
96
|
100
|
|
|
|
|
for (size_t l = 0; l < num_levels; l++) { |
|
7486
|
87
|
100
|
|
|
|
|
if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; } |
|
7487
|
|
|
|
|
|
|
} |
|
7488
|
61
|
100
|
|
|
|
|
if (!found) { |
|
7489
|
9
|
50
|
|
|
|
|
if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); } |
|
7490
|
9
|
|
|
|
|
|
levels[num_levels++] = savepv(str_val); |
|
7491
|
|
|
|
|
|
|
} |
|
7492
|
61
|
|
|
|
|
|
Safefree(str_val); |
|
7493
|
|
|
|
|
|
|
} |
|
7494
|
|
|
|
|
|
|
} |
|
7495
|
4
|
50
|
|
|
|
|
if (num_levels > 0) { |
|
7496
|
9
|
100
|
|
|
|
|
for (size_t l1 = 0; l1 < num_levels - 1; l1++) { |
|
7497
|
11
|
100
|
|
|
|
|
for (size_t l2 = l1 + 1; l2 < num_levels; l2++) { |
|
7498
|
6
|
100
|
|
|
|
|
if (strcmp(levels[l1], levels[l2]) > 0) { |
|
7499
|
1
|
|
|
|
|
|
char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; |
|
7500
|
|
|
|
|
|
|
} |
|
7501
|
|
|
|
|
|
|
} |
|
7502
|
|
|
|
|
|
|
} |
|
7503
|
|
|
|
|
|
|
|
|
7504
|
4
|
|
|
|
|
|
term_base_level[j] = savepv(levels[0]); |
|
7505
|
|
|
|
|
|
|
|
|
7506
|
9
|
100
|
|
|
|
|
for (size_t l = 1; l < num_levels; l++) { |
|
7507
|
5
|
50
|
|
|
|
|
if (p_exp >= exp_cap) { |
|
7508
|
0
|
|
|
|
|
|
exp_cap *= 2; |
|
7509
|
0
|
|
|
|
|
|
Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*); |
|
7510
|
0
|
|
|
|
|
|
Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool); |
|
7511
|
0
|
|
|
|
|
|
Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*); |
|
7512
|
0
|
|
|
|
|
|
Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int); |
|
7513
|
|
|
|
|
|
|
} |
|
7514
|
5
|
|
|
|
|
|
size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1; |
|
7515
|
5
|
|
|
|
|
|
exp_terms[p_exp] = (char*)safemalloc(t_len); |
|
7516
|
5
|
|
|
|
|
|
snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]); |
|
7517
|
5
|
|
|
|
|
|
parent_term[p_exp] = savepv(uniq_terms[j]); |
|
7518
|
5
|
|
|
|
|
|
is_dummy[p_exp] = TRUE; is_interact[p_exp] = FALSE; |
|
7519
|
5
|
|
|
|
|
|
dummy_base[p_exp] = savepv(uniq_terms[j]); |
|
7520
|
5
|
|
|
|
|
|
dummy_level[p_exp] = savepv(levels[l]); |
|
7521
|
5
|
|
|
|
|
|
term_map[p_exp] = j; |
|
7522
|
5
|
|
|
|
|
|
p_exp++; |
|
7523
|
|
|
|
|
|
|
} |
|
7524
|
13
|
100
|
|
|
|
|
for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]); |
|
7525
|
4
|
|
|
|
|
|
Safefree(levels); |
|
7526
|
|
|
|
|
|
|
} else { |
|
7527
|
0
|
|
|
|
|
|
Safefree(levels); |
|
7528
|
0
|
|
|
|
|
|
exp_terms[p_exp] = savepv(uniq_terms[j]); |
|
7529
|
0
|
|
|
|
|
|
parent_term[p_exp] = savepv(uniq_terms[j]); |
|
7530
|
0
|
|
|
|
|
|
is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE; |
|
7531
|
0
|
|
|
|
|
|
term_map[p_exp] = j; |
|
7532
|
0
|
|
|
|
|
|
p_exp++; |
|
7533
|
|
|
|
|
|
|
} |
|
7534
|
|
|
|
|
|
|
} else { |
|
7535
|
9
|
|
|
|
|
|
exp_terms[p_exp] = savepv(uniq_terms[j]); |
|
7536
|
9
|
|
|
|
|
|
parent_term[p_exp] = savepv(uniq_terms[j]); |
|
7537
|
9
|
|
|
|
|
|
is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE; |
|
7538
|
9
|
|
|
|
|
|
term_map[p_exp] = j; |
|
7539
|
9
|
|
|
|
|
|
p_exp++; |
|
7540
|
|
|
|
|
|
|
} |
|
7541
|
|
|
|
|
|
|
} |
|
7542
|
|
|
|
|
|
|
} |
|
7543
|
8
|
|
|
|
|
|
X_mat = (double**)safemalloc(n * sizeof(double*)); |
|
7544
|
72
|
100
|
|
|
|
|
for(i = 0; i < n; i++) X_mat[i] = (double*)safemalloc(p_exp * sizeof(double)); |
|
7545
|
8
|
50
|
|
|
|
|
Newx(Y, n, double); |
|
7546
|
|
|
|
|
|
|
// PHASE 4: Matrix Construction & Listwise Deletion |
|
7547
|
72
|
100
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
7548
|
64
|
|
|
|
|
|
NV y_val = evaluate_term(aTHX_ data_hoa, row_hashes, i, lhs); |
|
7549
|
64
|
50
|
|
|
|
|
if (isnan(y_val)) { Safefree(row_names[i]); continue; } |
|
7550
|
64
|
|
|
|
|
|
bool row_ok = TRUE; |
|
7551
|
64
|
|
|
|
|
|
NV *restrict row_x = (NV*)safemalloc(p_exp * sizeof(NV)); |
|
7552
|
258
|
100
|
|
|
|
|
for (j = 0; j < p_exp; j++) { |
|
7553
|
194
|
100
|
|
|
|
|
if (strcmp(exp_terms[j], "Intercept") == 0) { |
|
7554
|
64
|
|
|
|
|
|
row_x[j] = 1.0; |
|
7555
|
130
|
100
|
|
|
|
|
} else if (is_interact[j]) { |
|
7556
|
20
|
|
|
|
|
|
row_x[j] = row_x[left_idx[j]] * row_x[right_idx[j]]; |
|
7557
|
110
|
100
|
|
|
|
|
} else if (is_dummy[j]) { |
|
7558
|
70
|
|
|
|
|
|
char*restrict str_val = get_data_string_alloc(aTHX_ data_hoa, row_hashes, i, dummy_base[j]); |
|
7559
|
70
|
50
|
|
|
|
|
if (str_val) { |
|
7560
|
70
|
100
|
|
|
|
|
row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0; |
|
7561
|
70
|
|
|
|
|
|
Safefree(str_val); |
|
7562
|
0
|
|
|
|
|
|
} else { row_ok = FALSE; break; } |
|
7563
|
|
|
|
|
|
|
} else { |
|
7564
|
40
|
|
|
|
|
|
row_x[j] = evaluate_term(aTHX_ data_hoa, row_hashes, i, parent_term[j]); |
|
7565
|
40
|
50
|
|
|
|
|
if (isnan(row_x[j])) { row_ok = FALSE; break; } |
|
7566
|
|
|
|
|
|
|
} |
|
7567
|
|
|
|
|
|
|
} |
|
7568
|
64
|
50
|
|
|
|
|
if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; } |
|
7569
|
64
|
|
|
|
|
|
Y[valid_n] = y_val; |
|
7570
|
258
|
100
|
|
|
|
|
for (j = 0; j < p_exp; j++) X_mat[valid_n][j] = row_x[j]; |
|
7571
|
64
|
|
|
|
|
|
valid_n++; |
|
7572
|
64
|
|
|
|
|
|
Safefree(row_x); |
|
7573
|
64
|
|
|
|
|
|
Safefree(row_names[i]); |
|
7574
|
|
|
|
|
|
|
} |
|
7575
|
8
|
|
|
|
|
|
Safefree(row_names); |
|
7576
|
8
|
100
|
|
|
|
|
if (valid_n <= p_exp) { |
|
7577
|
|
|
|
|
|
|
// Full Clean Up |
|
7578
|
4
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms); |
|
7579
|
4
|
100
|
|
|
|
|
for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms); |
|
7580
|
4
|
100
|
|
|
|
|
for (j = 0; j < p_exp; j++) { |
|
7581
|
3
|
|
|
|
|
|
Safefree(exp_terms[j]); Safefree(parent_term[j]); |
|
7582
|
3
|
50
|
|
|
|
|
if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); } |
|
7583
|
|
|
|
|
|
|
} |
|
7584
|
1
|
|
|
|
|
|
Safefree(exp_terms); Safefree(parent_term); |
|
7585
|
1
|
|
|
|
|
|
Safefree(is_dummy); Safefree(is_interact); |
|
7586
|
1
|
|
|
|
|
|
Safefree(dummy_base); Safefree(dummy_level); |
|
7587
|
1
|
|
|
|
|
|
Safefree(term_map); Safefree(left_idx); Safefree(right_idx); |
|
7588
|
3
|
100
|
|
|
|
|
for(i = 0; i < n; i++) Safefree(X_mat[i]); |
|
7589
|
1
|
|
|
|
|
|
Safefree(X_mat); Safefree(Y); |
|
7590
|
1
|
50
|
|
|
|
|
if (row_hashes) Safefree(row_hashes); |
|
7591
|
4
|
50
|
|
|
|
|
for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); } |
|
|
|
100
|
|
|
|
|
|
|
7592
|
1
|
|
|
|
|
|
Safefree(term_base_level); |
|
7593
|
1
|
|
|
|
|
|
croak("aov: 0 degrees of freedom (too many NAs or parameters > observations)"); |
|
7594
|
|
|
|
|
|
|
} |
|
7595
|
|
|
|
|
|
|
// PHASE 5: Math & Output Formatting |
|
7596
|
7
|
|
|
|
|
|
bool *restrict aliased_qr = (bool*)safemalloc(p_exp * sizeof(bool)); |
|
7597
|
7
|
|
|
|
|
|
size_t *restrict rank_map = (size_t*)safemalloc(p_exp * sizeof(size_t)); |
|
7598
|
7
|
|
|
|
|
|
apply_householder_aov(X_mat, Y, valid_n, p_exp, aliased_qr, rank_map); |
|
7599
|
|
|
|
|
|
|
NV *restrict term_ss; |
|
7600
|
|
|
|
|
|
|
int *restrict term_df; |
|
7601
|
7
|
|
|
|
|
|
Newxz(term_ss, num_uniq, NV); |
|
7602
|
7
|
|
|
|
|
|
Newxz(term_df, num_uniq, int); |
|
7603
|
27
|
100
|
|
|
|
|
for (i = 0; i < p_exp; i++) { |
|
7604
|
20
|
100
|
|
|
|
|
if (strcmp(exp_terms[i], "Intercept") == 0) continue; |
|
7605
|
13
|
100
|
|
|
|
|
if (aliased_qr[i]) continue; |
|
7606
|
12
|
|
|
|
|
|
int t_idx = term_map[i]; |
|
7607
|
12
|
|
|
|
|
|
size_t r_k = rank_map[i]; |
|
7608
|
12
|
|
|
|
|
|
term_ss[t_idx] += Y[r_k] * Y[r_k]; |
|
7609
|
12
|
|
|
|
|
|
term_df[t_idx] += 1; |
|
7610
|
|
|
|
|
|
|
} |
|
7611
|
7
|
|
|
|
|
|
int rank = 0; |
|
7612
|
27
|
100
|
|
|
|
|
for (i = 0; i < p_exp; i++) { |
|
7613
|
20
|
100
|
|
|
|
|
if (!aliased_qr[i]) rank++; |
|
7614
|
|
|
|
|
|
|
} |
|
7615
|
7
|
|
|
|
|
|
NV rss_prev = 0.0; |
|
7616
|
50
|
100
|
|
|
|
|
for (i = rank; i < valid_n; i++) { |
|
7617
|
43
|
|
|
|
|
|
rss_prev += Y[i] * Y[i]; |
|
7618
|
|
|
|
|
|
|
} |
|
7619
|
7
|
|
|
|
|
|
int res_df = valid_n - rank; |
|
7620
|
7
|
50
|
|
|
|
|
NV ms_res = (res_df > 0) ? rss_prev / res_df : 0.0; |
|
7621
|
7
|
|
|
|
|
|
HV*restrict ret_hash = newHV(); |
|
7622
|
26
|
100
|
|
|
|
|
for (j = 0; j < num_uniq; j++) { |
|
7623
|
19
|
100
|
|
|
|
|
if (strcmp(uniq_terms[j], "Intercept") == 0) continue; |
|
7624
|
12
|
|
|
|
|
|
HV*restrict term_stats = newHV(); |
|
7625
|
12
|
|
|
|
|
|
NV ss = term_ss[j]; |
|
7626
|
12
|
|
|
|
|
|
int df = term_df[j]; |
|
7627
|
12
|
100
|
|
|
|
|
NV ms = (df > 0) ? ss / df : 0.0; |
|
7628
|
|
|
|
|
|
|
|
|
7629
|
12
|
|
|
|
|
|
hv_stores(term_stats, "Df", newSViv(df)); |
|
7630
|
12
|
|
|
|
|
|
hv_stores(term_stats, "Sum Sq", newSVnv(ss)); |
|
7631
|
12
|
|
|
|
|
|
hv_stores(term_stats, "Mean Sq", newSVnv(ms)); |
|
7632
|
23
|
50
|
|
|
|
|
if (ms_res > 0.0 && df > 0) { |
|
|
|
100
|
|
|
|
|
|
|
7633
|
11
|
|
|
|
|
|
NV f_val = ms / ms_res; |
|
7634
|
11
|
|
|
|
|
|
hv_stores(term_stats, "F value", newSVnv(f_val)); |
|
7635
|
11
|
|
|
|
|
|
hv_stores(term_stats, "Pr(>F)", newSVnv(1.0 - pf(f_val, (NV)df, (NV)res_df))); |
|
7636
|
|
|
|
|
|
|
} else { |
|
7637
|
1
|
|
|
|
|
|
hv_stores(term_stats, "F value", newSVnv(NAN)); |
|
7638
|
1
|
|
|
|
|
|
hv_stores(term_stats, "Pr(>F)", newSVnv(NAN)); |
|
7639
|
|
|
|
|
|
|
} |
|
7640
|
12
|
|
|
|
|
|
hv_store(ret_hash, uniq_terms[j], strlen(uniq_terms[j]), newRV_noinc((SV*)term_stats), 0); |
|
7641
|
|
|
|
|
|
|
} |
|
7642
|
7
|
|
|
|
|
|
HV*restrict res_stats = newHV(); |
|
7643
|
7
|
|
|
|
|
|
hv_stores(res_stats, "Df", newSViv(res_df)); |
|
7644
|
7
|
|
|
|
|
|
hv_stores(res_stats, "Sum Sq", newSVnv(rss_prev)); |
|
7645
|
7
|
|
|
|
|
|
hv_stores(res_stats, "Mean Sq", newSVnv(ms_res)); |
|
7646
|
7
|
|
|
|
|
|
hv_stores(ret_hash, "Residuals", newRV_noinc((SV*)res_stats)); |
|
7647
|
|
|
|
|
|
|
{ |
|
7648
|
7
|
|
|
|
|
|
HV *restrict tgt_hoa = data_hoa; |
|
7649
|
7
|
|
|
|
|
|
HV **restrict tgt_row_hashes = row_hashes; |
|
7650
|
7
|
|
|
|
|
|
size_t tgt_n = n; |
|
7651
|
|
|
|
|
|
|
// Route evaluation to the original unstacked HoA when a formula was implied |
|
7652
|
7
|
100
|
|
|
|
|
if (is_stacked) { |
|
7653
|
1
|
|
|
|
|
|
tgt_hoa = (HV*)SvRV(orig_data_sv); |
|
7654
|
1
|
|
|
|
|
|
tgt_row_hashes = NULL; |
|
7655
|
1
|
|
|
|
|
|
hv_iterinit(tgt_hoa); |
|
7656
|
1
|
|
|
|
|
|
HE *restrict e = hv_iternext(tgt_hoa); |
|
7657
|
1
|
50
|
|
|
|
|
if (e) { |
|
7658
|
1
|
|
|
|
|
|
SV *val = hv_iterval(tgt_hoa, e); |
|
7659
|
1
|
50
|
|
|
|
|
if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
7660
|
1
|
|
|
|
|
|
tgt_n = av_len((AV*)SvRV(val)) + 1; |
|
7661
|
|
|
|
|
|
|
} |
|
7662
|
|
|
|
|
|
|
} |
|
7663
|
|
|
|
|
|
|
} |
|
7664
|
7
|
|
|
|
|
|
AV *restrict all_cols = get_all_columns(aTHX_ tgt_hoa, tgt_row_hashes, tgt_n); |
|
7665
|
7
|
|
|
|
|
|
HV *restrict mean_hv = newHV(); |
|
7666
|
7
|
|
|
|
|
|
HV *restrict size_hv = newHV(); |
|
7667
|
25
|
100
|
|
|
|
|
for (size_t c = 0; c <= (size_t)av_len(all_cols); c++) { |
|
7668
|
18
|
|
|
|
|
|
SV **restrict col_sv = av_fetch(all_cols, c, 0); |
|
7669
|
18
|
50
|
|
|
|
|
if (!col_sv || !SvOK(*col_sv)) continue; |
|
|
|
50
|
|
|
|
|
|
|
7670
|
18
|
|
|
|
|
|
const char *restrict col_name = SvPV_nolen(*col_sv); |
|
7671
|
18
|
|
|
|
|
|
NV col_sum = 0.0; |
|
7672
|
18
|
|
|
|
|
|
IV col_count = 0; |
|
7673
|
165
|
100
|
|
|
|
|
for (i = 0; i < tgt_n; i++) { |
|
7674
|
147
|
|
|
|
|
|
NV val = evaluate_term(aTHX_ tgt_hoa, tgt_row_hashes, i, col_name); |
|
7675
|
147
|
100
|
|
|
|
|
if (!isnan(val)) { col_sum += val; col_count++; } |
|
7676
|
|
|
|
|
|
|
} |
|
7677
|
18
|
100
|
|
|
|
|
NV col_mean = (col_count > 0) ? col_sum / col_count : NAN; |
|
7678
|
18
|
|
|
|
|
|
hv_store(mean_hv, col_name, strlen(col_name), newSVnv(col_mean), 0); |
|
7679
|
18
|
|
|
|
|
|
hv_store(size_hv, col_name, strlen(col_name), newSViv(col_count), 0); |
|
7680
|
|
|
|
|
|
|
} |
|
7681
|
7
|
|
|
|
|
|
SvREFCNT_dec(all_cols); |
|
7682
|
7
|
|
|
|
|
|
HV *restrict gs_hv = newHV(); |
|
7683
|
7
|
|
|
|
|
|
hv_stores(gs_hv, "mean", newRV_noinc((SV*)mean_hv)); |
|
7684
|
7
|
|
|
|
|
|
hv_stores(gs_hv, "size", newRV_noinc((SV*)size_hv)); |
|
7685
|
7
|
|
|
|
|
|
hv_stores(ret_hash, "group_stats", newRV_noinc((SV*)gs_hv)); |
|
7686
|
|
|
|
|
|
|
} |
|
7687
|
|
|
|
|
|
|
// Deep Cleanup |
|
7688
|
26
|
100
|
|
|
|
|
for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms); |
|
7689
|
26
|
100
|
|
|
|
|
for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms); |
|
7690
|
27
|
100
|
|
|
|
|
for (j = 0; j < p_exp; j++) { |
|
7691
|
20
|
|
|
|
|
|
Safefree(exp_terms[j]); Safefree(parent_term[j]); |
|
7692
|
20
|
100
|
|
|
|
|
if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); } |
|
7693
|
|
|
|
|
|
|
} |
|
7694
|
7
|
|
|
|
|
|
Safefree(exp_terms); Safefree(parent_term); |
|
7695
|
7
|
|
|
|
|
|
Safefree(is_dummy); Safefree(is_interact); |
|
7696
|
7
|
|
|
|
|
|
Safefree(dummy_base); Safefree(dummy_level); |
|
7697
|
7
|
|
|
|
|
|
Safefree(term_map); Safefree(left_idx); Safefree(right_idx); |
|
7698
|
7
|
|
|
|
|
|
Safefree(term_ss); Safefree(term_df); |
|
7699
|
69
|
100
|
|
|
|
|
for (i = 0; i < n; i++) Safefree(X_mat[i]); |
|
7700
|
7
|
|
|
|
|
|
Safefree(X_mat); Safefree(Y); |
|
7701
|
7
|
|
|
|
|
|
Safefree(aliased_qr); Safefree(rank_map); |
|
7702
|
26
|
100
|
|
|
|
|
for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); } |
|
|
|
100
|
|
|
|
|
|
|
7703
|
7
|
|
|
|
|
|
Safefree(term_base_level); |
|
7704
|
7
|
50
|
|
|
|
|
if (row_hashes) Safefree(row_hashes); |
|
7705
|
7
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)ret_hash); |
|
7706
|
|
|
|
|
|
|
} |
|
7707
|
|
|
|
|
|
|
OUTPUT: |
|
7708
|
|
|
|
|
|
|
RETVAL |
|
7709
|
|
|
|
|
|
|
|
|
7710
|
|
|
|
|
|
|
PROTOTYPES: DISABLE |
|
7711
|
|
|
|
|
|
|
|
|
7712
|
|
|
|
|
|
|
|
|
7713
|
|
|
|
|
|
|
SV* fisher_test(...) |
|
7714
|
|
|
|
|
|
|
CODE: |
|
7715
|
|
|
|
|
|
|
{ |
|
7716
|
6
|
100
|
|
|
|
|
if (items < 1) croak("fisher_test requires at least a data reference"); |
|
7717
|
|
|
|
|
|
|
|
|
7718
|
5
|
|
|
|
|
|
SV *restrict data_ref = ST(0); |
|
7719
|
5
|
|
|
|
|
|
NV conf_level = 0.95; |
|
7720
|
5
|
|
|
|
|
|
const char *restrict alternative = "two.sided"; |
|
7721
|
|
|
|
|
|
|
|
|
7722
|
7
|
100
|
|
|
|
|
for (unsigned int i = 1; i < items; i += 2) { |
|
7723
|
2
|
50
|
|
|
|
|
if (i + 1 >= items) croak("fisher_test: odd number of named arguments"); |
|
7724
|
2
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(i)); |
|
7725
|
2
|
|
|
|
|
|
SV *restrict val = ST(i + 1); |
|
7726
|
2
|
50
|
|
|
|
|
if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) { |
|
|
|
50
|
|
|
|
|
|
|
7727
|
0
|
|
|
|
|
|
conf_level = SvNV(val); |
|
7728
|
0
|
0
|
|
|
|
|
if (!(conf_level > 0 && conf_level < 1)) |
|
|
|
0
|
|
|
|
|
|
|
7729
|
0
|
|
|
|
|
|
croak("fisher_test: conf_level must be between 0 and 1"); |
|
7730
|
2
|
50
|
|
|
|
|
} else if (strEQ(key, "alternative")) { |
|
7731
|
2
|
|
|
|
|
|
alternative = SvPV_nolen(val); |
|
7732
|
2
|
50
|
|
|
|
|
if (strNE(alternative, "two.sided") && strNE(alternative, "less") && |
|
|
|
100
|
|
|
|
|
|
|
7733
|
1
|
50
|
|
|
|
|
strNE(alternative, "greater")) |
|
7734
|
0
|
|
|
|
|
|
croak("fisher_test: alternative must be 'two.sided', 'less' or 'greater'"); |
|
7735
|
|
|
|
|
|
|
} else { |
|
7736
|
0
|
|
|
|
|
|
croak("fisher_test: unknown argument '%s'", key); |
|
7737
|
|
|
|
|
|
|
} |
|
7738
|
|
|
|
|
|
|
} |
|
7739
|
5
|
50
|
|
|
|
|
if (!SvROK(data_ref)) croak("fisher_test requires a reference to a 2x2 Array or Hash"); |
|
7740
|
5
|
|
|
|
|
|
SV *restrict deref = SvRV(data_ref); |
|
7741
|
5
|
|
|
|
|
|
long a = 0, b = 0, c = 0, d = 0; |
|
7742
|
5
|
100
|
|
|
|
|
if (SvTYPE(deref) == SVt_PVAV) { |
|
7743
|
2
|
|
|
|
|
|
AV *restrict outer = (AV *)deref; |
|
7744
|
2
|
50
|
|
|
|
|
if (av_len(outer) != 1) croak("Outer array must have exactly 2 rows"); |
|
7745
|
2
|
|
|
|
|
|
SV **restrict r1p = av_fetch(outer, 0, 0); |
|
7746
|
2
|
|
|
|
|
|
SV **restrict r2p = av_fetch(outer, 1, 0); |
|
7747
|
2
|
50
|
|
|
|
|
if (!(r1p && r2p && SvROK(*r1p) && SvROK(*r2p) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7748
|
2
|
50
|
|
|
|
|
&& SvTYPE(SvRV(*r1p)) == SVt_PVAV && SvTYPE(SvRV(*r2p)) == SVt_PVAV)) |
|
|
|
50
|
|
|
|
|
|
|
7749
|
0
|
|
|
|
|
|
croak("Invalid 2D array structure: need two array-ref rows"); |
|
7750
|
2
|
|
|
|
|
|
AV *restrict r1 = (AV *)SvRV(*r1p), *r2 = (AV *)SvRV(*r2p); |
|
7751
|
2
|
50
|
|
|
|
|
if (av_len(r1) != 1 || av_len(r2) != 1) |
|
|
|
50
|
|
|
|
|
|
|
7752
|
0
|
|
|
|
|
|
croak("Each row must have exactly 2 columns"); |
|
7753
|
2
|
|
|
|
|
|
a = ft_cell(aTHX_ *av_fetch(r1, 0, 0), "cell [0][0]"); |
|
7754
|
2
|
|
|
|
|
|
b = ft_cell(aTHX_ *av_fetch(r1, 1, 0), "cell [0][1]"); |
|
7755
|
2
|
|
|
|
|
|
c = ft_cell(aTHX_ *av_fetch(r2, 0, 0), "cell [1][0]"); |
|
7756
|
2
|
|
|
|
|
|
d = ft_cell(aTHX_ *av_fetch(r2, 1, 0), "cell [1][1]"); |
|
7757
|
3
|
50
|
|
|
|
|
} else if (SvTYPE(deref) == SVt_PVHV) { |
|
7758
|
|
|
|
|
|
|
/* 2x2 hash; rows and columns are ordered by lexical key sort so the |
|
7759
|
|
|
|
|
|
|
* result is deterministic regardless of Perl's hash randomization. */ |
|
7760
|
3
|
|
|
|
|
|
HV *restrict outer = (HV *)deref; |
|
7761
|
3
|
50
|
|
|
|
|
if (HvUSEDKEYS(outer) != 2) croak("Outer hash must have exactly 2 keys"); |
|
|
|
50
|
|
|
|
|
|
|
7762
|
3
|
|
|
|
|
|
hv_iterinit(outer); |
|
7763
|
3
|
|
|
|
|
|
HE *restrict e1 = hv_iternext(outer), *e2 = hv_iternext(outer); |
|
7764
|
3
|
|
|
|
|
|
const char *restrict ok1 = SvPV_nolen(hv_iterkeysv(e1)); |
|
7765
|
3
|
|
|
|
|
|
int swap_rows = strcmp(ok1, SvPV_nolen(hv_iterkeysv(e2))) > 0; |
|
7766
|
3
|
100
|
|
|
|
|
SV *restrict row1_sv = hv_iterval(outer, swap_rows ? e2 : e1); |
|
7767
|
3
|
100
|
|
|
|
|
SV *restrict row2_sv = hv_iterval(outer, swap_rows ? e1 : e2); |
|
7768
|
3
|
50
|
|
|
|
|
if (!SvROK(row1_sv) || SvTYPE(SvRV(row1_sv)) != SVt_PVHV || |
|
|
|
50
|
|
|
|
|
|
|
7769
|
3
|
50
|
|
|
|
|
!SvROK(row2_sv) || SvTYPE(SvRV(row2_sv)) != SVt_PVHV) |
|
|
|
50
|
|
|
|
|
|
|
7770
|
0
|
|
|
|
|
|
croak("Inner elements must be hash refs"); |
|
7771
|
|
|
|
|
|
|
|
|
7772
|
3
|
|
|
|
|
|
HV *restrict rows[2]; rows[0] = (HV *)SvRV(row1_sv); rows[1] = (HV *)SvRV(row2_sv); |
|
7773
|
|
|
|
|
|
|
long cells[2][2]; |
|
7774
|
9
|
100
|
|
|
|
|
for (unsigned int rr = 0; rr < 2; rr++) { |
|
7775
|
6
|
|
|
|
|
|
HV *restrict in = rows[rr]; |
|
7776
|
6
|
50
|
|
|
|
|
if (HvUSEDKEYS(in) != 2) croak("Inner hashes must have exactly 2 keys"); |
|
|
|
50
|
|
|
|
|
|
|
7777
|
6
|
|
|
|
|
|
hv_iterinit(in); |
|
7778
|
6
|
|
|
|
|
|
HE *c1 = hv_iternext(in), *c2 = hv_iternext(in); |
|
7779
|
6
|
|
|
|
|
|
const char *k1 = SvPV_nolen(hv_iterkeysv(c1)); |
|
7780
|
6
|
|
|
|
|
|
int swap_cols = strcmp(k1, SvPV_nolen(hv_iterkeysv(c2))) > 0; |
|
7781
|
6
|
100
|
|
|
|
|
HE *col0 = swap_cols ? c2 : c1; |
|
7782
|
6
|
100
|
|
|
|
|
HE *col1 = swap_cols ? c1 : c2; |
|
7783
|
6
|
|
|
|
|
|
cells[rr][0] = ft_cell(aTHX_ hv_iterval(in, col0), "hash cell"); |
|
7784
|
6
|
|
|
|
|
|
cells[rr][1] = ft_cell(aTHX_ hv_iterval(in, col1), "hash cell"); |
|
7785
|
|
|
|
|
|
|
} |
|
7786
|
3
|
|
|
|
|
|
a = cells[0][0]; b = cells[0][1]; c = cells[1][0]; d = cells[1][1]; |
|
7787
|
|
|
|
|
|
|
} else { |
|
7788
|
0
|
|
|
|
|
|
croak("Input must be a 2D Array or 2D Hash"); |
|
7789
|
|
|
|
|
|
|
} |
|
7790
|
5
|
50
|
|
|
|
|
if (a + b + c + d == 0) croak("fisher_test: table is all zeros"); |
|
7791
|
5
|
|
|
|
|
|
NV p_val = exact_p_value(a, b, c, d, alternative); |
|
7792
|
|
|
|
|
|
|
NV mle_or, ci_low, ci_high; |
|
7793
|
5
|
|
|
|
|
|
calculate_exact_stats(a, b, c, d, conf_level, alternative, &mle_or, &ci_low, &ci_high); |
|
7794
|
|
|
|
|
|
|
|
|
7795
|
5
|
|
|
|
|
|
HV *restrict ret = newHV(); |
|
7796
|
5
|
|
|
|
|
|
hv_stores(ret, "method", newSVpv("Fisher's Exact Test for Count Data", 0)); |
|
7797
|
5
|
|
|
|
|
|
hv_stores(ret, "alternative", newSVpv(alternative, 0)); |
|
7798
|
5
|
|
|
|
|
|
AV *restrict ci = newAV(); |
|
7799
|
5
|
|
|
|
|
|
av_push(ci, newSVnv(ci_low)); |
|
7800
|
5
|
|
|
|
|
|
av_push(ci, newSVnv(ci_high)); |
|
7801
|
5
|
|
|
|
|
|
hv_stores(ret, "conf_int", newRV_noinc((SV *)ci)); |
|
7802
|
5
|
|
|
|
|
|
HV *restrict est = newHV(); |
|
7803
|
5
|
|
|
|
|
|
hv_stores(est, "odds ratio", newSVnv(mle_or)); |
|
7804
|
5
|
|
|
|
|
|
hv_stores(ret, "estimate", newRV_noinc((SV *)est)); |
|
7805
|
5
|
|
|
|
|
|
hv_stores(ret, "p_value", newSVnv(p_val)); |
|
7806
|
5
|
|
|
|
|
|
hv_stores(ret, "conf_level", newSVnv(conf_level)); |
|
7807
|
5
|
|
|
|
|
|
RETVAL = newRV_noinc((SV *)ret); |
|
7808
|
|
|
|
|
|
|
} |
|
7809
|
|
|
|
|
|
|
OUTPUT: |
|
7810
|
|
|
|
|
|
|
RETVAL |
|
7811
|
|
|
|
|
|
|
|
|
7812
|
|
|
|
|
|
|
SV* power_t_test(...) |
|
7813
|
|
|
|
|
|
|
CODE: |
|
7814
|
|
|
|
|
|
|
{ |
|
7815
|
7
|
|
|
|
|
|
SV*restrict sv_n = NULL; |
|
7816
|
7
|
|
|
|
|
|
SV*restrict sv_delta = NULL; |
|
7817
|
7
|
|
|
|
|
|
SV*restrict sv_sd = NULL; |
|
7818
|
7
|
|
|
|
|
|
SV*restrict sv_sig_level = NULL; |
|
7819
|
7
|
|
|
|
|
|
SV*restrict sv_power = NULL; |
|
7820
|
|
|
|
|
|
|
|
|
7821
|
7
|
|
|
|
|
|
const char* restrict type = "two.sample"; |
|
7822
|
7
|
|
|
|
|
|
const char* restrict alternative = "two.sided"; |
|
7823
|
7
|
|
|
|
|
|
bool strict = FALSE; |
|
7824
|
7
|
|
|
|
|
|
NV tol = pow(2.2204460492503131e-16, 0.25); |
|
7825
|
|
|
|
|
|
|
|
|
7826
|
7
|
50
|
|
|
|
|
if (items % 2 != 0) croak("Usage: power_t_test(n => 30, delta => 0.5, sd => 1.0, ...)"); |
|
7827
|
34
|
100
|
|
|
|
|
for (unsigned short int i = 0; i < items; i += 2) { |
|
7828
|
27
|
|
|
|
|
|
const char* restrict key = SvPV_nolen(ST(i)); |
|
7829
|
27
|
|
|
|
|
|
SV* restrict val = ST(i+1); |
|
7830
|
|
|
|
|
|
|
|
|
7831
|
27
|
100
|
|
|
|
|
if (strEQ(key, "n")) sv_n = val; |
|
7832
|
26
|
100
|
|
|
|
|
else if (strEQ(key, "delta")) sv_delta = val; |
|
7833
|
19
|
100
|
|
|
|
|
else if (strEQ(key, "sd")) sv_sd = val; |
|
7834
|
12
|
50
|
|
|
|
|
else if (strEQ(key, "sig.level") || strEQ(key, "sig_level")) sv_sig_level = val; |
|
|
|
100
|
|
|
|
|
|
|
7835
|
11
|
100
|
|
|
|
|
else if (strEQ(key, "power")) sv_power = val; |
|
7836
|
5
|
100
|
|
|
|
|
else if (strEQ(key, "type")) type = SvPV_nolen(val); |
|
7837
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val); |
|
7838
|
0
|
0
|
|
|
|
|
else if (strEQ(key, "strict")) strict = SvTRUE(val); |
|
7839
|
0
|
0
|
|
|
|
|
else if (strEQ(key, "tol")) tol = SvNV(val); |
|
7840
|
0
|
|
|
|
|
|
else croak("power_t_test: unknown argument '%s'", key); |
|
7841
|
|
|
|
|
|
|
} |
|
7842
|
|
|
|
|
|
|
|
|
7843
|
7
|
100
|
|
|
|
|
bool is_null_n = (!sv_n || !SvOK(sv_n)); |
|
|
|
50
|
|
|
|
|
|
|
7844
|
7
|
50
|
|
|
|
|
bool is_null_delta = (!sv_delta || !SvOK(sv_delta)); |
|
|
|
50
|
|
|
|
|
|
|
7845
|
7
|
100
|
|
|
|
|
bool is_null_power = (!sv_power || !SvOK(sv_power)); |
|
|
|
50
|
|
|
|
|
|
|
7846
|
7
|
50
|
|
|
|
|
bool is_null_sd = (sv_sd && !SvOK(sv_sd)); |
|
|
|
50
|
|
|
|
|
|
|
7847
|
7
|
100
|
|
|
|
|
bool is_null_sig_level = (sv_sig_level && !SvOK(sv_sig_level)); |
|
|
|
50
|
|
|
|
|
|
|
7848
|
|
|
|
|
|
|
|
|
7849
|
7
|
|
|
|
|
|
unsigned int missing_count = 0; |
|
7850
|
7
|
100
|
|
|
|
|
if (is_null_n) missing_count++; |
|
7851
|
7
|
50
|
|
|
|
|
if (is_null_delta) missing_count++; |
|
7852
|
7
|
100
|
|
|
|
|
if (is_null_power) missing_count++; |
|
7853
|
7
|
50
|
|
|
|
|
if (is_null_sd) missing_count++; |
|
7854
|
7
|
50
|
|
|
|
|
if (is_null_sig_level) missing_count++; |
|
7855
|
|
|
|
|
|
|
|
|
7856
|
7
|
50
|
|
|
|
|
if (missing_count != 1) { |
|
7857
|
0
|
|
|
|
|
|
croak("power_t_test: exactly one of 'n', 'delta', 'sd', 'power', and 'sig_level' must be undef/NULL"); |
|
7858
|
|
|
|
|
|
|
} |
|
7859
|
|
|
|
|
|
|
|
|
7860
|
7
|
100
|
|
|
|
|
NV n = is_null_n ? 0.0 : SvNV(sv_n); |
|
7861
|
7
|
50
|
|
|
|
|
NV delta = is_null_delta ? 0.0 : SvNV(sv_delta); |
|
7862
|
7
|
50
|
|
|
|
|
NV sd = (!sv_sd || is_null_sd) ? 1.0 : SvNV(sv_sd); |
|
|
|
50
|
|
|
|
|
|
|
7863
|
7
|
100
|
|
|
|
|
NV sig_level = (!sv_sig_level || is_null_sig_level) ? 0.05 : SvNV(sv_sig_level); |
|
|
|
50
|
|
|
|
|
|
|
7864
|
7
|
100
|
|
|
|
|
NV power = is_null_power ? 0.0 : SvNV(sv_power); |
|
7865
|
7
|
100
|
|
|
|
|
short int tsample = (strEQ(type, "one.sample") || strEQ(type, "paired")) ? 1 : 2; |
|
|
|
100
|
|
|
|
|
|
|
7866
|
7
|
100
|
|
|
|
|
short int tside = (strEQ(alternative, "one.sided") || strEQ(alternative, "greater") || strEQ(alternative, "less")) ? 1 : 2; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7867
|
7
|
100
|
|
|
|
|
if (tside == 2 && !is_null_delta) delta = fabs(delta); |
|
|
|
50
|
|
|
|
|
|
|
7868
|
7
|
100
|
|
|
|
|
if (is_null_power) { |
|
7869
|
1
|
|
|
|
|
|
power = p_body(n, delta, sd, sig_level, tsample, tside, strict); |
|
7870
|
6
|
50
|
|
|
|
|
} else if (is_null_n) { |
|
7871
|
6
|
|
|
|
|
|
NV low = 2.0, high = 1e7; |
|
7872
|
6
|
50
|
|
|
|
|
while (p_body(high, delta, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0; |
|
|
|
0
|
|
|
|
|
|
|
7873
|
228
|
100
|
|
|
|
|
while (high - low > tol) { |
|
7874
|
222
|
|
|
|
|
|
NV mid = low + (high - low) / 2.0; |
|
7875
|
222
|
100
|
|
|
|
|
if (p_body(mid, delta, sd, sig_level, tsample, tside, strict) < power) low = mid; |
|
7876
|
173
|
|
|
|
|
|
else high = mid; |
|
7877
|
|
|
|
|
|
|
} |
|
7878
|
6
|
|
|
|
|
|
n = low + (high - low) / 2.0; |
|
7879
|
0
|
0
|
|
|
|
|
} else if (is_null_sd) { |
|
7880
|
0
|
|
|
|
|
|
NV low = delta * 1e-7, high = delta * 1e7; |
|
7881
|
0
|
0
|
|
|
|
|
while (high - low > tol) { |
|
7882
|
0
|
|
|
|
|
|
NV mid = low + (high - low) / 2.0; |
|
7883
|
0
|
0
|
|
|
|
|
if (p_body(n, delta, mid, sig_level, tsample, tside, strict) > power) low = mid; |
|
7884
|
0
|
|
|
|
|
|
else high = mid; |
|
7885
|
|
|
|
|
|
|
} |
|
7886
|
0
|
|
|
|
|
|
sd = low + (high - low) / 2.0; |
|
7887
|
0
|
0
|
|
|
|
|
} else if (is_null_delta) { |
|
7888
|
0
|
|
|
|
|
|
NV low = sd * 1e-7, high = sd * 1e7; |
|
7889
|
0
|
0
|
|
|
|
|
while (p_body(n, high, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0; |
|
|
|
0
|
|
|
|
|
|
|
7890
|
0
|
0
|
|
|
|
|
while (high - low > tol) { |
|
7891
|
0
|
|
|
|
|
|
NV mid = low + (high - low) / 2.0; |
|
7892
|
0
|
0
|
|
|
|
|
if (p_body(n, mid, sd, sig_level, tsample, tside, strict) < power) low = mid; |
|
7893
|
0
|
|
|
|
|
|
else high = mid; |
|
7894
|
|
|
|
|
|
|
} |
|
7895
|
0
|
|
|
|
|
|
delta = low + (high - low) / 2.0; |
|
7896
|
0
|
0
|
|
|
|
|
} else if (is_null_sig_level) { |
|
7897
|
0
|
|
|
|
|
|
NV low = 1e-10, high = 1.0 - 1e-10; |
|
7898
|
0
|
0
|
|
|
|
|
while (high - low > tol) { |
|
7899
|
0
|
|
|
|
|
|
NV mid = low + (high - low) / 2.0; |
|
7900
|
0
|
0
|
|
|
|
|
if (p_body(n, delta, sd, mid, tsample, tside, strict) < power) low = mid; |
|
7901
|
0
|
|
|
|
|
|
else high = mid; |
|
7902
|
|
|
|
|
|
|
} |
|
7903
|
0
|
|
|
|
|
|
sig_level = low + (high - low) / 2.0; |
|
7904
|
|
|
|
|
|
|
} |
|
7905
|
7
|
|
|
|
|
|
HV*restrict ret = newHV(); |
|
7906
|
7
|
|
|
|
|
|
hv_stores(ret, "n", newSVnv(n)); |
|
7907
|
7
|
|
|
|
|
|
hv_stores(ret, "delta", newSVnv(delta)); |
|
7908
|
7
|
|
|
|
|
|
hv_stores(ret, "sd", newSVnv(sd)); |
|
7909
|
7
|
|
|
|
|
|
hv_stores(ret, "sig.level", newSVnv(sig_level)); |
|
7910
|
7
|
|
|
|
|
|
hv_stores(ret, "power", newSVnv(power)); |
|
7911
|
7
|
|
|
|
|
|
hv_stores(ret, "alternative", newSVpv(alternative, 0)); |
|
7912
|
7
|
100
|
|
|
|
|
const char*restrict m_str = (tsample == 1) ? (strEQ(type, "paired") ? "Paired t test power calculation" : "One-sample t test power calculation") : "Two-sample t test power calculation"; |
|
|
|
100
|
|
|
|
|
|
|
7913
|
7
|
|
|
|
|
|
hv_stores(ret, "method", newSVpv(m_str, 0)); |
|
7914
|
7
|
100
|
|
|
|
|
const char*restrict n_str = (tsample == 2) ? "n is number in *each* group" : (strEQ(type, "paired") ? "n is number of *pairs*, sd is std.dev. of *differences* within pairs" : ""); |
|
|
|
100
|
|
|
|
|
|
|
7915
|
7
|
100
|
|
|
|
|
if (n_str[0] != '\0') hv_stores(ret, "note", newSVpv(n_str, 0)); |
|
7916
|
7
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)ret); |
|
7917
|
|
|
|
|
|
|
} |
|
7918
|
|
|
|
|
|
|
OUTPUT: |
|
7919
|
|
|
|
|
|
|
RETVAL |
|
7920
|
|
|
|
|
|
|
|
|
7921
|
|
|
|
|
|
|
SV* kruskal_test(...) |
|
7922
|
|
|
|
|
|
|
CODE: |
|
7923
|
|
|
|
|
|
|
{ |
|
7924
|
3
|
|
|
|
|
|
SV *restrict x_sv = NULL, *restrict g_sv = NULL, *restrict h_sv = NULL; |
|
7925
|
3
|
|
|
|
|
|
unsigned int arg_idx = 0; |
|
7926
|
|
|
|
|
|
|
// 1. Shift positional arguments |
|
7927
|
|
|
|
|
|
|
// Accept either: (arrayref, arrayref) or (hashref) |
|
7928
|
3
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx))) { |
|
|
|
100
|
|
|
|
|
|
|
7929
|
2
|
|
|
|
|
|
svtype t = SvTYPE(SvRV(ST(arg_idx))); |
|
7930
|
2
|
100
|
|
|
|
|
if (t == SVt_PVAV) { |
|
7931
|
1
|
|
|
|
|
|
x_sv = ST(arg_idx++); |
|
7932
|
1
|
50
|
|
|
|
|
} else if (t == SVt_PVHV) { |
|
7933
|
1
|
|
|
|
|
|
h_sv = ST(arg_idx++); /* hash-of-arrays shortcut */ |
|
7934
|
|
|
|
|
|
|
} |
|
7935
|
|
|
|
|
|
|
} |
|
7936
|
3
|
100
|
|
|
|
|
if (!h_sv && arg_idx < items |
|
|
|
50
|
|
|
|
|
|
|
7937
|
2
|
100
|
|
|
|
|
&& SvROK(ST(arg_idx)) |
|
7938
|
1
|
50
|
|
|
|
|
&& SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
7939
|
1
|
|
|
|
|
|
g_sv = ST(arg_idx++); |
|
7940
|
|
|
|
|
|
|
} |
|
7941
|
|
|
|
|
|
|
// 2. Parse named arguments (fallback) |
|
7942
|
5
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
7943
|
2
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(arg_idx)); |
|
7944
|
2
|
|
|
|
|
|
SV *restrict val = ST(arg_idx + 1); |
|
7945
|
2
|
100
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
7946
|
1
|
50
|
|
|
|
|
else if (strEQ(key, "g")) g_sv = val; |
|
7947
|
0
|
0
|
|
|
|
|
else if (strEQ(key, "h")) h_sv = val; |
|
7948
|
0
|
|
|
|
|
|
else croak("kruskal_test: unknown argument '%s'", key); |
|
7949
|
|
|
|
|
|
|
} |
|
7950
|
|
|
|
|
|
|
// 3. Mutual-exclusion guard |
|
7951
|
3
|
100
|
|
|
|
|
if (h_sv && (x_sv || g_sv)) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7952
|
0
|
|
|
|
|
|
croak("kruskal_test: cannot mix 'h' (hash-of-arrays) with 'x'/'g' inputs"); |
|
7953
|
|
|
|
|
|
|
|
|
7954
|
|
|
|
|
|
|
// Shared state filled by whichever input branch runs |
|
7955
|
3
|
|
|
|
|
|
RankInfo *restrict ri = NULL; |
|
7956
|
3
|
|
|
|
|
|
char **restrict group_names = NULL; /* Track names to build group_stats */ |
|
7957
|
3
|
|
|
|
|
|
size_t valid_n = 0, k = 0; |
|
7958
|
|
|
|
|
|
|
/* 4a. Hash-of-arrays input path */ |
|
7959
|
|
|
|
|
|
|
/* my %x = ( group1 => [...], group2 => [...], ... ) */ |
|
7960
|
|
|
|
|
|
|
/* ------------------------------------------------------------------ */ |
|
7961
|
3
|
100
|
|
|
|
|
if (h_sv) { |
|
7962
|
1
|
50
|
|
|
|
|
if (!SvROK(h_sv) || SvTYPE(SvRV(h_sv)) != SVt_PVHV) |
|
|
|
50
|
|
|
|
|
|
|
7963
|
0
|
|
|
|
|
|
croak("kruskal_test: 'h' must be a HASH reference"); |
|
7964
|
1
|
|
|
|
|
|
HV *restrict h_hv = (HV*)SvRV(h_sv); |
|
7965
|
|
|
|
|
|
|
// First pass – validate values and tally total elements |
|
7966
|
1
|
|
|
|
|
|
size_t total = 0; |
|
7967
|
1
|
|
|
|
|
|
hv_iterinit(h_hv); |
|
7968
|
|
|
|
|
|
|
HE *restrict he; |
|
7969
|
4
|
100
|
|
|
|
|
while ((he = hv_iternext(h_hv))) { |
|
7970
|
3
|
|
|
|
|
|
SV *restrict val = HeVAL(he); |
|
7971
|
3
|
50
|
|
|
|
|
if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
7972
|
0
|
|
|
|
|
|
croak("kruskal_test: every value in 'h' must be an ARRAY reference"); |
|
7973
|
3
|
|
|
|
|
|
total += (size_t)(av_len((AV*)SvRV(val)) + 1); |
|
7974
|
|
|
|
|
|
|
} |
|
7975
|
1
|
50
|
|
|
|
|
if (total < 2) croak("not enough observations"); |
|
7976
|
1
|
|
|
|
|
|
ri = (RankInfo *)safemalloc(total * sizeof(RankInfo)); |
|
7977
|
1
|
50
|
|
|
|
|
size_t num_keys = HvKEYS(h_hv); |
|
7978
|
1
|
|
|
|
|
|
group_names = (char **)safecalloc(num_keys, sizeof(char*)); |
|
7979
|
|
|
|
|
|
|
/* 2nd pass – fill ri[], assigning one group_id per hash key */ |
|
7980
|
1
|
|
|
|
|
|
size_t group_id = 0; |
|
7981
|
1
|
|
|
|
|
|
hv_iterinit(h_hv); |
|
7982
|
4
|
100
|
|
|
|
|
while ((he = hv_iternext(h_hv))) { |
|
7983
|
|
|
|
|
|
|
STRLEN klen; |
|
7984
|
3
|
50
|
|
|
|
|
const char *restrict key_str = HePV(he, klen); |
|
7985
|
3
|
|
|
|
|
|
group_names[group_id] = savepvn(key_str, klen); // Save string key |
|
7986
|
3
|
|
|
|
|
|
AV *restrict av = (AV*)SvRV(HeVAL(he)); |
|
7987
|
3
|
|
|
|
|
|
size_t n_g = (size_t)(av_len(av) + 1); |
|
7988
|
17
|
100
|
|
|
|
|
for (size_t i = 0; i < n_g; i++) { |
|
7989
|
14
|
|
|
|
|
|
SV **restrict el = av_fetch(av, i, 0); |
|
7990
|
14
|
50
|
|
|
|
|
if (el && SvOK(*el) && looks_like_number(*el)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
7991
|
14
|
|
|
|
|
|
ri[valid_n].val = SvNV(*el); |
|
7992
|
14
|
|
|
|
|
|
ri[valid_n].idx = group_id; /* group identity */ |
|
7993
|
14
|
|
|
|
|
|
valid_n++; |
|
7994
|
|
|
|
|
|
|
} |
|
7995
|
|
|
|
|
|
|
} |
|
7996
|
3
|
|
|
|
|
|
group_id++; |
|
7997
|
|
|
|
|
|
|
} |
|
7998
|
1
|
|
|
|
|
|
k = group_id; /* number of unique groups = number of hash keys */ |
|
7999
|
|
|
|
|
|
|
/* 4b. Original x / g array-pair input path */ |
|
8000
|
|
|
|
|
|
|
} else { |
|
8001
|
2
|
50
|
|
|
|
|
if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8002
|
0
|
|
|
|
|
|
croak("kruskal_test: 'x' is a required argument and must be an ARRAY reference"); |
|
8003
|
2
|
50
|
|
|
|
|
if (!g_sv || !SvROK(g_sv) || SvTYPE(SvRV(g_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8004
|
0
|
|
|
|
|
|
croak("kruskal_test: 'g' is a required argument and must be an ARRAY reference"); |
|
8005
|
|
|
|
|
|
|
|
|
8006
|
2
|
|
|
|
|
|
AV *restrict x_av = (AV*)SvRV(x_sv); |
|
8007
|
2
|
|
|
|
|
|
AV *restrict g_av = (AV*)SvRV(g_sv); |
|
8008
|
2
|
|
|
|
|
|
size_t nx = (size_t)(av_len(x_av) + 1); |
|
8009
|
2
|
|
|
|
|
|
size_t ng = (size_t)(av_len(g_av) + 1); |
|
8010
|
2
|
50
|
|
|
|
|
if (nx != ng) croak("kruskal_test: 'x' and 'g' must have the same length"); |
|
8011
|
2
|
50
|
|
|
|
|
if (nx < 2) croak("not enough observations"); |
|
8012
|
|
|
|
|
|
|
|
|
8013
|
2
|
|
|
|
|
|
ri = (RankInfo *)safemalloc(nx * sizeof(RankInfo)); |
|
8014
|
2
|
|
|
|
|
|
group_names = (char **)safecalloc(nx, sizeof(char*)); // Upper bound |
|
8015
|
|
|
|
|
|
|
|
|
8016
|
|
|
|
|
|
|
// Map string group names → contiguous integer IDs |
|
8017
|
2
|
|
|
|
|
|
HV *restrict group_map = newHV(); |
|
8018
|
2
|
|
|
|
|
|
size_t next_group_id = 0; |
|
8019
|
|
|
|
|
|
|
|
|
8020
|
30
|
100
|
|
|
|
|
for (size_t i = 0; i < nx; i++) { |
|
8021
|
28
|
|
|
|
|
|
SV **restrict x_el = av_fetch(x_av, i, 0); |
|
8022
|
28
|
|
|
|
|
|
SV **restrict g_el = av_fetch(g_av, i, 0); |
|
8023
|
28
|
50
|
|
|
|
|
if (x_el && SvOK(*x_el) && looks_like_number(*x_el) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8024
|
28
|
50
|
|
|
|
|
&& g_el && SvOK(*g_el)) { |
|
|
|
50
|
|
|
|
|
|
|
8025
|
28
|
|
|
|
|
|
const char *restrict g_str = SvPV_nolen(*g_el); |
|
8026
|
28
|
|
|
|
|
|
STRLEN glen = strlen(g_str); |
|
8027
|
28
|
|
|
|
|
|
SV **restrict id_sv = hv_fetch(group_map, g_str, glen, 0); |
|
8028
|
|
|
|
|
|
|
size_t group_id; |
|
8029
|
28
|
100
|
|
|
|
|
if (id_sv) { |
|
8030
|
22
|
|
|
|
|
|
group_id = SvUV(*id_sv); |
|
8031
|
|
|
|
|
|
|
} else { |
|
8032
|
6
|
|
|
|
|
|
group_id = next_group_id++; |
|
8033
|
6
|
|
|
|
|
|
hv_store(group_map, g_str, glen, newSVuv(group_id), 0); |
|
8034
|
6
|
|
|
|
|
|
group_names[group_id] = savepvn(g_str, glen); // Save string key |
|
8035
|
|
|
|
|
|
|
} |
|
8036
|
28
|
|
|
|
|
|
ri[valid_n].val = SvNV(*x_el); |
|
8037
|
28
|
|
|
|
|
|
ri[valid_n].idx = group_id; |
|
8038
|
28
|
|
|
|
|
|
valid_n++; |
|
8039
|
|
|
|
|
|
|
} |
|
8040
|
|
|
|
|
|
|
} |
|
8041
|
2
|
|
|
|
|
|
k = next_group_id; |
|
8042
|
2
|
|
|
|
|
|
SvREFCNT_dec(group_map); |
|
8043
|
|
|
|
|
|
|
} |
|
8044
|
|
|
|
|
|
|
/* 5. Shared post-extraction validation */ |
|
8045
|
3
|
50
|
|
|
|
|
if (valid_n < 2 || k < 2) { |
|
|
|
50
|
|
|
|
|
|
|
8046
|
0
|
|
|
|
|
|
Safefree(ri); |
|
8047
|
0
|
0
|
|
|
|
|
if (group_names) { |
|
8048
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < k; i++) { if (group_names[i]) Safefree(group_names[i]); } |
|
|
|
0
|
|
|
|
|
|
|
8049
|
0
|
|
|
|
|
|
Safefree(group_names); |
|
8050
|
|
|
|
|
|
|
} |
|
8051
|
0
|
0
|
|
|
|
|
if (valid_n < 2) croak("not enough observations"); |
|
8052
|
0
|
|
|
|
|
|
croak("all observations are in the same group"); |
|
8053
|
|
|
|
|
|
|
} |
|
8054
|
|
|
|
|
|
|
// 6. Ranking and Tie Accumulation (Reusing LikeR Helper) |
|
8055
|
3
|
|
|
|
|
|
bool has_ties = 0; |
|
8056
|
3
|
|
|
|
|
|
NV tie_adj = rank_and_count_ties(ri, valid_n, &has_ties); |
|
8057
|
|
|
|
|
|
|
// 7. Aggregate Sum of Ranks AND Actual Values by Group |
|
8058
|
3
|
|
|
|
|
|
NV *restrict group_rank_sums = (NV *)safecalloc(k, sizeof(NV)); |
|
8059
|
3
|
|
|
|
|
|
NV *restrict group_val_sums = (NV *)safecalloc(k, sizeof(NV)); // For Mean |
|
8060
|
3
|
|
|
|
|
|
size_t *restrict group_counts = (size_t *)safecalloc(k, sizeof(size_t)); |
|
8061
|
45
|
100
|
|
|
|
|
for (size_t i = 0; i < valid_n; i++) { |
|
8062
|
42
|
|
|
|
|
|
size_t g_id = ri[i].idx; |
|
8063
|
42
|
|
|
|
|
|
group_rank_sums[g_id] += ri[i].rank; |
|
8064
|
42
|
|
|
|
|
|
group_val_sums[g_id] += ri[i].val; |
|
8065
|
42
|
|
|
|
|
|
group_counts[g_id]++; |
|
8066
|
|
|
|
|
|
|
} |
|
8067
|
|
|
|
|
|
|
// 8. Calculate STATISTIC |
|
8068
|
3
|
|
|
|
|
|
NV stat_base = 0.0; |
|
8069
|
12
|
100
|
|
|
|
|
for (size_t i = 0; i < k; i++) { |
|
8070
|
9
|
50
|
|
|
|
|
if (group_counts[i] > 0) |
|
8071
|
9
|
|
|
|
|
|
stat_base += (group_rank_sums[i] * group_rank_sums[i]) |
|
8072
|
9
|
|
|
|
|
|
/ (NV)group_counts[i]; |
|
8073
|
|
|
|
|
|
|
} |
|
8074
|
3
|
|
|
|
|
|
NV n_d = (NV)valid_n; |
|
8075
|
3
|
|
|
|
|
|
NV stat = (12.0 * stat_base / (n_d * (n_d + 1.0))) - 3.0 * (n_d + 1.0); |
|
8076
|
3
|
50
|
|
|
|
|
if (tie_adj > 0.0) { |
|
8077
|
0
|
|
|
|
|
|
NV tie_denom = 1.0 - (tie_adj / (n_d * n_d * n_d - n_d)); |
|
8078
|
0
|
|
|
|
|
|
stat /= tie_denom; |
|
8079
|
|
|
|
|
|
|
} |
|
8080
|
3
|
|
|
|
|
|
int df = (int)k - 1; |
|
8081
|
3
|
|
|
|
|
|
NV p_val = get_p_value(stat, df); |
|
8082
|
|
|
|
|
|
|
// 9. Return structured data exactly like R's htest |
|
8083
|
3
|
|
|
|
|
|
HV *restrict res = newHV(); |
|
8084
|
3
|
|
|
|
|
|
hv_stores(res, "statistic", newSVnv(stat)); |
|
8085
|
3
|
|
|
|
|
|
hv_stores(res, "parameter", newSViv(df)); |
|
8086
|
3
|
|
|
|
|
|
hv_stores(res, "p_value", newSVnv(p_val)); |
|
8087
|
3
|
|
|
|
|
|
hv_stores(res, "p.value", newSVnv(p_val)); |
|
8088
|
3
|
|
|
|
|
|
hv_stores(res, "method", newSVpv("Kruskal-Wallis rank sum test", 0)); |
|
8089
|
|
|
|
|
|
|
// 10. Build the group_stats hash |
|
8090
|
3
|
|
|
|
|
|
HV *restrict group_stats = newHV(); |
|
8091
|
3
|
|
|
|
|
|
HV *restrict stats_mean = newHV(); |
|
8092
|
3
|
|
|
|
|
|
HV *restrict stats_size = newHV(); |
|
8093
|
12
|
100
|
|
|
|
|
for (size_t i = 0; i < k; i++) { |
|
8094
|
9
|
50
|
|
|
|
|
if (group_counts[i] > 0 && group_names[i]) { |
|
|
|
50
|
|
|
|
|
|
|
8095
|
9
|
|
|
|
|
|
double mean = group_val_sums[i] / (double)group_counts[i]; |
|
8096
|
9
|
|
|
|
|
|
size_t nlen = strlen(group_names[i]); |
|
8097
|
9
|
|
|
|
|
|
hv_store(stats_mean, group_names[i], nlen, newSVnv(mean), 0); |
|
8098
|
9
|
|
|
|
|
|
hv_store(stats_size, group_names[i], nlen, newSVuv(group_counts[i]), 0); |
|
8099
|
|
|
|
|
|
|
} |
|
8100
|
9
|
50
|
|
|
|
|
if (group_names[i]) Safefree(group_names[i]); // Clean up name copy |
|
8101
|
|
|
|
|
|
|
} |
|
8102
|
|
|
|
|
|
|
// Embed the nested hashes |
|
8103
|
3
|
|
|
|
|
|
hv_stores(group_stats, "mean", newRV_noinc((SV*)stats_mean)); |
|
8104
|
3
|
|
|
|
|
|
hv_stores(group_stats, "size", newRV_noinc((SV*)stats_size)); |
|
8105
|
3
|
|
|
|
|
|
hv_stores(res, "group_stats", newRV_noinc((SV*)group_stats)); |
|
8106
|
|
|
|
|
|
|
// Memory Cleanup |
|
8107
|
3
|
|
|
|
|
|
Safefree(group_names); Safefree(group_rank_sums); |
|
8108
|
3
|
|
|
|
|
|
Safefree(group_val_sums); Safefree(group_counts); Safefree(ri); |
|
8109
|
|
|
|
|
|
|
|
|
8110
|
3
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res); |
|
8111
|
|
|
|
|
|
|
} |
|
8112
|
|
|
|
|
|
|
OUTPUT: |
|
8113
|
|
|
|
|
|
|
RETVAL |
|
8114
|
|
|
|
|
|
|
|
|
8115
|
|
|
|
|
|
|
SV* var_test(...) |
|
8116
|
|
|
|
|
|
|
CODE: |
|
8117
|
|
|
|
|
|
|
{ |
|
8118
|
6
|
|
|
|
|
|
SV* restrict x_sv = NULL; |
|
8119
|
6
|
|
|
|
|
|
SV* restrict y_sv = NULL; |
|
8120
|
6
|
|
|
|
|
|
NV ratio = 1.0, conf_level = 0.95; |
|
8121
|
6
|
|
|
|
|
|
const char* restrict alternative = "two.sided"; |
|
8122
|
6
|
|
|
|
|
|
unsigned int arg_idx = 0; |
|
8123
|
|
|
|
|
|
|
|
|
8124
|
|
|
|
|
|
|
// 1. Shift positional argument 'x' if it's an array reference |
|
8125
|
6
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8126
|
6
|
|
|
|
|
|
x_sv = ST(arg_idx); |
|
8127
|
6
|
|
|
|
|
|
arg_idx++; |
|
8128
|
|
|
|
|
|
|
} |
|
8129
|
|
|
|
|
|
|
|
|
8130
|
|
|
|
|
|
|
// 2. Shift positional argument 'y' if it's an array reference |
|
8131
|
6
|
50
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8132
|
6
|
|
|
|
|
|
y_sv = ST(arg_idx); |
|
8133
|
6
|
|
|
|
|
|
arg_idx++; |
|
8134
|
|
|
|
|
|
|
} |
|
8135
|
|
|
|
|
|
|
// Ensure the remaining arguments form complete key-value pairs |
|
8136
|
6
|
50
|
|
|
|
|
if ((items - arg_idx) % 2 != 0) { |
|
8137
|
0
|
|
|
|
|
|
croak("Usage: var_test(\\@x, \\@y, key => value, ...)"); |
|
8138
|
|
|
|
|
|
|
} |
|
8139
|
|
|
|
|
|
|
// --- Parse named arguments from the remaining flat stack --- |
|
8140
|
8
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
8141
|
2
|
|
|
|
|
|
const char* restrict key = SvPV_nolen(ST(arg_idx)); |
|
8142
|
2
|
|
|
|
|
|
SV* restrict val = ST(arg_idx + 1); |
|
8143
|
|
|
|
|
|
|
|
|
8144
|
2
|
50
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
8145
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "y")) y_sv = val; |
|
8146
|
2
|
100
|
|
|
|
|
else if (strEQ(key, "ratio")) ratio = SvNV(val); |
|
8147
|
1
|
50
|
|
|
|
|
else if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) conf_level = SvNV(val); |
|
|
|
0
|
|
|
|
|
|
|
8148
|
0
|
0
|
|
|
|
|
else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val); |
|
8149
|
0
|
|
|
|
|
|
else croak("var_test: unknown argument '%s'", key); |
|
8150
|
|
|
|
|
|
|
} |
|
8151
|
|
|
|
|
|
|
// --- Validate required inputs / types --- |
|
8152
|
6
|
50
|
|
|
|
|
if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8153
|
0
|
|
|
|
|
|
croak("var_test: 'x' is a required argument and must be an ARRAY reference"); |
|
8154
|
6
|
50
|
|
|
|
|
if (!y_sv || !SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8155
|
0
|
|
|
|
|
|
croak("var_test: 'y' is a required argument and must be an ARRAY reference"); |
|
8156
|
|
|
|
|
|
|
|
|
8157
|
6
|
50
|
|
|
|
|
if (ratio <= 0.0 || !isfinite(ratio)) |
|
|
|
50
|
|
|
|
|
|
|
8158
|
0
|
|
|
|
|
|
croak("var_test: 'ratio' must be a single positive number"); |
|
8159
|
6
|
50
|
|
|
|
|
if (conf_level <= 0.0 || conf_level >= 1.0 || !isfinite(conf_level)) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8160
|
0
|
|
|
|
|
|
croak("var_test: 'conf.level' must be a single number between 0 and 1"); |
|
8161
|
6
|
|
|
|
|
|
AV* restrict x_av = (AV*)SvRV(x_sv); |
|
8162
|
6
|
|
|
|
|
|
AV* restrict y_av = (AV*)SvRV(y_sv); |
|
8163
|
6
|
|
|
|
|
|
size_t nx_raw = av_len(x_av) + 1; |
|
8164
|
6
|
|
|
|
|
|
size_t ny_raw = av_len(y_av) + 1; |
|
8165
|
|
|
|
|
|
|
// --- Computation via Welford's Algorithm (ignoring NaNs) --- |
|
8166
|
6
|
|
|
|
|
|
NV mean_x = 0.0, M2_x = 0.0; |
|
8167
|
6
|
|
|
|
|
|
size_t nx = 0; |
|
8168
|
32
|
100
|
|
|
|
|
for (size_t i = 0; i < nx_raw; i++) { |
|
8169
|
26
|
|
|
|
|
|
SV** restrict tv = av_fetch(x_av, i, 0); |
|
8170
|
26
|
50
|
|
|
|
|
if (tv && SvOK(*tv) && looks_like_number(*tv)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8171
|
26
|
|
|
|
|
|
NV val = SvNV(*tv); |
|
8172
|
26
|
50
|
|
|
|
|
if (!isnan(val) && isfinite(val)) { |
|
|
|
50
|
|
|
|
|
|
|
8173
|
26
|
|
|
|
|
|
nx++; |
|
8174
|
26
|
|
|
|
|
|
NV delta = val - mean_x; |
|
8175
|
26
|
|
|
|
|
|
mean_x += delta / nx; |
|
8176
|
26
|
|
|
|
|
|
M2_x += delta * (val - mean_x); |
|
8177
|
|
|
|
|
|
|
} |
|
8178
|
|
|
|
|
|
|
} |
|
8179
|
|
|
|
|
|
|
} |
|
8180
|
|
|
|
|
|
|
|
|
8181
|
6
|
|
|
|
|
|
NV mean_y = 0.0, M2_y = 0.0; |
|
8182
|
6
|
|
|
|
|
|
size_t ny = 0; |
|
8183
|
27
|
100
|
|
|
|
|
for (size_t i = 0; i < ny_raw; i++) { |
|
8184
|
21
|
|
|
|
|
|
SV** restrict tv = av_fetch(y_av, i, 0); |
|
8185
|
21
|
50
|
|
|
|
|
if (tv && SvOK(*tv) && looks_like_number(*tv)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8186
|
21
|
|
|
|
|
|
NV val = SvNV(*tv); |
|
8187
|
21
|
50
|
|
|
|
|
if (!isnan(val) && isfinite(val)) { |
|
|
|
50
|
|
|
|
|
|
|
8188
|
21
|
|
|
|
|
|
ny++; |
|
8189
|
21
|
|
|
|
|
|
NV delta = val - mean_y; |
|
8190
|
21
|
|
|
|
|
|
mean_y += delta / ny; |
|
8191
|
21
|
|
|
|
|
|
M2_y += delta * (val - mean_y); |
|
8192
|
|
|
|
|
|
|
} |
|
8193
|
|
|
|
|
|
|
} |
|
8194
|
|
|
|
|
|
|
} |
|
8195
|
|
|
|
|
|
|
|
|
8196
|
6
|
100
|
|
|
|
|
if (nx < 2) croak("not enough 'x' observations"); |
|
8197
|
5
|
100
|
|
|
|
|
if (ny < 2) croak("not enough 'y' observations"); |
|
8198
|
|
|
|
|
|
|
|
|
8199
|
4
|
|
|
|
|
|
NV df_x = (NV)(nx - 1); |
|
8200
|
4
|
|
|
|
|
|
NV df_y = (NV)(ny - 1); |
|
8201
|
4
|
|
|
|
|
|
NV var_x = M2_x / df_x; |
|
8202
|
4
|
|
|
|
|
|
NV var_y = M2_y / df_y; |
|
8203
|
4
|
100
|
|
|
|
|
if (var_y == 0.0) croak("var_test: variance of 'y' is zero (cannot divide by zero)"); |
|
8204
|
|
|
|
|
|
|
// --- Statistics Math --- |
|
8205
|
3
|
|
|
|
|
|
NV estimate = var_x / var_y; |
|
8206
|
3
|
|
|
|
|
|
NV statistic = estimate / ratio; |
|
8207
|
3
|
|
|
|
|
|
NV p_val = pf(statistic, df_x, df_y); |
|
8208
|
3
|
|
|
|
|
|
NV ci_lower = 0.0, ci_upper = INFINITY; |
|
8209
|
3
|
50
|
|
|
|
|
if (strcmp(alternative, "less") == 0) { |
|
8210
|
0
|
|
|
|
|
|
ci_upper = estimate / qf_bisection(1.0 - conf_level, df_x, df_y); |
|
8211
|
3
|
50
|
|
|
|
|
} else if (strcmp(alternative, "greater") == 0) { |
|
8212
|
0
|
|
|
|
|
|
p_val = 1.0 - p_val; |
|
8213
|
0
|
|
|
|
|
|
ci_lower = estimate / qf_bisection(conf_level, df_x, df_y); |
|
8214
|
|
|
|
|
|
|
} else { |
|
8215
|
|
|
|
|
|
|
// two.sided |
|
8216
|
3
|
|
|
|
|
|
NV p1 = p_val; |
|
8217
|
3
|
|
|
|
|
|
NV p2 = 1.0 - p_val; |
|
8218
|
3
|
50
|
|
|
|
|
p_val = 2.0 * (p1 < p2 ? p1 : p2); |
|
8219
|
3
|
|
|
|
|
|
NV beta = (1.0 - conf_level) / 2.0; |
|
8220
|
3
|
|
|
|
|
|
ci_lower = estimate / qf_bisection(1.0 - beta, df_x, df_y); |
|
8221
|
3
|
|
|
|
|
|
ci_upper = estimate / qf_bisection(beta, df_x, df_y); |
|
8222
|
|
|
|
|
|
|
} |
|
8223
|
|
|
|
|
|
|
// --- Pack Results --- |
|
8224
|
3
|
|
|
|
|
|
HV* restrict results = newHV(); |
|
8225
|
3
|
|
|
|
|
|
hv_store(results, "statistic", 9, newSVnv(statistic), 0); |
|
8226
|
3
|
|
|
|
|
|
AV* restrict param_av = newAV(); |
|
8227
|
3
|
|
|
|
|
|
av_push(param_av, newSVnv(df_x)); |
|
8228
|
3
|
|
|
|
|
|
av_push(param_av, newSVnv(df_y)); |
|
8229
|
3
|
|
|
|
|
|
hv_store(results, "parameter", 9, newRV_noinc((SV*)param_av), 0); |
|
8230
|
3
|
|
|
|
|
|
hv_store(results, "p_value", 7, newSVnv(p_val), 0); |
|
8231
|
3
|
|
|
|
|
|
AV* restrict conf_int = newAV(); |
|
8232
|
3
|
|
|
|
|
|
av_push(conf_int, newSVnv(ci_lower)); |
|
8233
|
3
|
|
|
|
|
|
av_push(conf_int, newSVnv(ci_upper)); |
|
8234
|
3
|
|
|
|
|
|
hv_store(results, "conf_int", 8, newRV_noinc((SV*)conf_int), 0); |
|
8235
|
3
|
|
|
|
|
|
hv_store(results, "estimate", 8, newSVnv(estimate), 0); |
|
8236
|
3
|
|
|
|
|
|
hv_store(results, "null_value", 10, newSVnv(ratio), 0); |
|
8237
|
3
|
|
|
|
|
|
hv_store(results, "alternative", 11, newSVpv(alternative, 0), 0); |
|
8238
|
3
|
|
|
|
|
|
hv_store(results, "method", 6, newSVpv("F test to compare two variances", 0), 0); |
|
8239
|
3
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)results); |
|
8240
|
|
|
|
|
|
|
} |
|
8241
|
|
|
|
|
|
|
OUTPUT: |
|
8242
|
|
|
|
|
|
|
RETVAL |
|
8243
|
|
|
|
|
|
|
|
|
8244
|
|
|
|
|
|
|
SV *sample(ref, n = 1) |
|
8245
|
|
|
|
|
|
|
SV *ref |
|
8246
|
|
|
|
|
|
|
IV n |
|
8247
|
|
|
|
|
|
|
PREINIT: |
|
8248
|
6
|
50
|
|
|
|
|
SV *restrict ret = &PL_sv_undef; |
|
8249
|
|
|
|
|
|
|
CODE: |
|
8250
|
6
|
50
|
|
|
|
|
if (!PL_srand_called) { |
|
8251
|
0
|
|
|
|
|
|
(void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); |
|
8252
|
0
|
|
|
|
|
|
PL_srand_called = TRUE; |
|
8253
|
|
|
|
|
|
|
} |
|
8254
|
6
|
50
|
|
|
|
|
if (n < 0) n = 0; |
|
8255
|
6
|
50
|
|
|
|
|
if (SvROK(ref)) { |
|
8256
|
6
|
|
|
|
|
|
SV *restrict rv = SvRV(ref); |
|
8257
|
|
|
|
|
|
|
/* --- HASH REFERENCE --- */ |
|
8258
|
6
|
100
|
|
|
|
|
if (SvTYPE(rv) == SVt_PVHV) { |
|
8259
|
3
|
|
|
|
|
|
HV *restrict hv = (HV *)rv; |
|
8260
|
3
|
|
|
|
|
|
unsigned count = hv_iterinit(hv); |
|
8261
|
3
|
50
|
|
|
|
|
unsigned limit = (n < (IV)count) ? (I32)n : count; |
|
8262
|
3
|
|
|
|
|
|
HV *restrict ret_hv = newHV(); |
|
8263
|
|
|
|
|
|
|
|
|
8264
|
3
|
50
|
|
|
|
|
if (count > 0 && limit > 0) { |
|
|
|
50
|
|
|
|
|
|
|
8265
|
|
|
|
|
|
|
HE **restrict entries; |
|
8266
|
|
|
|
|
|
|
HE *restrict entry; |
|
8267
|
|
|
|
|
|
|
unsigned i; |
|
8268
|
3
|
|
|
|
|
|
Newx(entries, count, HE *); |
|
8269
|
|
|
|
|
|
|
/* Collect all HE pointers in one pass */ |
|
8270
|
3
|
|
|
|
|
|
i = 0; |
|
8271
|
15
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) |
|
8272
|
12
|
|
|
|
|
|
entries[i++] = entry; |
|
8273
|
|
|
|
|
|
|
|
|
8274
|
|
|
|
|
|
|
/* Partial Fisher-Yates (only 'limit' passes) */ |
|
8275
|
9
|
100
|
|
|
|
|
for (i = 0; i < limit; i++) { |
|
8276
|
6
|
|
|
|
|
|
I32 j = i + (I32)(Drand01() * (count - i)); |
|
8277
|
6
|
|
|
|
|
|
HE *restrict tmp = entries[i]; |
|
8278
|
6
|
|
|
|
|
|
entries[i] = entries[j]; |
|
8279
|
6
|
|
|
|
|
|
entries[j] = tmp; |
|
8280
|
|
|
|
|
|
|
} |
|
8281
|
|
|
|
|
|
|
|
|
8282
|
|
|
|
|
|
|
/* Pre-size result hash to avoid rehashing during population */ |
|
8283
|
3
|
|
|
|
|
|
hv_ksplit(ret_hv, limit); |
|
8284
|
|
|
|
|
|
|
|
|
8285
|
9
|
100
|
|
|
|
|
for (i = 0; i < limit; i++) { |
|
8286
|
6
|
|
|
|
|
|
HEK *restrict hek = HeKEY_hek(entries[i]); |
|
8287
|
|
|
|
|
|
|
/* |
|
8288
|
|
|
|
|
|
|
* hv_store() with a precomputed hash skips the hash |
|
8289
|
|
|
|
|
|
|
* computation entirely. Negative klen signals UTF-8. |
|
8290
|
|
|
|
|
|
|
*/ |
|
8291
|
6
|
50
|
|
|
|
|
(void)hv_store( |
|
8292
|
|
|
|
|
|
|
ret_hv, |
|
8293
|
|
|
|
|
|
|
HEK_KEY(hek), |
|
8294
|
|
|
|
|
|
|
HEK_UTF8(hek) ? -(I32)HEK_LEN(hek) : (I32)HEK_LEN(hek), |
|
8295
|
|
|
|
|
|
|
SvREFCNT_inc(HeVAL(entries[i])), /* HeVAL: direct macro, no call */ |
|
8296
|
|
|
|
|
|
|
HeHASH(entries[i]) /* reuse precomputed hash */ |
|
8297
|
|
|
|
|
|
|
); |
|
8298
|
|
|
|
|
|
|
} |
|
8299
|
3
|
|
|
|
|
|
Safefree(entries); |
|
8300
|
|
|
|
|
|
|
} |
|
8301
|
3
|
|
|
|
|
|
ret = newRV_noinc((SV *)ret_hv); |
|
8302
|
3
|
50
|
|
|
|
|
} else if (SvTYPE(rv) == SVt_PVAV) {/* --- ARRAY REFERENCE --- */ |
|
8303
|
3
|
|
|
|
|
|
AV *restrict av = (AV *)rv; |
|
8304
|
3
|
50
|
|
|
|
|
size_t count = av_top_index(av) + 1; /* signed; 0 for empty AV */ |
|
8305
|
3
|
|
|
|
|
|
size_t limit = (n < count) ? (size_t)n : count; |
|
8306
|
3
|
|
|
|
|
|
AV *restrict ret_av = newAV(); |
|
8307
|
|
|
|
|
|
|
/* Pre-allocate the result array to avoid incremental reallocs */ |
|
8308
|
3
|
50
|
|
|
|
|
if (n > 0) |
|
8309
|
3
|
|
|
|
|
|
av_extend(ret_av, (size_t)n - 1); |
|
8310
|
3
|
50
|
|
|
|
|
if (count > 0) { |
|
8311
|
3
|
|
|
|
|
|
SV **restrict src = AvARRAY(av); /* direct pointer into AV's C array */ |
|
8312
|
|
|
|
|
|
|
size_t *restrict idx; |
|
8313
|
|
|
|
|
|
|
|
|
8314
|
|
|
|
|
|
|
/* Shuffle indices rather than SV** to keep the original AV intact */ |
|
8315
|
3
|
50
|
|
|
|
|
Newx(idx, count, size_t); |
|
8316
|
18
|
100
|
|
|
|
|
for (size_t i = 0; i < count; i++) |
|
8317
|
15
|
|
|
|
|
|
idx[i] = i; |
|
8318
|
|
|
|
|
|
|
// Partial Fisher-Yates on the index array |
|
8319
|
9
|
100
|
|
|
|
|
for (size_t i = 0; i < limit; i++) { |
|
8320
|
6
|
|
|
|
|
|
size_t j = i + (size_t)(Drand01() * (count - i)); |
|
8321
|
6
|
|
|
|
|
|
size_t tmp = idx[i]; |
|
8322
|
6
|
|
|
|
|
|
idx[i] = idx[j]; |
|
8323
|
6
|
|
|
|
|
|
idx[j] = tmp; |
|
8324
|
|
|
|
|
|
|
} |
|
8325
|
|
|
|
|
|
|
|
|
8326
|
9
|
100
|
|
|
|
|
for (size_t i = 0; i < (size_t)n; i++) { |
|
8327
|
6
|
50
|
|
|
|
|
if (i < limit) { |
|
8328
|
6
|
|
|
|
|
|
SV *restrict sv = src[idx[i]]; /* AvARRAY direct access — no av_fetch call */ |
|
8329
|
|
|
|
|
|
|
SV *restrict push_sv; |
|
8330
|
6
|
50
|
|
|
|
|
if (sv && sv != &PL_sv_undef) |
|
|
|
50
|
|
|
|
|
|
|
8331
|
6
|
|
|
|
|
|
push_sv = SvREFCNT_inc(sv); |
|
8332
|
|
|
|
|
|
|
else |
|
8333
|
0
|
|
|
|
|
|
push_sv = newSV(0); |
|
8334
|
6
|
|
|
|
|
|
av_push(ret_av, push_sv); |
|
8335
|
|
|
|
|
|
|
} else { |
|
8336
|
0
|
|
|
|
|
|
av_push(ret_av, newSV(0)); |
|
8337
|
|
|
|
|
|
|
} |
|
8338
|
|
|
|
|
|
|
} |
|
8339
|
3
|
|
|
|
|
|
Safefree(idx); |
|
8340
|
|
|
|
|
|
|
} else { |
|
8341
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < (size_t)n; i++) |
|
8342
|
0
|
|
|
|
|
|
av_push(ret_av, newSV(0)); |
|
8343
|
|
|
|
|
|
|
} |
|
8344
|
3
|
|
|
|
|
|
ret = newRV_noinc((SV *)ret_av); |
|
8345
|
|
|
|
|
|
|
} |
|
8346
|
|
|
|
|
|
|
} |
|
8347
|
6
|
|
|
|
|
|
RETVAL = ret; |
|
8348
|
|
|
|
|
|
|
OUTPUT: |
|
8349
|
|
|
|
|
|
|
RETVAL |
|
8350
|
|
|
|
|
|
|
|
|
8351
|
|
|
|
|
|
|
SV* dnorm(...) |
|
8352
|
|
|
|
|
|
|
CODE: |
|
8353
|
|
|
|
|
|
|
{ |
|
8354
|
23
|
50
|
|
|
|
|
if (items < 1) { |
|
8355
|
0
|
|
|
|
|
|
croak("Usage: dnorm(x), dnorm(x, mean => 0, sd => 1, log => 0)"); |
|
8356
|
|
|
|
|
|
|
} |
|
8357
|
23
|
|
|
|
|
|
SV*restrict x_sv = ST(0); |
|
8358
|
23
|
|
|
|
|
|
NV mean = 0.0, sd = 1.0; /*defaults*/ |
|
8359
|
23
|
|
|
|
|
|
bool give_log = 0; |
|
8360
|
|
|
|
|
|
|
// --- Parse remaining named arguments from the flat stack --- |
|
8361
|
23
|
50
|
|
|
|
|
if ((items - 1) % 2 != 0) { |
|
8362
|
0
|
|
|
|
|
|
croak("dnorm: Expected an even number of key-value named arguments after 'x'"); |
|
8363
|
|
|
|
|
|
|
} |
|
8364
|
32
|
100
|
|
|
|
|
for (size_t i = 1; i < items; i += 2) { |
|
8365
|
9
|
|
|
|
|
|
const char* restrict key = SvPV_nolen(ST(i)); |
|
8366
|
9
|
|
|
|
|
|
SV* restrict val = ST(i + 1); |
|
8367
|
9
|
100
|
|
|
|
|
if (strEQ(key, "mean")) mean = SvNV(val); |
|
8368
|
6
|
100
|
|
|
|
|
else if (strEQ(key, "sd")) sd = SvNV(val); |
|
8369
|
2
|
50
|
|
|
|
|
else if (strEQ(key, "log")) give_log = SvTRUE(val) ? 1 : 0; |
|
8370
|
0
|
|
|
|
|
|
else croak("dnorm: unknown argument '%s'", key); |
|
8371
|
|
|
|
|
|
|
} |
|
8372
|
|
|
|
|
|
|
// --- Branch based on scalar vs. arrayref for 'x' --- |
|
8373
|
24
|
100
|
|
|
|
|
if (SvROK(x_sv) && SvTYPE(SvRV(x_sv)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
8374
|
|
|
|
|
|
|
// x is an array reference |
|
8375
|
1
|
|
|
|
|
|
AV *restrict x_av = (AV*)SvRV(x_sv); |
|
8376
|
1
|
|
|
|
|
|
IV n = av_len(x_av) + 1; |
|
8377
|
1
|
|
|
|
|
|
AV *restrict result_av = newAV(); |
|
8378
|
1
|
50
|
|
|
|
|
if (n > 0) { |
|
8379
|
1
|
|
|
|
|
|
av_extend(result_av, n - 1); |
|
8380
|
4
|
100
|
|
|
|
|
for (IV i = 0; i < n; i++) { |
|
8381
|
3
|
|
|
|
|
|
SV **restrict elem = av_fetch(x_av, i, 0); |
|
8382
|
3
|
50
|
|
|
|
|
NV x_val = (elem && *elem) ? SvNV(*elem) : NAN; |
|
|
|
50
|
|
|
|
|
|
|
8383
|
3
|
|
|
|
|
|
NV res = c_dnorm(x_val, mean, sd, give_log); |
|
8384
|
3
|
|
|
|
|
|
av_store(result_av, i, newSVnv(res)); |
|
8385
|
|
|
|
|
|
|
} |
|
8386
|
|
|
|
|
|
|
} |
|
8387
|
1
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)result_av); |
|
8388
|
|
|
|
|
|
|
} else { |
|
8389
|
|
|
|
|
|
|
// x is a single numeric scalar |
|
8390
|
22
|
|
|
|
|
|
NV x_val = SvNV(x_sv); |
|
8391
|
22
|
|
|
|
|
|
NV res = c_dnorm(x_val, mean, sd, give_log); |
|
8392
|
22
|
|
|
|
|
|
RETVAL = newSVnv(res); |
|
8393
|
|
|
|
|
|
|
} |
|
8394
|
|
|
|
|
|
|
} |
|
8395
|
|
|
|
|
|
|
OUTPUT: |
|
8396
|
|
|
|
|
|
|
RETVAL |
|
8397
|
|
|
|
|
|
|
|
|
8398
|
|
|
|
|
|
|
void ljoin(h_ref, i_ref) |
|
8399
|
|
|
|
|
|
|
SV *h_ref; |
|
8400
|
|
|
|
|
|
|
SV *i_ref; |
|
8401
|
|
|
|
|
|
|
PREINIT: |
|
8402
|
|
|
|
|
|
|
HV *restrict h_hv, *restrict i_hv; |
|
8403
|
|
|
|
|
|
|
HE *restrict h_entry; |
|
8404
|
|
|
|
|
|
|
CODE: |
|
8405
|
|
|
|
|
|
|
/* 1. Validate inputs are hash references */ |
|
8406
|
4
|
50
|
|
|
|
|
if (!SvROK(h_ref) || SvTYPE(SvRV(h_ref)) != SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
8407
|
0
|
|
|
|
|
|
croak("First argument to ljoin must be a hash reference"); |
|
8408
|
|
|
|
|
|
|
} |
|
8409
|
4
|
50
|
|
|
|
|
if (!SvROK(i_ref) || SvTYPE(SvRV(i_ref)) != SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
8410
|
0
|
|
|
|
|
|
croak("Second argument to ljoin must be a hash reference"); |
|
8411
|
|
|
|
|
|
|
} |
|
8412
|
4
|
|
|
|
|
|
h_hv = (HV *)SvRV(h_ref); |
|
8413
|
4
|
|
|
|
|
|
i_hv = (HV *)SvRV(i_ref); |
|
8414
|
|
|
|
|
|
|
/* 2. Iterate through the primary hash ($h) */ |
|
8415
|
4
|
|
|
|
|
|
hv_iterinit(h_hv); |
|
8416
|
8
|
100
|
|
|
|
|
while ((h_entry = hv_iternext(h_hv))) { |
|
8417
|
4
|
|
|
|
|
|
SV *restrict row_key_sv = hv_iterkeysv(h_entry); |
|
8418
|
4
|
|
|
|
|
|
SV *restrict h_row_sv = hv_iterval(h_hv, h_entry); |
|
8419
|
|
|
|
|
|
|
// 3. Check if this row key exists in the secondary hash ($i) |
|
8420
|
4
|
|
|
|
|
|
HE *restrict i_fetch_he = hv_fetch_ent(i_hv, row_key_sv, 0, 0); |
|
8421
|
4
|
50
|
|
|
|
|
if (i_fetch_he) { |
|
8422
|
4
|
|
|
|
|
|
SV *restrict i_row_sv = HeVAL(i_fetch_he); |
|
8423
|
|
|
|
|
|
|
// 4. Ensure $h->{row} is a Hash and $i->{row} is a valid reference |
|
8424
|
4
|
100
|
|
|
|
|
if (SvROK(h_row_sv) && SvTYPE(SvRV(h_row_sv)) == SVt_PVHV && SvROK(i_row_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8425
|
3
|
|
|
|
|
|
HV *restrict h_row_hv = (HV *)SvRV(h_row_sv); |
|
8426
|
|
|
|
|
|
|
/* Case A: $i->{row} is a Hash Reference */ |
|
8427
|
3
|
100
|
|
|
|
|
if (SvTYPE(SvRV(i_row_sv)) == SVt_PVHV) { |
|
8428
|
2
|
|
|
|
|
|
HV *restrict i_row_hv = (HV *)SvRV(i_row_sv); |
|
8429
|
|
|
|
|
|
|
HE *restrict i_entry; |
|
8430
|
2
|
|
|
|
|
|
hv_iterinit(i_row_hv); |
|
8431
|
4
|
100
|
|
|
|
|
while ((i_entry = hv_iternext(i_row_hv))) { |
|
8432
|
2
|
|
|
|
|
|
SV *restrict col_key_sv = hv_iterkeysv(i_entry); |
|
8433
|
2
|
|
|
|
|
|
SV *restrict col_val = hv_iterval(i_row_hv, i_entry); |
|
8434
|
2
|
|
|
|
|
|
hv_store_ent(h_row_hv, col_key_sv, SvREFCNT_inc(col_val), 0); |
|
8435
|
|
|
|
|
|
|
} |
|
8436
|
1
|
50
|
|
|
|
|
} else if (SvTYPE(SvRV(i_row_sv)) == SVt_PVAV) { |
|
8437
|
|
|
|
|
|
|
// Case B: $i->{row} is an Array Reference |
|
8438
|
1
|
|
|
|
|
|
AV *restrict i_row_av = (AV *)SvRV(i_row_sv); |
|
8439
|
|
|
|
|
|
|
// av_len returns the top index (length - 1) |
|
8440
|
1
|
|
|
|
|
|
SSize_t top_idx = av_len(i_row_av); |
|
8441
|
|
|
|
|
|
|
// Iterate through the array in chunks of 2 (key-value pairs) |
|
8442
|
3
|
100
|
|
|
|
|
for (SSize_t idx = 0; idx < top_idx; idx += 2) { |
|
8443
|
2
|
|
|
|
|
|
SV **restrict key_svp = av_fetch(i_row_av, idx, 0); |
|
8444
|
2
|
|
|
|
|
|
SV **restrict val_svp = av_fetch(i_row_av, idx + 1, 0); |
|
8445
|
|
|
|
|
|
|
// Ensure both the key and value exist in the array |
|
8446
|
2
|
50
|
|
|
|
|
if (key_svp && val_svp) { |
|
|
|
50
|
|
|
|
|
|
|
8447
|
2
|
|
|
|
|
|
hv_store_ent(h_row_hv, *key_svp, SvREFCNT_inc(*val_svp), 0); |
|
8448
|
|
|
|
|
|
|
} |
|
8449
|
|
|
|
|
|
|
} |
|
8450
|
|
|
|
|
|
|
} |
|
8451
|
|
|
|
|
|
|
} |
|
8452
|
|
|
|
|
|
|
} |
|
8453
|
|
|
|
|
|
|
} |
|
8454
|
|
|
|
|
|
|
|
|
8455
|
|
|
|
|
|
|
void add_data(h_ref, i_ref) |
|
8456
|
|
|
|
|
|
|
SV *h_ref; |
|
8457
|
|
|
|
|
|
|
SV *i_ref; |
|
8458
|
|
|
|
|
|
|
PREINIT: |
|
8459
|
14
|
|
|
|
|
|
short int target_root_mode = 0; // 1 = Hash, 2 = Array |
|
8460
|
14
|
|
|
|
|
|
short int i_root_mode = 0; // 1 = Hash, 2 = Array |
|
8461
|
14
|
|
|
|
|
|
short int target_inner_mode = 0; // 0 = Unknown, 1 = Hash, 2 = Array |
|
8462
|
|
|
|
|
|
|
CODE: |
|
8463
|
|
|
|
|
|
|
// 1. Validate inputs (Allow both Hash and Array references at the root) |
|
8464
|
14
|
100
|
|
|
|
|
if (!SvROK(h_ref) || (SvTYPE(SvRV(h_ref)) != SVt_PVHV && SvTYPE(SvRV(h_ref)) != SVt_PVAV)) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8465
|
1
|
|
|
|
|
|
croak("1st argument to add_data must be a hash or array reference"); |
|
8466
|
|
|
|
|
|
|
} |
|
8467
|
13
|
100
|
|
|
|
|
if (!SvROK(i_ref) || (SvTYPE(SvRV(i_ref)) != SVt_PVHV && SvTYPE(SvRV(i_ref)) != SVt_PVAV)) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8468
|
1
|
|
|
|
|
|
croak("2nd argument to add_data must be a hash or array reference"); |
|
8469
|
|
|
|
|
|
|
} |
|
8470
|
12
|
100
|
|
|
|
|
target_root_mode = (SvTYPE(SvRV(h_ref)) == SVt_PVHV) ? 1 : 2; |
|
8471
|
12
|
100
|
|
|
|
|
i_root_mode = (SvTYPE(SvRV(i_ref)) == SVt_PVHV) ? 1 : 2; |
|
8472
|
|
|
|
|
|
|
// Probe h_ref for inner structure |
|
8473
|
12
|
100
|
|
|
|
|
if (target_root_mode == 1) { |
|
8474
|
10
|
|
|
|
|
|
HV *restrict h_hv = (HV *)SvRV(h_ref); |
|
8475
|
10
|
50
|
|
|
|
|
if (HvKEYS(h_hv) > 0) { |
|
|
|
100
|
|
|
|
|
|
|
8476
|
8
|
|
|
|
|
|
HE **restrict probe_array = HvARRAY(h_hv); |
|
8477
|
8
|
|
|
|
|
|
STRLEN probe_max = HvMAX(h_hv); |
|
8478
|
67
|
100
|
|
|
|
|
for (STRLEN p_idx = 0; p_idx <= probe_max && target_inner_mode == 0; p_idx++) { |
|
|
|
100
|
|
|
|
|
|
|
8479
|
67
|
100
|
|
|
|
|
for (HE *restrict p_entry = probe_array[p_idx]; p_entry && target_inner_mode == 0; p_entry = HeNEXT(p_entry)) { |
|
|
|
50
|
|
|
|
|
|
|
8480
|
8
|
|
|
|
|
|
SV *restrict val = HeVAL(p_entry); |
|
8481
|
8
|
50
|
|
|
|
|
if (SvROK(val)) { |
|
8482
|
8
|
100
|
|
|
|
|
if (SvTYPE(SvRV(val)) == SVt_PVHV) target_inner_mode = 1; |
|
8483
|
3
|
50
|
|
|
|
|
else if (SvTYPE(SvRV(val)) == SVt_PVAV) target_inner_mode = 2; |
|
8484
|
|
|
|
|
|
|
} |
|
8485
|
|
|
|
|
|
|
} |
|
8486
|
|
|
|
|
|
|
} |
|
8487
|
|
|
|
|
|
|
} |
|
8488
|
|
|
|
|
|
|
} else { |
|
8489
|
2
|
|
|
|
|
|
AV *restrict h_av = (AV *)SvRV(h_ref); |
|
8490
|
2
|
|
|
|
|
|
SSize_t top = av_len(h_av); |
|
8491
|
4
|
100
|
|
|
|
|
for (SSize_t p_idx = 0; p_idx <= top && target_inner_mode == 0; p_idx++) { |
|
|
|
50
|
|
|
|
|
|
|
8492
|
2
|
|
|
|
|
|
SV **restrict svp = av_fetch(h_av, p_idx, 0); |
|
8493
|
2
|
50
|
|
|
|
|
if (svp && *svp && SvROK(*svp)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8494
|
2
|
50
|
|
|
|
|
if (SvTYPE(SvRV(*svp)) == SVt_PVHV) target_inner_mode = 1; |
|
8495
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(*svp)) == SVt_PVAV) target_inner_mode = 2; |
|
8496
|
|
|
|
|
|
|
} |
|
8497
|
|
|
|
|
|
|
} |
|
8498
|
|
|
|
|
|
|
} |
|
8499
|
|
|
|
|
|
|
// Target is empty, infer intent from source hash/array |
|
8500
|
12
|
100
|
|
|
|
|
if (target_inner_mode == 0) { |
|
8501
|
2
|
50
|
|
|
|
|
if (i_root_mode == 1) { |
|
8502
|
2
|
|
|
|
|
|
HV *restrict i_hv = (HV *)SvRV(i_ref); |
|
8503
|
2
|
50
|
|
|
|
|
if (HvKEYS(i_hv) > 0) { |
|
|
|
50
|
|
|
|
|
|
|
8504
|
2
|
|
|
|
|
|
HE **restrict probe_array = HvARRAY(i_hv); |
|
8505
|
2
|
|
|
|
|
|
STRLEN probe_max = HvMAX(i_hv); |
|
8506
|
18
|
100
|
|
|
|
|
for (STRLEN p_idx = 0; p_idx <= probe_max && target_inner_mode == 0; p_idx++) { |
|
|
|
50
|
|
|
|
|
|
|
8507
|
18
|
100
|
|
|
|
|
for (HE *restrict p_entry = probe_array[p_idx]; p_entry && target_inner_mode == 0; p_entry = HeNEXT(p_entry)) { |
|
|
|
50
|
|
|
|
|
|
|
8508
|
2
|
|
|
|
|
|
SV *restrict val = HeVAL(p_entry); |
|
8509
|
2
|
50
|
|
|
|
|
if (SvROK(val)) { |
|
8510
|
2
|
100
|
|
|
|
|
if (SvTYPE(SvRV(val)) == SVt_PVHV) target_inner_mode = 1; |
|
8511
|
1
|
50
|
|
|
|
|
else if (SvTYPE(SvRV(val)) == SVt_PVAV) target_inner_mode = 2; |
|
8512
|
|
|
|
|
|
|
} |
|
8513
|
|
|
|
|
|
|
} |
|
8514
|
|
|
|
|
|
|
} |
|
8515
|
|
|
|
|
|
|
} |
|
8516
|
|
|
|
|
|
|
} else { |
|
8517
|
0
|
|
|
|
|
|
AV *restrict i_av = (AV *)SvRV(i_ref); |
|
8518
|
0
|
|
|
|
|
|
SSize_t top = av_len(i_av); |
|
8519
|
0
|
0
|
|
|
|
|
for (SSize_t p_idx = 0; p_idx <= top && target_inner_mode == 0; p_idx++) { |
|
|
|
0
|
|
|
|
|
|
|
8520
|
0
|
|
|
|
|
|
SV **restrict svp = av_fetch(i_av, p_idx, 0); |
|
8521
|
0
|
0
|
|
|
|
|
if (svp && *svp && SvROK(*svp)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8522
|
0
|
0
|
|
|
|
|
if (SvTYPE(SvRV(*svp)) == SVt_PVHV) target_inner_mode = 1; |
|
8523
|
0
|
0
|
|
|
|
|
else if (SvTYPE(SvRV(*svp)) == SVt_PVAV) target_inner_mode = 2; |
|
8524
|
|
|
|
|
|
|
} |
|
8525
|
|
|
|
|
|
|
} |
|
8526
|
|
|
|
|
|
|
} |
|
8527
|
|
|
|
|
|
|
} |
|
8528
|
12
|
50
|
|
|
|
|
if (target_inner_mode == 0) { target_inner_mode = 1; } |
|
8529
|
|
|
|
|
|
|
// 2. Iterate through the SECONDARY structure ($i) using a unified loop |
|
8530
|
12
|
|
|
|
|
|
SSize_t i_idx = 0, i_top = -1; |
|
8531
|
12
|
|
|
|
|
|
HV *restrict i_hv = NULL; |
|
8532
|
12
|
|
|
|
|
|
AV *restrict i_av = NULL; |
|
8533
|
12
|
100
|
|
|
|
|
if (i_root_mode == 1) { |
|
8534
|
10
|
|
|
|
|
|
i_hv = (HV *)SvRV(i_ref); |
|
8535
|
10
|
|
|
|
|
|
hv_iterinit(i_hv); |
|
8536
|
|
|
|
|
|
|
} else { |
|
8537
|
2
|
|
|
|
|
|
i_av = (AV *)SvRV(i_ref); |
|
8538
|
2
|
|
|
|
|
|
i_top = av_len(i_av); |
|
8539
|
|
|
|
|
|
|
} |
|
8540
|
24
|
|
|
|
|
|
while (1) { |
|
8541
|
36
|
|
|
|
|
|
SV *restrict row_key_sv = NULL; |
|
8542
|
36
|
|
|
|
|
|
SV *restrict i_row_sv = NULL; |
|
8543
|
36
|
|
|
|
|
|
SSize_t current_idx = 0; |
|
8544
|
36
|
100
|
|
|
|
|
if (i_root_mode == 1) { |
|
8545
|
30
|
|
|
|
|
|
HE *restrict i_entry = hv_iternext(i_hv); |
|
8546
|
30
|
100
|
|
|
|
|
if (!i_entry) break; |
|
8547
|
20
|
|
|
|
|
|
row_key_sv = hv_iterkeysv(i_entry); |
|
8548
|
20
|
|
|
|
|
|
i_row_sv = hv_iterval(i_hv, i_entry); |
|
8549
|
|
|
|
|
|
|
// Prep integer index in case target is an Array (Suppress warnings for non-numeric string keys) |
|
8550
|
20
|
100
|
|
|
|
|
current_idx = looks_like_number(row_key_sv) ? SvIV(row_key_sv) : -1; |
|
8551
|
|
|
|
|
|
|
} else { |
|
8552
|
6
|
100
|
|
|
|
|
if (i_idx > i_top) break; |
|
8553
|
4
|
|
|
|
|
|
current_idx = i_idx++; |
|
8554
|
4
|
|
|
|
|
|
SV **restrict svp = av_fetch(i_av, current_idx, 0); |
|
8555
|
4
|
50
|
|
|
|
|
if (!svp || !*svp) continue; |
|
|
|
50
|
|
|
|
|
|
|
8556
|
4
|
|
|
|
|
|
i_row_sv = *svp; |
|
8557
|
|
|
|
|
|
|
// Prep string key in case target is a Hash |
|
8558
|
4
|
|
|
|
|
|
row_key_sv = sv_2mortal(newSViv(current_idx)); |
|
8559
|
|
|
|
|
|
|
} |
|
8560
|
24
|
100
|
|
|
|
|
if (SvROK(i_row_sv)) { |
|
8561
|
23
|
|
|
|
|
|
SV *restrict h_row_sv = NULL; |
|
8562
|
23
|
|
|
|
|
|
HV *restrict h_row_hv = NULL; |
|
8563
|
23
|
|
|
|
|
|
AV *restrict h_row_av = NULL; |
|
8564
|
|
|
|
|
|
|
// 3. Fetch from $h |
|
8565
|
23
|
100
|
|
|
|
|
if (target_root_mode == 1) { |
|
8566
|
18
|
|
|
|
|
|
HE *restrict h_fetch_he = hv_fetch_ent((HV *)SvRV(h_ref), row_key_sv, 0, 0); |
|
8567
|
18
|
100
|
|
|
|
|
if (h_fetch_he) h_row_sv = HeVAL(h_fetch_he); |
|
8568
|
|
|
|
|
|
|
} else { |
|
8569
|
5
|
100
|
|
|
|
|
if (current_idx >= 0) { |
|
8570
|
4
|
|
|
|
|
|
SV **restrict h_fetch_svp = av_fetch((AV *)SvRV(h_ref), current_idx, 0); |
|
8571
|
4
|
100
|
|
|
|
|
if (h_fetch_svp && *h_fetch_svp) h_row_sv = *h_fetch_svp; |
|
|
|
50
|
|
|
|
|
|
|
8572
|
|
|
|
|
|
|
} |
|
8573
|
|
|
|
|
|
|
} |
|
8574
|
23
|
100
|
|
|
|
|
if (h_row_sv && SvROK(h_row_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
8575
|
11
|
100
|
|
|
|
|
if (SvTYPE(SvRV(h_row_sv)) == SVt_PVHV) { |
|
8576
|
7
|
|
|
|
|
|
h_row_hv = (HV *)SvRV(h_row_sv); |
|
8577
|
4
|
50
|
|
|
|
|
} else if (SvTYPE(SvRV(h_row_sv)) == SVt_PVAV) { |
|
8578
|
4
|
|
|
|
|
|
h_row_av = (AV *)SvRV(h_row_sv); |
|
8579
|
|
|
|
|
|
|
} |
|
8580
|
|
|
|
|
|
|
} |
|
8581
|
|
|
|
|
|
|
// 4. Row DOES NOT exist (or is incompatible type): Create it matching target_inner_mode |
|
8582
|
23
|
100
|
|
|
|
|
if (!h_row_hv && !h_row_av) { |
|
|
|
100
|
|
|
|
|
|
|
8583
|
12
|
100
|
|
|
|
|
if (target_inner_mode == 2) { |
|
8584
|
3
|
|
|
|
|
|
h_row_av = newAV(); |
|
8585
|
3
|
|
|
|
|
|
h_row_sv = newRV_noinc((SV *)h_row_av); |
|
8586
|
|
|
|
|
|
|
} else { |
|
8587
|
9
|
|
|
|
|
|
h_row_hv = newHV(); |
|
8588
|
9
|
|
|
|
|
|
h_row_sv = newRV_noinc((SV *)h_row_hv); |
|
8589
|
|
|
|
|
|
|
} |
|
8590
|
12
|
100
|
|
|
|
|
if (target_root_mode == 1) { |
|
8591
|
9
|
|
|
|
|
|
hv_store_ent((HV *)SvRV(h_ref), row_key_sv, h_row_sv, 0); |
|
8592
|
|
|
|
|
|
|
} else { |
|
8593
|
3
|
100
|
|
|
|
|
if (current_idx >= 0) { |
|
8594
|
2
|
|
|
|
|
|
av_store((AV *)SvRV(h_ref), current_idx, h_row_sv); |
|
8595
|
|
|
|
|
|
|
} |
|
8596
|
|
|
|
|
|
|
} |
|
8597
|
|
|
|
|
|
|
} |
|
8598
|
|
|
|
|
|
|
// 5. Merge data across potentially mismatched inner structures |
|
8599
|
23
|
100
|
|
|
|
|
if (h_row_hv) { |
|
8600
|
16
|
100
|
|
|
|
|
if (SvTYPE(SvRV(i_row_sv)) == SVt_PVHV) { |
|
8601
|
|
|
|
|
|
|
// Hash into Hash (Direct copy) |
|
8602
|
12
|
|
|
|
|
|
HV *restrict i_inner_hv = (HV *)SvRV(i_row_sv); |
|
8603
|
|
|
|
|
|
|
HE *restrict i_inner_entry; |
|
8604
|
12
|
|
|
|
|
|
hv_iterinit(i_inner_hv); |
|
8605
|
25
|
100
|
|
|
|
|
while ((i_inner_entry = hv_iternext(i_inner_hv))) { |
|
8606
|
13
|
|
|
|
|
|
SV *restrict col_key_sv = hv_iterkeysv(i_inner_entry); |
|
8607
|
13
|
|
|
|
|
|
SV *restrict col_val = hv_iterval(i_inner_hv, i_inner_entry); |
|
8608
|
13
|
|
|
|
|
|
hv_store_ent(h_row_hv, col_key_sv, SvREFCNT_inc(col_val), 0); |
|
8609
|
|
|
|
|
|
|
} |
|
8610
|
4
|
50
|
|
|
|
|
} else if (SvTYPE(SvRV(i_row_sv)) == SVt_PVAV) { |
|
8611
|
|
|
|
|
|
|
// Array into Hash (Read pairs) |
|
8612
|
4
|
|
|
|
|
|
AV *restrict i_inner_av = (AV *)SvRV(i_row_sv); |
|
8613
|
4
|
|
|
|
|
|
SSize_t inner_top_idx = av_len(i_inner_av); |
|
8614
|
10
|
100
|
|
|
|
|
for (SSize_t idx = 0; idx < inner_top_idx; idx += 2) { |
|
8615
|
6
|
|
|
|
|
|
SV **restrict key_svp = av_fetch(i_inner_av, idx, 0); |
|
8616
|
6
|
|
|
|
|
|
SV **restrict val_svp = av_fetch(i_inner_av, idx + 1, 0); |
|
8617
|
6
|
50
|
|
|
|
|
if (key_svp && *key_svp && val_svp) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8618
|
6
|
50
|
|
|
|
|
SV *restrict val_to_store = *val_svp ? *val_svp : &PL_sv_undef; |
|
8619
|
6
|
|
|
|
|
|
hv_store_ent(h_row_hv, *key_svp, SvREFCNT_inc(val_to_store), 0); |
|
8620
|
|
|
|
|
|
|
} |
|
8621
|
|
|
|
|
|
|
} |
|
8622
|
|
|
|
|
|
|
} |
|
8623
|
7
|
50
|
|
|
|
|
} else if (h_row_av) { |
|
8624
|
7
|
100
|
|
|
|
|
if (SvTYPE(SvRV(i_row_sv)) == SVt_PVAV) { |
|
8625
|
|
|
|
|
|
|
// Array into Array (Direct push with non-null pointer assurance) |
|
8626
|
5
|
|
|
|
|
|
AV *restrict i_inner_av = (AV *)SvRV(i_row_sv); |
|
8627
|
5
|
|
|
|
|
|
SSize_t inner_top_idx = av_len(i_inner_av); |
|
8628
|
16
|
100
|
|
|
|
|
for (SSize_t idx = 0; idx <= inner_top_idx; ++idx) { |
|
8629
|
11
|
|
|
|
|
|
SV **restrict val_svp = av_fetch(i_inner_av, idx, 0); |
|
8630
|
11
|
50
|
|
|
|
|
if (val_svp) { |
|
8631
|
11
|
50
|
|
|
|
|
SV *restrict val_to_push = *val_svp ? *val_svp : &PL_sv_undef; |
|
8632
|
11
|
|
|
|
|
|
SV *restrict sv_inc = SvREFCNT_inc(val_to_push); |
|
8633
|
11
|
50
|
|
|
|
|
if (sv_inc) { |
|
8634
|
11
|
|
|
|
|
|
av_push(h_row_av, sv_inc); |
|
8635
|
|
|
|
|
|
|
} |
|
8636
|
|
|
|
|
|
|
} |
|
8637
|
|
|
|
|
|
|
} |
|
8638
|
2
|
50
|
|
|
|
|
} else if (SvTYPE(SvRV(i_row_sv)) == SVt_PVHV) { |
|
8639
|
|
|
|
|
|
|
// Hash into Array (Flatten and push pairs with non-null pointer assurance) |
|
8640
|
2
|
|
|
|
|
|
HV *restrict i_inner_hv = (HV *)SvRV(i_row_sv); |
|
8641
|
|
|
|
|
|
|
HE *restrict i_inner_entry; |
|
8642
|
2
|
|
|
|
|
|
hv_iterinit(i_inner_hv); |
|
8643
|
4
|
100
|
|
|
|
|
while ((i_inner_entry = hv_iternext(i_inner_hv))) { |
|
8644
|
2
|
|
|
|
|
|
SV *restrict col_key_sv = hv_iterkeysv(i_inner_entry); |
|
8645
|
2
|
|
|
|
|
|
SV *restrict col_val = hv_iterval(i_inner_hv, i_inner_entry); |
|
8646
|
2
|
50
|
|
|
|
|
if (col_key_sv && col_val) { |
|
|
|
50
|
|
|
|
|
|
|
8647
|
2
|
|
|
|
|
|
SV *restrict sv_key_inc = SvREFCNT_inc(col_key_sv); |
|
8648
|
2
|
|
|
|
|
|
SV *restrict sv_val_inc = SvREFCNT_inc(col_val); |
|
8649
|
2
|
50
|
|
|
|
|
if (sv_key_inc && sv_val_inc) { |
|
|
|
50
|
|
|
|
|
|
|
8650
|
2
|
|
|
|
|
|
av_push(h_row_av, sv_key_inc); |
|
8651
|
2
|
|
|
|
|
|
av_push(h_row_av, sv_val_inc); |
|
8652
|
|
|
|
|
|
|
} |
|
8653
|
|
|
|
|
|
|
} |
|
8654
|
|
|
|
|
|
|
} |
|
8655
|
|
|
|
|
|
|
} |
|
8656
|
|
|
|
|
|
|
} |
|
8657
|
|
|
|
|
|
|
} |
|
8658
|
|
|
|
|
|
|
} |
|
8659
|
|
|
|
|
|
|
|
|
8660
|
|
|
|
|
|
|
SV* value_counts(...) |
|
8661
|
|
|
|
|
|
|
PREINIT: |
|
8662
|
|
|
|
|
|
|
HV*restrict counts_hv; |
|
8663
|
|
|
|
|
|
|
SV*restrict arg1; |
|
8664
|
|
|
|
|
|
|
CODE: |
|
8665
|
|
|
|
|
|
|
// 1. CHECK FOR DATA FIRST to prevent memory leaks if we die |
|
8666
|
11
|
100
|
|
|
|
|
if (items == 0) { |
|
8667
|
1
|
|
|
|
|
|
croak("value_counts: no data provided. At least one argument is required."); |
|
8668
|
|
|
|
|
|
|
} |
|
8669
|
10
|
|
|
|
|
|
arg1 = ST(0); |
|
8670
|
10
|
100
|
|
|
|
|
if (!SvOK(arg1)) { |
|
8671
|
1
|
|
|
|
|
|
croak("First argument to value_counts is NOT defined"); |
|
8672
|
|
|
|
|
|
|
} |
|
8673
|
|
|
|
|
|
|
// 2. Allocate memory only after we know we are proceeding |
|
8674
|
9
|
|
|
|
|
|
counts_hv = newHV(); |
|
8675
|
|
|
|
|
|
|
// CASE 1: Flattened Array (or single scalar) |
|
8676
|
9
|
100
|
|
|
|
|
if (!SvROK(arg1)) { |
|
8677
|
6
|
100
|
|
|
|
|
for (unsigned i = 0; i < items; i++) { |
|
8678
|
4
|
|
|
|
|
|
increment_count(aTHX_ counts_hv, ST(i)); |
|
8679
|
|
|
|
|
|
|
} |
|
8680
|
|
|
|
|
|
|
} else {// CASE 2: Array Reference |
|
8681
|
7
|
|
|
|
|
|
SV*restrict rv = SvRV(arg1); |
|
8682
|
7
|
100
|
|
|
|
|
if (SvTYPE(rv) == SVt_PVAV) { |
|
8683
|
1
|
|
|
|
|
|
AV*restrict av = (AV*)rv; |
|
8684
|
1
|
|
|
|
|
|
SSize_t len = av_len(av) + 1; |
|
8685
|
4
|
100
|
|
|
|
|
for (unsigned i = 0; i < len; i++) { |
|
8686
|
3
|
|
|
|
|
|
SV**restrict valp = av_fetch(av, i, 0); |
|
8687
|
3
|
50
|
|
|
|
|
if (valp) increment_count(aTHX_ counts_hv, *valp); |
|
8688
|
|
|
|
|
|
|
} |
|
8689
|
6
|
50
|
|
|
|
|
} else if (SvTYPE(rv) == SVt_PVHV) { // CASES 3, 4, 5: Hash Reference |
|
8690
|
6
|
|
|
|
|
|
HV*restrict hv = (HV*)rv; |
|
8691
|
|
|
|
|
|
|
// CASES 4 & 5: Nested Structure requiring a 2nd Argument |
|
8692
|
6
|
100
|
|
|
|
|
if (items > 1) { |
|
8693
|
3
|
|
|
|
|
|
SV*restrict arg2 = ST(1); |
|
8694
|
|
|
|
|
|
|
STRLEN klen; |
|
8695
|
3
|
|
|
|
|
|
const char*restrict key = SvPV(arg2, klen); |
|
8696
|
|
|
|
|
|
|
// DataFrame-style Column-Oriented data check |
|
8697
|
3
|
|
|
|
|
|
SV**restrict col_svp = hv_fetch(hv, key, klen, 0); |
|
8698
|
4
|
100
|
|
|
|
|
if (col_svp && SvROK(*col_svp) && SvTYPE(SvRV(*col_svp)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8699
|
1
|
|
|
|
|
|
AV*restrict av = (AV*)SvRV(*col_svp); |
|
8700
|
1
|
|
|
|
|
|
SSize_t len = av_len(av) + 1; |
|
8701
|
4
|
100
|
|
|
|
|
for (unsigned i = 0; i < len; i++) { |
|
8702
|
3
|
|
|
|
|
|
SV**restrict valp = av_fetch(av, i, 0); |
|
8703
|
3
|
50
|
|
|
|
|
if (valp) increment_count(aTHX_ counts_hv, *valp); |
|
8704
|
|
|
|
|
|
|
} |
|
8705
|
|
|
|
|
|
|
} else { |
|
8706
|
|
|
|
|
|
|
// Fallback: Row-Oriented nested structure |
|
8707
|
|
|
|
|
|
|
HE*restrict he; |
|
8708
|
2
|
|
|
|
|
|
hv_iterinit(hv); |
|
8709
|
8
|
100
|
|
|
|
|
while ((he = hv_iternext(hv))) { |
|
8710
|
6
|
|
|
|
|
|
SV*restrict inner_sv = HeVAL(he); |
|
8711
|
6
|
50
|
|
|
|
|
if (SvROK(inner_sv)) { |
|
8712
|
6
|
|
|
|
|
|
SV*restrict inner_rv = SvRV(inner_sv); |
|
8713
|
6
|
50
|
|
|
|
|
if (SvTYPE(inner_rv) == SVt_PVHV) {// CASE 5: Hash of Hashes |
|
8714
|
6
|
|
|
|
|
|
HV*restrict inner_hv = (HV*)inner_rv; |
|
8715
|
6
|
|
|
|
|
|
SV**restrict valp = hv_fetch(inner_hv, key, klen, 0); |
|
8716
|
6
|
100
|
|
|
|
|
if (valp) increment_count(aTHX_ counts_hv, *valp); |
|
8717
|
0
|
0
|
|
|
|
|
} else if (SvTYPE(inner_rv) == SVt_PVAV) {// CASE 4: Hash of Arrays (Row-Oriented) |
|
8718
|
0
|
0
|
|
|
|
|
if (looks_like_number(arg2)) { |
|
8719
|
0
|
|
|
|
|
|
AV*restrict inner_av = (AV*)inner_rv; |
|
8720
|
0
|
|
|
|
|
|
SSize_t idx = SvIV(arg2); |
|
8721
|
0
|
|
|
|
|
|
SV**restrict valp = av_fetch(inner_av, idx, 0); |
|
8722
|
0
|
0
|
|
|
|
|
if (valp) increment_count(aTHX_ counts_hv, *valp); |
|
8723
|
|
|
|
|
|
|
} |
|
8724
|
|
|
|
|
|
|
} |
|
8725
|
|
|
|
|
|
|
} |
|
8726
|
|
|
|
|
|
|
} |
|
8727
|
|
|
|
|
|
|
} |
|
8728
|
|
|
|
|
|
|
} else { // CASE 3: Hash Reference (No 2nd argument) |
|
8729
|
|
|
|
|
|
|
HE*restrict he; |
|
8730
|
3
|
|
|
|
|
|
hv_iterinit(hv); |
|
8731
|
11
|
100
|
|
|
|
|
while ((he = hv_iternext(hv))) { |
|
8732
|
8
|
|
|
|
|
|
SV*restrict val = HeVAL(he); |
|
8733
|
8
|
100
|
|
|
|
|
if (SvROK(val)) {// --- SAFETY CHECK |
|
8734
|
5
|
|
|
|
|
|
SV*restrict inner_rv = SvRV(val); |
|
8735
|
|
|
|
|
|
|
// If it's a Hash of Arrays, count ALL elements in the inner arrays |
|
8736
|
5
|
100
|
|
|
|
|
if (SvTYPE(inner_rv) == SVt_PVAV) { |
|
8737
|
2
|
|
|
|
|
|
AV*restrict inner_av = (AV*)inner_rv; |
|
8738
|
2
|
|
|
|
|
|
SSize_t len = av_len(inner_av) + 1; |
|
8739
|
8
|
100
|
|
|
|
|
for (unsigned i = 0; i < len; i++) { |
|
8740
|
6
|
|
|
|
|
|
SV**restrict valp = av_fetch(inner_av, i, 0); |
|
8741
|
6
|
50
|
|
|
|
|
if (valp) increment_count(aTHX_ counts_hv, *valp); |
|
8742
|
|
|
|
|
|
|
} |
|
8743
|
3
|
50
|
|
|
|
|
} else if (SvTYPE(inner_rv) == SVt_PVHV) { |
|
8744
|
|
|
|
|
|
|
// If it's a Hash of Hashes, count ALL elements across all inner keys |
|
8745
|
3
|
|
|
|
|
|
HV*restrict inner_hv = (HV*)inner_rv; |
|
8746
|
|
|
|
|
|
|
HE*restrict inner_he; |
|
8747
|
3
|
|
|
|
|
|
hv_iterinit(inner_hv); |
|
8748
|
7
|
100
|
|
|
|
|
while ((inner_he = hv_iternext(inner_hv))) { |
|
8749
|
4
|
|
|
|
|
|
SV*restrict inner_val = HeVAL(inner_he); |
|
8750
|
4
|
|
|
|
|
|
increment_count(aTHX_ counts_hv, inner_val); |
|
8751
|
|
|
|
|
|
|
} |
|
8752
|
|
|
|
|
|
|
} else { /* Unrecognized nested reference type */ |
|
8753
|
0
|
|
|
|
|
|
SvREFCNT_dec((SV*)counts_hv); |
|
8754
|
0
|
|
|
|
|
|
croak("value_counts: Unsupported nested reference type."); |
|
8755
|
|
|
|
|
|
|
} |
|
8756
|
|
|
|
|
|
|
} else { |
|
8757
|
|
|
|
|
|
|
/* Simple scalar value */ |
|
8758
|
3
|
|
|
|
|
|
increment_count(aTHX_ counts_hv, val); |
|
8759
|
|
|
|
|
|
|
} |
|
8760
|
|
|
|
|
|
|
} |
|
8761
|
|
|
|
|
|
|
} |
|
8762
|
|
|
|
|
|
|
} else { |
|
8763
|
|
|
|
|
|
|
/* Safely decrement the reference count of our hash before dying to prevent a leak */ |
|
8764
|
0
|
|
|
|
|
|
SvREFCNT_dec((SV*)counts_hv); |
|
8765
|
0
|
|
|
|
|
|
croak("value_counts: Unsupported reference type."); |
|
8766
|
|
|
|
|
|
|
} |
|
8767
|
|
|
|
|
|
|
} |
|
8768
|
9
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)counts_hv); |
|
8769
|
|
|
|
|
|
|
OUTPUT: |
|
8770
|
|
|
|
|
|
|
RETVAL |
|
8771
|
|
|
|
|
|
|
|
|
8772
|
|
|
|
|
|
|
#define EVAL_FILTER(sub_sv, val_sv, keep) do { \ |
|
8773
|
|
|
|
|
|
|
dSP; \ |
|
8774
|
|
|
|
|
|
|
unsigned int count; \ |
|
8775
|
|
|
|
|
|
|
SV *restrict _ef_arg = (val_sv) ? (val_sv) : &PL_sv_undef; \ |
|
8776
|
|
|
|
|
|
|
ENTER; \ |
|
8777
|
|
|
|
|
|
|
SAVETMPS; \ |
|
8778
|
|
|
|
|
|
|
SAVE_DEFSV; \ |
|
8779
|
|
|
|
|
|
|
SvREFCNT_inc(_ef_arg); /* Prevent LEAVE from stealing the refcount */ \ |
|
8780
|
|
|
|
|
|
|
DEFSV_set(_ef_arg); \ |
|
8781
|
|
|
|
|
|
|
PUSHMARK(SP); \ |
|
8782
|
|
|
|
|
|
|
XPUSHs(_ef_arg); \ |
|
8783
|
|
|
|
|
|
|
PUTBACK; \ |
|
8784
|
|
|
|
|
|
|
count = call_sv(sub_sv, G_SCALAR | G_EVAL); \ |
|
8785
|
|
|
|
|
|
|
SPAGAIN; \ |
|
8786
|
|
|
|
|
|
|
if (SvTRUE(ERRSV)) { FREETMPS; LEAVE; croak(NULL); } \ |
|
8787
|
|
|
|
|
|
|
if (count > 0) { \ |
|
8788
|
|
|
|
|
|
|
SV *restrict ret_sv = POPs; \ |
|
8789
|
|
|
|
|
|
|
keep = SvTRUE(ret_sv); \ |
|
8790
|
|
|
|
|
|
|
} else { \ |
|
8791
|
|
|
|
|
|
|
keep = 0; \ |
|
8792
|
|
|
|
|
|
|
} \ |
|
8793
|
|
|
|
|
|
|
PUTBACK; \ |
|
8794
|
|
|
|
|
|
|
FREETMPS; \ |
|
8795
|
|
|
|
|
|
|
LEAVE; \ |
|
8796
|
|
|
|
|
|
|
} while (0) |
|
8797
|
|
|
|
|
|
|
|
|
8798
|
|
|
|
|
|
|
SV *group_by(data_ref, target_key_sv, group_key_sv, ...) |
|
8799
|
|
|
|
|
|
|
SV *data_ref; |
|
8800
|
|
|
|
|
|
|
SV *target_key_sv; |
|
8801
|
|
|
|
|
|
|
SV *group_key_sv; |
|
8802
|
|
|
|
|
|
|
PREINIT: |
|
8803
|
|
|
|
|
|
|
HV *restrict result_hv; |
|
8804
|
8
|
|
|
|
|
|
HV *restrict filter_hv = NULL; |
|
8805
|
|
|
|
|
|
|
SV *restrict result_ref; |
|
8806
|
|
|
|
|
|
|
CODE: |
|
8807
|
8
|
100
|
|
|
|
|
if (!SvOK(data_ref)) { |
|
8808
|
1
|
|
|
|
|
|
croak("First argument to group_by is NOT defined"); |
|
8809
|
|
|
|
|
|
|
} |
|
8810
|
7
|
100
|
|
|
|
|
if (!SvOK(target_key_sv)) { |
|
8811
|
1
|
|
|
|
|
|
croak("Second argument to group_by is NOT defined"); |
|
8812
|
|
|
|
|
|
|
} |
|
8813
|
6
|
100
|
|
|
|
|
if (!SvOK(group_key_sv)) { |
|
8814
|
1
|
|
|
|
|
|
croak("Third argument to group_by is NOT defined"); |
|
8815
|
|
|
|
|
|
|
} |
|
8816
|
|
|
|
|
|
|
/* 1. Validate the primary input is a reference */ |
|
8817
|
5
|
50
|
|
|
|
|
if (!SvROK(data_ref)) { |
|
8818
|
0
|
|
|
|
|
|
croak("First argument to group_by must be a reference (Array of Hashes, Hash of Arrays, or Hash of Hashes)"); |
|
8819
|
|
|
|
|
|
|
} |
|
8820
|
5
|
100
|
|
|
|
|
if (items > 3) { /* Capture the optional filter argument */ |
|
8821
|
2
|
|
|
|
|
|
SV *restrict filter_ref = ST(3); |
|
8822
|
2
|
50
|
|
|
|
|
if (SvROK(filter_ref) && SvTYPE(SvRV(filter_ref)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
8823
|
2
|
|
|
|
|
|
filter_hv = (HV *)SvRV(filter_ref); |
|
8824
|
|
|
|
|
|
|
} |
|
8825
|
|
|
|
|
|
|
} |
|
8826
|
5
|
|
|
|
|
|
result_hv = newHV(); /* 2. Allocate the hash that we will return */ |
|
8827
|
|
|
|
|
|
|
/* Mortalize immediately! If the callback croaks, the tmps stack |
|
8828
|
|
|
|
|
|
|
* will safely clean this up. */ |
|
8829
|
5
|
|
|
|
|
|
result_ref = sv_2mortal(newRV_noinc((SV *)result_hv)); |
|
8830
|
5
|
100
|
|
|
|
|
if (SvTYPE(SvRV(data_ref)) == SVt_PVAV) { /* Input is an Array of Hashes (AoH) */ |
|
8831
|
2
|
|
|
|
|
|
AV *restrict data_av = (AV *)SvRV(data_ref); |
|
8832
|
2
|
|
|
|
|
|
SSize_t len = av_len(data_av) + 1; |
|
8833
|
10
|
100
|
|
|
|
|
for (SSize_t i = 0; i < len; i++) { |
|
8834
|
8
|
|
|
|
|
|
SV **restrict row_svp = av_fetch(data_av, i, 0); |
|
8835
|
8
|
50
|
|
|
|
|
if (row_svp && SvROK(*row_svp) && SvTYPE(SvRV(*row_svp)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8836
|
8
|
|
|
|
|
|
HV *restrict row_hv = (HV *)SvRV(*row_svp); |
|
8837
|
8
|
|
|
|
|
|
HE *restrict group_he = hv_fetch_ent(row_hv, group_key_sv, 0, 0); |
|
8838
|
8
|
|
|
|
|
|
HE *restrict target_he = hv_fetch_ent(row_hv, target_key_sv, 0, 0); |
|
8839
|
8
|
50
|
|
|
|
|
if (group_he) { |
|
8840
|
8
|
|
|
|
|
|
SV *restrict group_val = HeVAL(group_he); |
|
8841
|
8
|
100
|
|
|
|
|
SV *restrict target_val = target_he ? HeVAL(target_he) : NULL; |
|
8842
|
8
|
100
|
|
|
|
|
if (target_val && SvOK(target_val)) { |
|
|
|
50
|
|
|
|
|
|
|
8843
|
7
|
|
|
|
|
|
bool pass_filter = 1; |
|
8844
|
7
|
100
|
|
|
|
|
if (filter_hv) { |
|
8845
|
|
|
|
|
|
|
HE *restrict f_he; |
|
8846
|
4
|
|
|
|
|
|
hv_iterinit(filter_hv); |
|
8847
|
6
|
100
|
|
|
|
|
while ((f_he = hv_iternext(filter_hv))) { |
|
8848
|
4
|
|
|
|
|
|
SV *restrict f_col = hv_iterkeysv(f_he); |
|
8849
|
4
|
|
|
|
|
|
SV *restrict f_sub = hv_iterval(filter_hv, f_he); |
|
8850
|
4
|
|
|
|
|
|
HE *restrict val_he = hv_fetch_ent(row_hv, f_col, 0, 0); |
|
8851
|
4
|
50
|
|
|
|
|
SV *restrict val_sv = val_he ? HeVAL(val_he) : NULL; |
|
8852
|
|
|
|
|
|
|
bool keep; |
|
8853
|
4
|
50
|
|
|
|
|
EVAL_FILTER(f_sub, val_sv, keep); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8854
|
4
|
100
|
|
|
|
|
if (!keep) { |
|
8855
|
2
|
|
|
|
|
|
pass_filter = 0; |
|
8856
|
2
|
|
|
|
|
|
break; |
|
8857
|
|
|
|
|
|
|
} |
|
8858
|
|
|
|
|
|
|
} |
|
8859
|
|
|
|
|
|
|
} |
|
8860
|
7
|
100
|
|
|
|
|
if (pass_filter) { |
|
8861
|
5
|
|
|
|
|
|
HE *restrict res_he = hv_fetch_ent(result_hv, group_val, 0, 0); |
|
8862
|
|
|
|
|
|
|
AV *restrict res_av; |
|
8863
|
5
|
100
|
|
|
|
|
if (res_he) { |
|
8864
|
1
|
|
|
|
|
|
res_av = (AV *)SvRV(HeVAL(res_he)); |
|
8865
|
|
|
|
|
|
|
} else { |
|
8866
|
4
|
|
|
|
|
|
res_av = newAV(); |
|
8867
|
4
|
|
|
|
|
|
hv_store_ent(result_hv, group_val, newRV_noinc((SV *)res_av), 0); |
|
8868
|
|
|
|
|
|
|
} |
|
8869
|
5
|
|
|
|
|
|
av_push(res_av, newSVsv(target_val)); |
|
8870
|
|
|
|
|
|
|
} |
|
8871
|
|
|
|
|
|
|
} |
|
8872
|
|
|
|
|
|
|
} |
|
8873
|
|
|
|
|
|
|
} |
|
8874
|
|
|
|
|
|
|
} |
|
8875
|
3
|
50
|
|
|
|
|
} else if (SvTYPE(SvRV(data_ref)) == SVt_PVHV) { |
|
8876
|
3
|
|
|
|
|
|
HV *restrict data_hv = (HV *)SvRV(data_ref); |
|
8877
|
3
|
|
|
|
|
|
HE *restrict group_he = hv_fetch_ent(data_hv, group_key_sv, 0, 0); |
|
8878
|
3
|
|
|
|
|
|
HE *restrict target_he = hv_fetch_ent(data_hv, target_key_sv, 0, 0); |
|
8879
|
3
|
100
|
|
|
|
|
if (group_he && target_he && |
|
|
|
50
|
|
|
|
|
|
|
8880
|
2
|
50
|
|
|
|
|
SvROK(HeVAL(group_he)) && SvTYPE(SvRV(HeVAL(group_he))) == SVt_PVAV && |
|
|
|
50
|
|
|
|
|
|
|
8881
|
4
|
50
|
|
|
|
|
SvROK(HeVAL(target_he)) && SvTYPE(SvRV(HeVAL(target_he))) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
8882
|
2
|
|
|
|
|
|
AV *restrict group_av = (AV *)SvRV(HeVAL(group_he)); |
|
8883
|
2
|
|
|
|
|
|
AV *restrict target_av = (AV *)SvRV(HeVAL(target_he)); |
|
8884
|
2
|
|
|
|
|
|
SSize_t g_len = av_len(group_av) + 1; |
|
8885
|
2
|
|
|
|
|
|
SSize_t t_len = av_len(target_av) + 1; |
|
8886
|
2
|
|
|
|
|
|
SSize_t len = g_len < t_len ? g_len : t_len; |
|
8887
|
10
|
100
|
|
|
|
|
for (SSize_t i = 0; i < len; i++) { |
|
8888
|
8
|
|
|
|
|
|
SV **restrict g_svp = av_fetch(group_av, i, 0); |
|
8889
|
8
|
|
|
|
|
|
SV **restrict t_svp = av_fetch(target_av, i, 0); |
|
8890
|
8
|
50
|
|
|
|
|
if (g_svp && *g_svp) { |
|
|
|
50
|
|
|
|
|
|
|
8891
|
8
|
|
|
|
|
|
SV *restrict g_val = *g_svp; |
|
8892
|
8
|
50
|
|
|
|
|
SV *restrict t_val = (t_svp && *t_svp) ? *t_svp : NULL; |
|
|
|
50
|
|
|
|
|
|
|
8893
|
8
|
50
|
|
|
|
|
if (t_val && SvOK(t_val)) { |
|
|
|
100
|
|
|
|
|
|
|
8894
|
7
|
|
|
|
|
|
bool pass_filter = 1; |
|
8895
|
7
|
100
|
|
|
|
|
if (filter_hv) { |
|
8896
|
|
|
|
|
|
|
HE *restrict f_he; |
|
8897
|
4
|
|
|
|
|
|
hv_iterinit(filter_hv); |
|
8898
|
6
|
100
|
|
|
|
|
while ((f_he = hv_iternext(filter_hv))) { |
|
8899
|
4
|
|
|
|
|
|
SV *restrict f_col = hv_iterkeysv(f_he); |
|
8900
|
4
|
|
|
|
|
|
SV *restrict f_sub = hv_iterval(filter_hv, f_he); |
|
8901
|
4
|
|
|
|
|
|
SV *restrict val_sv = NULL; |
|
8902
|
4
|
|
|
|
|
|
HE *restrict arr_he = hv_fetch_ent(data_hv, f_col, 0, 0); |
|
8903
|
4
|
50
|
|
|
|
|
if (arr_he && SvROK(HeVAL(arr_he)) && SvTYPE(SvRV(HeVAL(arr_he))) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8904
|
4
|
|
|
|
|
|
AV *restrict col_av = (AV *)SvRV(HeVAL(arr_he)); |
|
8905
|
4
|
|
|
|
|
|
SV **restrict val_svp = av_fetch(col_av, i, 0); |
|
8906
|
4
|
50
|
|
|
|
|
if (val_svp) val_sv = *val_svp; |
|
8907
|
|
|
|
|
|
|
} |
|
8908
|
|
|
|
|
|
|
bool keep; |
|
8909
|
4
|
50
|
|
|
|
|
EVAL_FILTER(f_sub, val_sv, keep); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
8910
|
4
|
100
|
|
|
|
|
if (!keep) { |
|
8911
|
2
|
|
|
|
|
|
pass_filter = 0; |
|
8912
|
2
|
|
|
|
|
|
break; |
|
8913
|
|
|
|
|
|
|
} |
|
8914
|
|
|
|
|
|
|
} |
|
8915
|
|
|
|
|
|
|
} |
|
8916
|
7
|
100
|
|
|
|
|
if (pass_filter) { |
|
8917
|
5
|
|
|
|
|
|
HE *restrict res_he = hv_fetch_ent(result_hv, g_val, 0, 0); |
|
8918
|
|
|
|
|
|
|
AV *restrict res_av; |
|
8919
|
5
|
100
|
|
|
|
|
if (res_he) { |
|
8920
|
1
|
|
|
|
|
|
res_av = (AV *)SvRV(HeVAL(res_he)); |
|
8921
|
|
|
|
|
|
|
} else { |
|
8922
|
4
|
|
|
|
|
|
res_av = newAV(); |
|
8923
|
4
|
|
|
|
|
|
hv_store_ent(result_hv, g_val, newRV_noinc((SV *)res_av), 0); |
|
8924
|
|
|
|
|
|
|
} |
|
8925
|
5
|
|
|
|
|
|
av_push(res_av, newSVsv(t_val)); |
|
8926
|
|
|
|
|
|
|
} |
|
8927
|
|
|
|
|
|
|
} |
|
8928
|
|
|
|
|
|
|
} |
|
8929
|
|
|
|
|
|
|
} |
|
8930
|
|
|
|
|
|
|
} else { |
|
8931
|
|
|
|
|
|
|
HE *restrict row_he; |
|
8932
|
1
|
|
|
|
|
|
hv_iterinit(data_hv); |
|
8933
|
6
|
100
|
|
|
|
|
while ((row_he = hv_iternext(data_hv))) { |
|
8934
|
5
|
|
|
|
|
|
SV *restrict row_val = hv_iterval(data_hv, row_he); |
|
8935
|
5
|
50
|
|
|
|
|
if (SvROK(row_val) && SvTYPE(SvRV(row_val)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
8936
|
5
|
|
|
|
|
|
HV *restrict inner_hv = (HV *)SvRV(row_val); |
|
8937
|
5
|
|
|
|
|
|
HE *restrict inner_group_he = hv_fetch_ent(inner_hv, group_key_sv, 0, 0); |
|
8938
|
5
|
|
|
|
|
|
HE *restrict inner_target_he = hv_fetch_ent(inner_hv, target_key_sv, 0, 0); |
|
8939
|
5
|
50
|
|
|
|
|
if (inner_group_he) { |
|
8940
|
5
|
|
|
|
|
|
SV *restrict g_val = HeVAL(inner_group_he); |
|
8941
|
5
|
100
|
|
|
|
|
SV *restrict t_val = inner_target_he ? HeVAL(inner_target_he) : NULL; |
|
8942
|
5
|
100
|
|
|
|
|
if (t_val && SvOK(t_val)) { |
|
|
|
100
|
|
|
|
|
|
|
8943
|
3
|
|
|
|
|
|
bool pass_filter = 1; |
|
8944
|
3
|
50
|
|
|
|
|
if (filter_hv) { |
|
8945
|
|
|
|
|
|
|
HE *restrict f_he; |
|
8946
|
0
|
|
|
|
|
|
hv_iterinit(filter_hv); |
|
8947
|
0
|
0
|
|
|
|
|
while ((f_he = hv_iternext(filter_hv))) { |
|
8948
|
0
|
|
|
|
|
|
SV *restrict f_col = hv_iterkeysv(f_he); |
|
8949
|
0
|
|
|
|
|
|
SV *restrict f_sub = hv_iterval(filter_hv, f_he); |
|
8950
|
0
|
|
|
|
|
|
HE *restrict val_he = hv_fetch_ent(inner_hv, f_col, 0, 0); |
|
8951
|
0
|
0
|
|
|
|
|
SV *restrict val_sv = val_he ? HeVAL(val_he) : NULL; |
|
8952
|
|
|
|
|
|
|
bool keep; |
|
8953
|
0
|
0
|
|
|
|
|
EVAL_FILTER(f_sub, val_sv, keep); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8954
|
0
|
0
|
|
|
|
|
if (!keep) { |
|
8955
|
0
|
|
|
|
|
|
pass_filter = 0; |
|
8956
|
0
|
|
|
|
|
|
break; |
|
8957
|
|
|
|
|
|
|
} |
|
8958
|
|
|
|
|
|
|
} |
|
8959
|
|
|
|
|
|
|
} |
|
8960
|
3
|
50
|
|
|
|
|
if (pass_filter) { |
|
8961
|
3
|
|
|
|
|
|
HE *restrict res_he = hv_fetch_ent(result_hv, g_val, 0, 0); |
|
8962
|
|
|
|
|
|
|
AV *restrict res_av; |
|
8963
|
3
|
100
|
|
|
|
|
if (res_he) { |
|
8964
|
1
|
|
|
|
|
|
res_av = (AV *)SvRV(HeVAL(res_he)); |
|
8965
|
|
|
|
|
|
|
} else { |
|
8966
|
2
|
|
|
|
|
|
res_av = newAV(); |
|
8967
|
2
|
|
|
|
|
|
hv_store_ent(result_hv, g_val, newRV_noinc((SV *)res_av), 0); |
|
8968
|
|
|
|
|
|
|
} |
|
8969
|
3
|
|
|
|
|
|
av_push(res_av, newSVsv(t_val)); |
|
8970
|
|
|
|
|
|
|
} |
|
8971
|
|
|
|
|
|
|
} |
|
8972
|
|
|
|
|
|
|
} |
|
8973
|
|
|
|
|
|
|
} |
|
8974
|
|
|
|
|
|
|
} |
|
8975
|
|
|
|
|
|
|
} |
|
8976
|
|
|
|
|
|
|
} else { |
|
8977
|
0
|
|
|
|
|
|
croak("First argument to group_by must be an Array or Hash reference"); |
|
8978
|
|
|
|
|
|
|
} |
|
8979
|
|
|
|
|
|
|
// Balance xsubpp's automatic sv_2mortal to prevent refcount dropping to -1 |
|
8980
|
5
|
|
|
|
|
|
RETVAL = SvREFCNT_inc(result_ref); |
|
8981
|
|
|
|
|
|
|
OUTPUT: |
|
8982
|
|
|
|
|
|
|
RETVAL |
|
8983
|
|
|
|
|
|
|
|
|
8984
|
|
|
|
|
|
|
SV* prcomp(...) |
|
8985
|
|
|
|
|
|
|
CODE: |
|
8986
|
|
|
|
|
|
|
{ |
|
8987
|
12
|
|
|
|
|
|
SV *restrict x_sv = NULL; |
|
8988
|
12
|
|
|
|
|
|
bool retx = TRUE, center = TRUE, do_scale = FALSE; |
|
8989
|
12
|
|
|
|
|
|
NV tol = -1.0; |
|
8990
|
12
|
|
|
|
|
|
long rank_opt = -1; |
|
8991
|
12
|
|
|
|
|
|
unsigned int arg_idx = 0; |
|
8992
|
|
|
|
|
|
|
// 1. Shift positional 'x' argument if provided |
|
8993
|
12
|
100
|
|
|
|
|
if (arg_idx < items && SvROK(ST(arg_idx))) { |
|
|
|
100
|
|
|
|
|
|
|
8994
|
10
|
|
|
|
|
|
int t = SvTYPE(SvRV(ST(arg_idx))); |
|
8995
|
10
|
100
|
|
|
|
|
if (t == SVt_PVAV || t == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
8996
|
10
|
|
|
|
|
|
x_sv = ST(arg_idx); |
|
8997
|
10
|
|
|
|
|
|
arg_idx++; |
|
8998
|
|
|
|
|
|
|
} |
|
8999
|
|
|
|
|
|
|
} |
|
9000
|
|
|
|
|
|
|
// 2. Parse named arguments |
|
9001
|
12
|
100
|
|
|
|
|
if ((items - arg_idx) % 2 != 0) croak("Usage: prcomp($data, key => value, ...)"); |
|
9002
|
14
|
100
|
|
|
|
|
for (; arg_idx < items; arg_idx += 2) { |
|
9003
|
4
|
|
|
|
|
|
const char *restrict key = SvPV_nolen(ST(arg_idx)); |
|
9004
|
4
|
|
|
|
|
|
SV *restrict val = ST(arg_idx + 1); |
|
9005
|
4
|
50
|
|
|
|
|
if (strEQ(key, "x")) x_sv = val; |
|
9006
|
4
|
50
|
|
|
|
|
else if (strEQ(key, "retx")) retx = SvTRUE(val); |
|
9007
|
4
|
50
|
|
|
|
|
else if (strEQ(key, "center")) center = SvTRUE(val); |
|
9008
|
4
|
100
|
|
|
|
|
else if (strEQ(key, "scale")) do_scale = SvTRUE(val); |
|
9009
|
2
|
100
|
|
|
|
|
else if (strEQ(key, "tol")) tol = SvOK(val) ? SvNV(val) : -1.0; |
|
|
|
50
|
|
|
|
|
|
|
9010
|
1
|
50
|
|
|
|
|
else if (strEQ(key, "rank")) rank_opt = SvOK(val) ? (long)SvIV(val) : -1; |
|
|
|
50
|
|
|
|
|
|
|
9011
|
0
|
|
|
|
|
|
else croak("prcomp: unknown argument '%s'", key); |
|
9012
|
|
|
|
|
|
|
} |
|
9013
|
|
|
|
|
|
|
|
|
9014
|
10
|
100
|
|
|
|
|
if (!x_sv || !SvROK(x_sv)) |
|
|
|
50
|
|
|
|
|
|
|
9015
|
1
|
|
|
|
|
|
croak("prcomp: 'x' is a required argument and must be a reference"); |
|
9016
|
|
|
|
|
|
|
|
|
9017
|
|
|
|
|
|
|
// 3. Detect Data Structure (AoA, HoA, HoH) |
|
9018
|
9
|
|
|
|
|
|
bool is_aoa = FALSE, is_hoa = FALSE, is_hoh = FALSE; |
|
9019
|
9
|
|
|
|
|
|
size_t n_raw = 0, p = 0; |
|
9020
|
9
|
|
|
|
|
|
char **restrict colnames = NULL; |
|
9021
|
9
|
|
|
|
|
|
SV *restrict ref = SvRV(x_sv); |
|
9022
|
|
|
|
|
|
|
|
|
9023
|
9
|
100
|
|
|
|
|
if (SvTYPE(ref) == SVt_PVAV) { |
|
9024
|
7
|
|
|
|
|
|
AV *restrict av = (AV*)ref; |
|
9025
|
7
|
|
|
|
|
|
n_raw = av_len(av) + 1; |
|
9026
|
7
|
100
|
|
|
|
|
if (n_raw > 0) { |
|
9027
|
6
|
|
|
|
|
|
SV **restrict first = av_fetch(av, 0, 0); |
|
9028
|
6
|
50
|
|
|
|
|
if (first && SvROK(*first) && SvTYPE(SvRV(*first)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
9029
|
6
|
|
|
|
|
|
is_aoa = TRUE; |
|
9030
|
6
|
|
|
|
|
|
p = av_len((AV*)SvRV(*first)) + 1; |
|
9031
|
0
|
|
|
|
|
|
} else croak("prcomp: Array reference must contain ArrayRefs (AoA)"); |
|
9032
|
|
|
|
|
|
|
} |
|
9033
|
2
|
50
|
|
|
|
|
} else if (SvTYPE(ref) == SVt_PVHV) { |
|
9034
|
2
|
|
|
|
|
|
HV *restrict hv = (HV*)ref; |
|
9035
|
2
|
50
|
|
|
|
|
if (hv_iterinit(hv) > 0) { |
|
9036
|
2
|
|
|
|
|
|
HE *restrict entry = hv_iternext(hv); |
|
9037
|
2
|
|
|
|
|
|
SV *restrict val = hv_iterval(hv, entry); |
|
9038
|
2
|
50
|
|
|
|
|
if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) { |
|
|
|
100
|
|
|
|
|
|
|
9039
|
1
|
|
|
|
|
|
is_hoa = TRUE; |
|
9040
|
1
|
|
|
|
|
|
n_raw = av_len((AV*)SvRV(val)) + 1; |
|
9041
|
1
|
50
|
|
|
|
|
} else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) { |
|
|
|
50
|
|
|
|
|
|
|
9042
|
1
|
|
|
|
|
|
is_hoh = TRUE; |
|
9043
|
1
|
|
|
|
|
|
n_raw = hv_iterinit(hv); |
|
9044
|
0
|
|
|
|
|
|
} else croak("prcomp: Hash reference must contain ArrayRefs (HoA) or HashRefs (HoH)"); |
|
9045
|
|
|
|
|
|
|
} |
|
9046
|
|
|
|
|
|
|
} |
|
9047
|
|
|
|
|
|
|
|
|
9048
|
9
|
100
|
|
|
|
|
if (n_raw == 0 || (p == 0 && !is_hoa && !is_hoh)) croak("prcomp: input matrix is empty or has zero columns"); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
9049
|
|
|
|
|
|
|
|
|
9050
|
|
|
|
|
|
|
// 4. Extract and Sort Column Names (for Hash inputs) |
|
9051
|
8
|
100
|
|
|
|
|
if (is_hoh) { |
|
9052
|
1
|
|
|
|
|
|
HV *restrict hv = (HV*)ref; |
|
9053
|
1
|
|
|
|
|
|
hv_iterinit(hv); |
|
9054
|
1
|
|
|
|
|
|
HE *restrict entry = hv_iternext(hv); |
|
9055
|
1
|
|
|
|
|
|
HV *restrict inner = (HV*)SvRV(hv_iterval(hv, entry)); |
|
9056
|
1
|
|
|
|
|
|
p = hv_iterinit(inner); |
|
9057
|
1
|
50
|
|
|
|
|
if (p == 0) croak("prcomp: inner hashes cannot be empty"); |
|
9058
|
|
|
|
|
|
|
|
|
9059
|
1
|
|
|
|
|
|
colnames = (char**)safemalloc(p * sizeof(char*)); |
|
9060
|
1
|
|
|
|
|
|
size_t c = 0; |
|
9061
|
3
|
100
|
|
|
|
|
while ((entry = hv_iternext(inner))) { |
|
9062
|
2
|
|
|
|
|
|
colnames[c++] = savepv(SvPV_nolen(hv_iterkeysv(entry))); |
|
9063
|
|
|
|
|
|
|
} |
|
9064
|
1
|
|
|
|
|
|
qsort(colnames, p, sizeof(char*), cmp_string_wt); |
|
9065
|
7
|
100
|
|
|
|
|
} else if (is_hoa) { |
|
9066
|
1
|
|
|
|
|
|
HV *restrict hv = (HV*)ref; |
|
9067
|
1
|
|
|
|
|
|
p = hv_iterinit(hv); |
|
9068
|
1
|
50
|
|
|
|
|
if (p == 0) croak("prcomp: input hash is empty"); |
|
9069
|
1
|
|
|
|
|
|
colnames = (char**)safemalloc(p * sizeof(char*)); |
|
9070
|
1
|
|
|
|
|
|
size_t c = 0; |
|
9071
|
|
|
|
|
|
|
HE *restrict entry; |
|
9072
|
3
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
9073
|
2
|
|
|
|
|
|
colnames[c++] = savepv(SvPV_nolen(hv_iterkeysv(entry))); |
|
9074
|
|
|
|
|
|
|
} |
|
9075
|
1
|
|
|
|
|
|
qsort(colnames, p, sizeof(char*), cmp_string_wt); |
|
9076
|
|
|
|
|
|
|
} |
|
9077
|
|
|
|
|
|
|
// 5. Extract data & apply listwise deletion for NaNs |
|
9078
|
8
|
|
|
|
|
|
NV *restrict X_mat = (NV*)safemalloc(n_raw * p * sizeof(NV)); |
|
9079
|
8
|
|
|
|
|
|
size_t n = 0; |
|
9080
|
8
|
100
|
|
|
|
|
if (is_aoa) { |
|
9081
|
6
|
|
|
|
|
|
AV *restrict av = (AV*)ref; |
|
9082
|
24
|
100
|
|
|
|
|
for (size_t i = 0; i < n_raw; i++) { |
|
9083
|
18
|
|
|
|
|
|
SV **restrict row_sv = av_fetch(av, i, 0); |
|
9084
|
18
|
50
|
|
|
|
|
if (row_sv && SvROK(*row_sv) && SvTYPE(SvRV(*row_sv)) == SVt_PVAV) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
9085
|
18
|
|
|
|
|
|
AV *restrict row_av = (AV*)SvRV(*row_sv); |
|
9086
|
18
|
|
|
|
|
|
bool row_ok = TRUE; |
|
9087
|
54
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9088
|
36
|
|
|
|
|
|
SV **restrict cell_sv = av_fetch(row_av, j, 0); |
|
9089
|
71
|
50
|
|
|
|
|
if (cell_sv && SvOK(*cell_sv) && looks_like_number(*cell_sv)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
9090
|
35
|
|
|
|
|
|
NV v = SvNV(*cell_sv); |
|
9091
|
35
|
50
|
|
|
|
|
if (!isfinite(v)) row_ok = FALSE; |
|
9092
|
35
|
|
|
|
|
|
else X_mat[n * p + j] = v; |
|
9093
|
1
|
|
|
|
|
|
} else row_ok = FALSE; |
|
9094
|
|
|
|
|
|
|
} |
|
9095
|
18
|
100
|
|
|
|
|
if (row_ok) n++; |
|
9096
|
|
|
|
|
|
|
} |
|
9097
|
|
|
|
|
|
|
} |
|
9098
|
2
|
100
|
|
|
|
|
} else if (is_hoa) { |
|
9099
|
1
|
|
|
|
|
|
HV *restrict hv = (HV*)ref; |
|
9100
|
1
|
|
|
|
|
|
AV **restrict col_arrays = (AV**)safemalloc(p * sizeof(AV*)); |
|
9101
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9102
|
2
|
|
|
|
|
|
SV **restrict val = hv_fetch(hv, colnames[j], strlen(colnames[j]), 0); |
|
9103
|
2
|
|
|
|
|
|
col_arrays[j] = (AV*)SvRV(*val); |
|
9104
|
|
|
|
|
|
|
} |
|
9105
|
4
|
100
|
|
|
|
|
for (size_t i = 0; i < n_raw; i++) { |
|
9106
|
3
|
|
|
|
|
|
bool row_ok = TRUE; |
|
9107
|
9
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9108
|
6
|
|
|
|
|
|
SV **restrict cell = av_fetch(col_arrays[j], i, 0); |
|
9109
|
12
|
50
|
|
|
|
|
if (cell && SvOK(*cell) && looks_like_number(*cell)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
9110
|
6
|
|
|
|
|
|
NV v = SvNV(*cell); |
|
9111
|
6
|
50
|
|
|
|
|
if (!isfinite(v)) row_ok = FALSE; |
|
9112
|
6
|
|
|
|
|
|
else X_mat[n * p + j] = v; |
|
9113
|
0
|
|
|
|
|
|
} else row_ok = FALSE; |
|
9114
|
|
|
|
|
|
|
} |
|
9115
|
3
|
50
|
|
|
|
|
if (row_ok) n++; |
|
9116
|
|
|
|
|
|
|
} |
|
9117
|
1
|
|
|
|
|
|
Safefree(col_arrays); |
|
9118
|
1
|
50
|
|
|
|
|
} else if (is_hoh) { |
|
9119
|
1
|
|
|
|
|
|
HV *restrict hv = (HV*)ref; |
|
9120
|
1
|
|
|
|
|
|
hv_iterinit(hv); |
|
9121
|
|
|
|
|
|
|
HE *restrict entry; |
|
9122
|
4
|
100
|
|
|
|
|
while ((entry = hv_iternext(hv))) { |
|
9123
|
3
|
|
|
|
|
|
HV *restrict row_hv = (HV*)SvRV(hv_iterval(hv, entry)); |
|
9124
|
3
|
|
|
|
|
|
bool row_ok = TRUE; |
|
9125
|
9
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9126
|
6
|
|
|
|
|
|
SV **restrict cell = hv_fetch(row_hv, colnames[j], strlen(colnames[j]), 0); |
|
9127
|
12
|
50
|
|
|
|
|
if (cell && SvOK(*cell) && looks_like_number(*cell)) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
9128
|
6
|
|
|
|
|
|
NV v = SvNV(*cell); |
|
9129
|
6
|
50
|
|
|
|
|
if (!isfinite(v)) row_ok = FALSE; |
|
9130
|
6
|
|
|
|
|
|
else X_mat[n * p + j] = v; |
|
9131
|
0
|
|
|
|
|
|
} else row_ok = FALSE; |
|
9132
|
|
|
|
|
|
|
} |
|
9133
|
3
|
50
|
|
|
|
|
if (row_ok) n++; |
|
9134
|
|
|
|
|
|
|
} |
|
9135
|
|
|
|
|
|
|
} |
|
9136
|
8
|
50
|
|
|
|
|
if (n == 0) { |
|
9137
|
0
|
0
|
|
|
|
|
if (colnames) { |
|
9138
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < p; i++) Safefree(colnames[i]); |
|
9139
|
0
|
|
|
|
|
|
Safefree(colnames); |
|
9140
|
|
|
|
|
|
|
} |
|
9141
|
0
|
|
|
|
|
|
Safefree(X_mat); |
|
9142
|
0
|
|
|
|
|
|
croak("prcomp: 0 valid observations after listwise NA deletion"); |
|
9143
|
|
|
|
|
|
|
} |
|
9144
|
|
|
|
|
|
|
// 6. Center and Scale |
|
9145
|
8
|
|
|
|
|
|
NV *restrict cen_vec = (NV*)safecalloc(p, sizeof(NV)); |
|
9146
|
8
|
|
|
|
|
|
NV *restrict sc_vec = (NV*)safecalloc(p, sizeof(NV)); |
|
9147
|
22
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9148
|
15
|
|
|
|
|
|
NV col_sum = 0.0; |
|
9149
|
58
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) col_sum += X_mat[i * p + j]; |
|
9150
|
15
|
50
|
|
|
|
|
if (center) { |
|
9151
|
15
|
|
|
|
|
|
cen_vec[j] = col_sum / n; |
|
9152
|
58
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) X_mat[i * p + j] -= cen_vec[j]; |
|
9153
|
|
|
|
|
|
|
} |
|
9154
|
15
|
100
|
|
|
|
|
if (do_scale) { |
|
9155
|
3
|
|
|
|
|
|
NV sum_sq = 0.0; |
|
9156
|
12
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
9157
|
9
|
50
|
|
|
|
|
NV val = X_mat[i * p + j] - (center ? 0 : (col_sum / n)); |
|
9158
|
9
|
|
|
|
|
|
sum_sq += val * val; |
|
9159
|
|
|
|
|
|
|
} |
|
9160
|
3
|
50
|
|
|
|
|
sc_vec[j] = (n > 1) ? sqrt(sum_sq / (n - 1)) : 0.0; |
|
9161
|
3
|
100
|
|
|
|
|
if (sc_vec[j] <= 1e-15) { |
|
9162
|
1
|
|
|
|
|
|
Safefree(X_mat); Safefree(cen_vec); Safefree(sc_vec); |
|
9163
|
1
|
50
|
|
|
|
|
if (colnames) { for (size_t k = 0; k < p; k++) Safefree(colnames[k]); Safefree(colnames); } |
|
|
|
0
|
|
|
|
|
|
|
9164
|
1
|
|
|
|
|
|
croak("prcomp: cannot rescale a constant/zero column to unit variance"); |
|
9165
|
|
|
|
|
|
|
} |
|
9166
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) X_mat[i * p + j] /= sc_vec[j]; |
|
9167
|
|
|
|
|
|
|
} |
|
9168
|
|
|
|
|
|
|
} |
|
9169
|
|
|
|
|
|
|
// 7. Construct Covariance Matrix X^T X |
|
9170
|
7
|
|
|
|
|
|
NV *restrict XtX = (NV*)safecalloc(p * p, sizeof(NV)); |
|
9171
|
27
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
9172
|
60
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9173
|
100
|
100
|
|
|
|
|
for (size_t k = j; k < p; k++) { |
|
9174
|
60
|
|
|
|
|
|
XtX[j * p + k] += X_mat[i * p + j] * X_mat[i * p + k]; |
|
9175
|
|
|
|
|
|
|
} |
|
9176
|
|
|
|
|
|
|
} |
|
9177
|
|
|
|
|
|
|
} |
|
9178
|
|
|
|
|
|
|
// Mirror the symmetric lower triangle |
|
9179
|
21
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9180
|
21
|
100
|
|
|
|
|
for (size_t k = 0; k < j; k++) { |
|
9181
|
7
|
|
|
|
|
|
XtX[j * p + k] = XtX[k * p + j]; |
|
9182
|
|
|
|
|
|
|
} |
|
9183
|
|
|
|
|
|
|
} |
|
9184
|
|
|
|
|
|
|
// 8. Jacobi Eigen Decomposition |
|
9185
|
7
|
|
|
|
|
|
NV *restrict eigen_val = (NV*)safemalloc(p * sizeof(NV)); |
|
9186
|
7
|
|
|
|
|
|
NV *restrict eigen_vec = (NV*)safemalloc(p * p * sizeof(NV)); |
|
9187
|
7
|
|
|
|
|
|
jacobi_eigen(XtX, p, eigen_val, eigen_vec); |
|
9188
|
|
|
|
|
|
|
// 9. Calculate singular values (sdev) & handle dimensions (rank/tol) |
|
9189
|
7
|
|
|
|
|
|
size_t k_cols = (n < p) ? n : p; |
|
9190
|
7
|
100
|
|
|
|
|
if (rank_opt > 0 && rank_opt < (long)k_cols) k_cols = (size_t)rank_opt; |
|
|
|
50
|
|
|
|
|
|
|
9191
|
7
|
|
|
|
|
|
NV *restrict sdev = (NV*)safemalloc(k_cols * sizeof(NV)); |
|
9192
|
7
|
50
|
|
|
|
|
NV n_adj = (n > 1) ? (NV)(n - 1) : 1.0; |
|
9193
|
20
|
100
|
|
|
|
|
for (size_t j = 0; j < k_cols; j++) { |
|
9194
|
13
|
|
|
|
|
|
NV e_val = eigen_val[j]; |
|
9195
|
13
|
50
|
|
|
|
|
if (e_val < 0.0) e_val = 0.0; // clamp floating point inaccuracy |
|
9196
|
13
|
|
|
|
|
|
sdev[j] = sqrt(e_val / n_adj); |
|
9197
|
|
|
|
|
|
|
} |
|
9198
|
7
|
100
|
|
|
|
|
if (tol >= 0.0) { |
|
9199
|
1
|
|
|
|
|
|
size_t rank_est = 0; |
|
9200
|
1
|
|
|
|
|
|
NV threshold = sdev[0] * tol; |
|
9201
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < k_cols; j++) { |
|
9202
|
2
|
100
|
|
|
|
|
if (sdev[j] > threshold) rank_est++; |
|
9203
|
|
|
|
|
|
|
} |
|
9204
|
1
|
50
|
|
|
|
|
if (rank_est < k_cols) k_cols = rank_est; |
|
9205
|
|
|
|
|
|
|
} |
|
9206
|
|
|
|
|
|
|
// 10. Build Return Hash |
|
9207
|
7
|
|
|
|
|
|
HV *restrict res_hv = newHV(); |
|
9208
|
7
|
|
|
|
|
|
AV *restrict sdev_av = newAV(); |
|
9209
|
19
|
100
|
|
|
|
|
for (size_t j = 0; j < k_cols; j++) av_push(sdev_av, newSVnv(sdev[j])); |
|
9210
|
7
|
|
|
|
|
|
hv_stores(res_hv, "sdev", newRV_noinc((SV*)sdev_av)); |
|
9211
|
7
|
|
|
|
|
|
AV *restrict rot_av = newAV(); |
|
9212
|
21
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9213
|
14
|
|
|
|
|
|
AV *restrict row_rot = newAV(); |
|
9214
|
38
|
100
|
|
|
|
|
for (size_t m = 0; m < k_cols; m++) { |
|
9215
|
24
|
|
|
|
|
|
av_push(row_rot, newSVnv(eigen_vec[j * p + m])); |
|
9216
|
|
|
|
|
|
|
} |
|
9217
|
14
|
|
|
|
|
|
av_push(rot_av, newRV_noinc((SV*)row_rot)); |
|
9218
|
|
|
|
|
|
|
} |
|
9219
|
7
|
|
|
|
|
|
hv_stores(res_hv, "rotation", newRV_noinc((SV*)rot_av)); |
|
9220
|
7
|
50
|
|
|
|
|
if (retx) { |
|
9221
|
7
|
|
|
|
|
|
AV *restrict x_ret_av = newAV(); |
|
9222
|
27
|
100
|
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
9223
|
20
|
|
|
|
|
|
AV *restrict row_x = newAV(); |
|
9224
|
54
|
100
|
|
|
|
|
for (size_t m = 0; m < k_cols; m++) { |
|
9225
|
34
|
|
|
|
|
|
NV x_rot_val = 0.0; |
|
9226
|
102
|
100
|
|
|
|
|
for (size_t c = 0; c < p; c++) { |
|
9227
|
68
|
|
|
|
|
|
x_rot_val += X_mat[i * p + c] * eigen_vec[c * p + m]; |
|
9228
|
|
|
|
|
|
|
} |
|
9229
|
34
|
|
|
|
|
|
av_push(row_x, newSVnv(x_rot_val)); |
|
9230
|
|
|
|
|
|
|
} |
|
9231
|
20
|
|
|
|
|
|
av_push(x_ret_av, newRV_noinc((SV*)row_x)); |
|
9232
|
|
|
|
|
|
|
} |
|
9233
|
7
|
|
|
|
|
|
hv_stores(res_hv, "x", newRV_noinc((SV*)x_ret_av)); |
|
9234
|
|
|
|
|
|
|
} |
|
9235
|
7
|
100
|
|
|
|
|
if (colnames) { |
|
9236
|
2
|
|
|
|
|
|
AV *restrict names_av = newAV(); |
|
9237
|
6
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) { |
|
9238
|
4
|
|
|
|
|
|
av_push(names_av, newSVpv(colnames[j], 0)); |
|
9239
|
|
|
|
|
|
|
} |
|
9240
|
2
|
|
|
|
|
|
hv_stores(res_hv, "varnames", newRV_noinc((SV*)names_av)); |
|
9241
|
|
|
|
|
|
|
} |
|
9242
|
7
|
50
|
|
|
|
|
if (center) { |
|
9243
|
7
|
|
|
|
|
|
AV *restrict c_av = newAV(); |
|
9244
|
21
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) av_push(c_av, newSVnv(cen_vec[j])); |
|
9245
|
7
|
|
|
|
|
|
hv_stores(res_hv, "center", newRV_noinc((SV*)c_av)); |
|
9246
|
|
|
|
|
|
|
} else { |
|
9247
|
0
|
|
|
|
|
|
hv_stores(res_hv, "center", newSVsv(&PL_sv_no)); |
|
9248
|
|
|
|
|
|
|
} |
|
9249
|
7
|
100
|
|
|
|
|
if (do_scale) { |
|
9250
|
1
|
|
|
|
|
|
AV *restrict sc_av = newAV(); |
|
9251
|
3
|
100
|
|
|
|
|
for (size_t j = 0; j < p; j++) av_push(sc_av, newSVnv(sc_vec[j])); |
|
9252
|
1
|
|
|
|
|
|
hv_stores(res_hv, "scale", newRV_noinc((SV*)sc_av)); |
|
9253
|
|
|
|
|
|
|
} else { |
|
9254
|
6
|
|
|
|
|
|
hv_stores(res_hv, "scale", newSVsv(&PL_sv_no)); |
|
9255
|
|
|
|
|
|
|
} |
|
9256
|
|
|
|
|
|
|
// Cleanup |
|
9257
|
7
|
100
|
|
|
|
|
if (colnames) { |
|
9258
|
6
|
100
|
|
|
|
|
for (size_t i = 0; i < p; i++) Safefree(colnames[i]); |
|
9259
|
2
|
|
|
|
|
|
Safefree(colnames); |
|
9260
|
|
|
|
|
|
|
} |
|
9261
|
7
|
|
|
|
|
|
Safefree(X_mat); Safefree(cen_vec); Safefree(sc_vec); |
|
9262
|
7
|
|
|
|
|
|
Safefree(XtX); Safefree(eigen_val); Safefree(eigen_vec); Safefree(sdev); |
|
9263
|
|
|
|
|
|
|
|
|
9264
|
7
|
|
|
|
|
|
RETVAL = newRV_noinc((SV*)res_hv); |
|
9265
|
|
|
|
|
|
|
} |
|
9266
|
|
|
|
|
|
|
OUTPUT: |
|
9267
|
|
|
|
|
|
|
RETVAL |
|
9268
|
|
|
|
|
|
|
|
|
9269
|
|
|
|
|
|
|
SV *transpose(input_ref) |
|
9270
|
|
|
|
|
|
|
SV *input_ref |
|
9271
|
|
|
|
|
|
|
PREINIT: |
|
9272
|
|
|
|
|
|
|
svtype ref_type; |
|
9273
|
|
|
|
|
|
|
SV *restrict retval_sv; |
|
9274
|
|
|
|
|
|
|
CODE: |
|
9275
|
38
|
50
|
|
|
|
|
SvGETMAGIC(input_ref); |
|
|
|
0
|
|
|
|
|
|
|
9276
|
38
|
100
|
|
|
|
|
if (!SvROK(input_ref)) |
|
9277
|
1
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Input must be a hash ref or array ref"); |
|
9278
|
37
|
|
|
|
|
|
ref_type = SvTYPE(SvRV(input_ref)); |
|
9279
|
37
|
100
|
|
|
|
|
if (ref_type == SVt_PVHV) {// ── Hash-of-Hashes |
|
9280
|
14
|
|
|
|
|
|
HV *restrict in_hv = (HV *)SvRV(input_ref); |
|
9281
|
14
|
|
|
|
|
|
HV *restrict out_hv = newHV(); |
|
9282
|
|
|
|
|
|
|
HE *restrict he_row, *restrict he_col, *restrict out_inner_he; |
|
9283
|
14
|
|
|
|
|
|
retval_sv = sv_2mortal(newRV_noinc((SV *)out_hv)); |
|
9284
|
14
|
|
|
|
|
|
hv_iterinit(in_hv); |
|
9285
|
35
|
100
|
|
|
|
|
while ((he_row = hv_iternext(in_hv))) { |
|
9286
|
23
|
|
|
|
|
|
SV *restrict row_key_sv = hv_iterkeysv(he_row); |
|
9287
|
23
|
|
|
|
|
|
SV *restrict row_val = hv_iterval(in_hv, he_row); |
|
9288
|
|
|
|
|
|
|
HV *restrict in_inner_hv; |
|
9289
|
23
|
50
|
|
|
|
|
SvGETMAGIC(row_val); |
|
|
|
0
|
|
|
|
|
|
|
9290
|
|
|
|
|
|
|
|
|
9291
|
23
|
100
|
|
|
|
|
if (!SvROK(row_val) || SvTYPE(SvRV(row_val)) != SVt_PVHV) |
|
|
|
100
|
|
|
|
|
|
|
9292
|
2
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Hash mode – inner element is not a hash ref"); |
|
9293
|
21
|
|
|
|
|
|
in_inner_hv = (HV *)SvRV(row_val); |
|
9294
|
21
|
|
|
|
|
|
hv_iterinit(in_inner_hv); |
|
9295
|
54
|
100
|
|
|
|
|
while ((he_col = hv_iternext(in_inner_hv))) { |
|
9296
|
33
|
|
|
|
|
|
SV *restrict col_key_sv = hv_iterkeysv(he_col); |
|
9297
|
33
|
|
|
|
|
|
SV *restrict val = hv_iterval(in_inner_hv, he_col); |
|
9298
|
|
|
|
|
|
|
HV *restrict out_inner_hv; |
|
9299
|
|
|
|
|
|
|
SV *restrict inner_ref; |
|
9300
|
33
|
50
|
|
|
|
|
SvGETMAGIC(val); |
|
|
|
0
|
|
|
|
|
|
|
9301
|
33
|
|
|
|
|
|
out_inner_he = hv_fetch_ent(out_hv, col_key_sv, 0, 0); |
|
9302
|
33
|
100
|
|
|
|
|
if (out_inner_he) { |
|
9303
|
14
|
|
|
|
|
|
inner_ref = HeVAL(out_inner_he); |
|
9304
|
14
|
50
|
|
|
|
|
if (!SvROK(inner_ref) || SvTYPE(SvRV(inner_ref)) != SVt_PVHV) |
|
|
|
50
|
|
|
|
|
|
|
9305
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Internal error – output structure corrupted"); |
|
9306
|
14
|
|
|
|
|
|
out_inner_hv = (HV *)SvRV(inner_ref); |
|
9307
|
|
|
|
|
|
|
} else { |
|
9308
|
19
|
|
|
|
|
|
out_inner_hv = newHV(); |
|
9309
|
19
|
|
|
|
|
|
inner_ref = newRV_noinc((SV *)out_inner_hv); |
|
9310
|
19
|
50
|
|
|
|
|
if (!hv_store_ent(out_hv, col_key_sv, inner_ref, 0)) { |
|
9311
|
0
|
|
|
|
|
|
SvREFCNT_dec(inner_ref); |
|
9312
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Failed to allocate inner hash"); |
|
9313
|
|
|
|
|
|
|
} |
|
9314
|
|
|
|
|
|
|
} |
|
9315
|
33
|
|
|
|
|
|
SvREFCNT_inc(val); |
|
9316
|
33
|
50
|
|
|
|
|
if (!hv_store_ent(out_inner_hv, row_key_sv, val, 0)) { |
|
9317
|
0
|
|
|
|
|
|
SvREFCNT_dec(val); |
|
9318
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Failed to store transposed value"); |
|
9319
|
|
|
|
|
|
|
} |
|
9320
|
|
|
|
|
|
|
} |
|
9321
|
|
|
|
|
|
|
} |
|
9322
|
23
|
100
|
|
|
|
|
} else if (ref_type == SVt_PVAV) { // Array-of-Arrays |
|
9323
|
22
|
|
|
|
|
|
AV *restrict in_av = (AV *)SvRV(input_ref); |
|
9324
|
22
|
|
|
|
|
|
AV *restrict out_av = newAV(); |
|
9325
|
22
|
|
|
|
|
|
SSize_t nrows = av_len(in_av) + 1; |
|
9326
|
22
|
|
|
|
|
|
SSize_t ncols = 0; |
|
9327
|
22
|
|
|
|
|
|
retval_sv = sv_2mortal(newRV_noinc((SV *)out_av)); |
|
9328
|
22
|
100
|
|
|
|
|
if (nrows > 0) {// Pass 1: validate all rows; fix ncols from row 0 |
|
9329
|
|
|
|
|
|
|
{ |
|
9330
|
21
|
|
|
|
|
|
SV **restrict elem = av_fetch(in_av, 0, 0); |
|
9331
|
21
|
100
|
|
|
|
|
if (!elem || !*elem) |
|
|
|
50
|
|
|
|
|
|
|
9332
|
1
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – row 0 is missing"); |
|
9333
|
20
|
50
|
|
|
|
|
SvGETMAGIC(*elem); |
|
|
|
0
|
|
|
|
|
|
|
9334
|
20
|
100
|
|
|
|
|
if (!SvROK(*elem) || SvTYPE(SvRV(*elem)) != SVt_PVAV) |
|
|
|
100
|
|
|
|
|
|
|
9335
|
2
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – row 0 is not an array ref"); |
|
9336
|
18
|
|
|
|
|
|
ncols = av_len((AV *)SvRV(*elem)) + 1; |
|
9337
|
|
|
|
|
|
|
} |
|
9338
|
35
|
100
|
|
|
|
|
for (SSize_t i = 1; i < nrows; i++) { |
|
9339
|
19
|
|
|
|
|
|
SV **restrict elem = av_fetch(in_av, i, 0); |
|
9340
|
|
|
|
|
|
|
SSize_t row_ncols; |
|
9341
|
19
|
50
|
|
|
|
|
if (!elem || !*elem) |
|
|
|
50
|
|
|
|
|
|
|
9342
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – row %d is missing", (int)i); |
|
9343
|
19
|
50
|
|
|
|
|
SvGETMAGIC(*elem); |
|
|
|
0
|
|
|
|
|
|
|
9344
|
19
|
50
|
|
|
|
|
if (!SvROK(*elem) || SvTYPE(SvRV(*elem)) != SVt_PVAV) |
|
|
|
50
|
|
|
|
|
|
|
9345
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – row %d is not an array ref", (int)i); |
|
9346
|
19
|
|
|
|
|
|
row_ncols = av_len((AV *)SvRV(*elem)) + 1; |
|
9347
|
19
|
100
|
|
|
|
|
if (row_ncols != ncols) |
|
9348
|
2
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – ragged array: " |
|
9349
|
|
|
|
|
|
|
"row 0 has %d cols, row %d has %d", |
|
9350
|
|
|
|
|
|
|
(int)ncols, (int)i, (int)row_ncols); |
|
9351
|
|
|
|
|
|
|
} |
|
9352
|
|
|
|
|
|
|
// Pass 2: output[j][i] = input[i][j] |
|
9353
|
16
|
100
|
|
|
|
|
if (ncols > 0) { |
|
9354
|
15
|
|
|
|
|
|
av_extend(out_av, ncols - 1); |
|
9355
|
47
|
100
|
|
|
|
|
for (SSize_t j = 0; j < ncols; j++) { |
|
9356
|
32
|
|
|
|
|
|
AV *restrict out_col_av = newAV(); |
|
9357
|
32
|
|
|
|
|
|
SV *restrict col_ref = newRV_noinc((SV *)out_col_av); |
|
9358
|
32
|
50
|
|
|
|
|
if (!av_store(out_av, j, col_ref)) { |
|
9359
|
0
|
|
|
|
|
|
SvREFCNT_dec(col_ref); |
|
9360
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – " |
|
9361
|
|
|
|
|
|
|
"failed to allocate output column %d", (int)j); |
|
9362
|
|
|
|
|
|
|
} |
|
9363
|
32
|
|
|
|
|
|
av_extend(out_col_av, nrows - 1); |
|
9364
|
99
|
100
|
|
|
|
|
for (SSize_t i = 0; i < nrows; i++) { |
|
9365
|
67
|
|
|
|
|
|
SV **restrict elem = av_fetch(in_av, i, 0); |
|
9366
|
67
|
50
|
|
|
|
|
if (elem && *elem) { |
|
|
|
50
|
|
|
|
|
|
|
9367
|
67
|
50
|
|
|
|
|
SvGETMAGIC(*elem); |
|
|
|
0
|
|
|
|
|
|
|
9368
|
|
|
|
|
|
|
} |
|
9369
|
67
|
|
|
|
|
|
AV *restrict in_row_av = (AV *)SvRV(*elem); |
|
9370
|
67
|
|
|
|
|
|
SV **restrict val_ptr = av_fetch(in_row_av, j, 0); |
|
9371
|
67
|
100
|
|
|
|
|
SV *restrict val = (val_ptr && *val_ptr) ? *val_ptr : &PL_sv_undef; |
|
|
|
50
|
|
|
|
|
|
|
9372
|
67
|
50
|
|
|
|
|
SvGETMAGIC(val); |
|
|
|
0
|
|
|
|
|
|
|
9373
|
67
|
|
|
|
|
|
SvREFCNT_inc(val); |
|
9374
|
67
|
50
|
|
|
|
|
if (!av_store(out_col_av, i, val)) { |
|
9375
|
0
|
|
|
|
|
|
SvREFCNT_dec(val); |
|
9376
|
0
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Array mode – " |
|
9377
|
|
|
|
|
|
|
"failed to store [%d][%d]", (int)j, (int)i); |
|
9378
|
|
|
|
|
|
|
} |
|
9379
|
|
|
|
|
|
|
} |
|
9380
|
|
|
|
|
|
|
} |
|
9381
|
|
|
|
|
|
|
} |
|
9382
|
|
|
|
|
|
|
} |
|
9383
|
|
|
|
|
|
|
} else { // Unsupported |
|
9384
|
1
|
|
|
|
|
|
croak("Stats::LikeR::transpose: Input must be a hash ref or array ref"); |
|
9385
|
|
|
|
|
|
|
} |
|
9386
|
29
|
|
|
|
|
|
RETVAL = SvREFCNT_inc(retval_sv); |
|
9387
|
|
|
|
|
|
|
OUTPUT: |
|
9388
|
|
|
|
|
|
|
RETVAL |