| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Proch::N50; |
|
2
|
|
|
|
|
|
|
#ABSTRACT: Calculate N50 and related statistics for FASTA/FASTQ files with minimal dependencies |
|
3
|
|
|
|
|
|
|
|
|
4
|
10
|
|
|
10
|
|
395785
|
use strict; |
|
|
10
|
|
|
|
|
16
|
|
|
|
10
|
|
|
|
|
341
|
|
|
5
|
10
|
|
|
10
|
|
82
|
use warnings; |
|
|
10
|
|
|
|
|
15
|
|
|
|
10
|
|
|
|
|
458
|
|
|
6
|
10
|
|
|
10
|
|
174
|
use 5.012; |
|
|
10
|
|
|
|
|
32
|
|
|
7
|
10
|
|
|
10
|
|
54
|
use Carp qw(croak); |
|
|
10
|
|
|
|
|
48
|
|
|
|
10
|
|
|
|
|
712
|
|
|
8
|
10
|
|
|
10
|
|
52
|
use File::Spec; |
|
|
10
|
|
|
|
|
13
|
|
|
|
10
|
|
|
|
|
313
|
|
|
9
|
10
|
|
|
10
|
|
6635
|
use JSON::PP; |
|
|
10
|
|
|
|
|
168400
|
|
|
|
10
|
|
|
|
|
942
|
|
|
10
|
10
|
|
|
10
|
|
5573
|
use FASTX::Reader; |
|
|
10
|
|
|
|
|
290177
|
|
|
|
10
|
|
|
|
|
661
|
|
|
11
|
10
|
|
|
10
|
|
98
|
use File::Basename; |
|
|
10
|
|
|
|
|
23
|
|
|
|
10
|
|
|
|
|
964
|
|
|
12
|
10
|
|
|
10
|
|
65
|
use Exporter qw(import); |
|
|
10
|
|
|
|
|
23
|
|
|
|
10
|
|
|
|
|
8520
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '1.7.0'; |
|
15
|
|
|
|
|
|
|
our @EXPORT = qw(getStats getN50 jsonStats); |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Configuration |
|
18
|
|
|
|
|
|
|
our $DEFAULT_DIGITS = 2; |
|
19
|
|
|
|
|
|
|
our $MIN_CUSTOM_N = 0; |
|
20
|
|
|
|
|
|
|
our $MAX_CUSTOM_N = 100; |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub getStats { |
|
24
|
12
|
|
|
12
|
1
|
1226404
|
my ($file, $wantJSON, $customN) = @_; |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Input validation |
|
27
|
12
|
50
|
|
|
|
77
|
croak "Missing input file parameter" unless defined $file; |
|
28
|
|
|
|
|
|
|
|
|
29
|
12
|
50
|
|
|
|
37
|
if (defined $customN) { |
|
30
|
0
|
0
|
0
|
|
|
0
|
croak sprintf("Custom N value must be between %d and %d", |
|
31
|
|
|
|
|
|
|
$MIN_CUSTOM_N, $MAX_CUSTOM_N) |
|
32
|
|
|
|
|
|
|
if ($customN > $MAX_CUSTOM_N || $customN < $MIN_CUSTOM_N); |
|
33
|
|
|
|
|
|
|
} |
|
34
|
|
|
|
|
|
|
|
|
35
|
12
|
|
|
|
|
1605
|
my $stats = { |
|
36
|
|
|
|
|
|
|
status => 1, |
|
37
|
|
|
|
|
|
|
N50 => undef, |
|
38
|
|
|
|
|
|
|
filename => basename($file), |
|
39
|
|
|
|
|
|
|
dirname => dirname($file), |
|
40
|
|
|
|
|
|
|
path => File::Spec->rel2abs($file) |
|
41
|
|
|
|
|
|
|
}; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# File existence check |
|
44
|
12
|
100
|
66
|
|
|
343
|
unless ($file eq '-' || -e $file) { |
|
45
|
3
|
|
|
|
|
10
|
$stats->{status} = 0; |
|
46
|
3
|
|
|
|
|
11
|
$stats->{message} = "File not found: $file"; |
|
47
|
3
|
|
|
|
|
11
|
return $stats; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# Initialize sequence reader |
|
51
|
9
|
50
|
|
|
|
133
|
my $reader = ($file eq '-') |
|
52
|
|
|
|
|
|
|
? FASTX::Reader->new({ filename => '{{STDIN}}' }) |
|
53
|
|
|
|
|
|
|
: FASTX::Reader->new({ filename => $file }); |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Process sequences |
|
56
|
9
|
|
|
|
|
2944
|
my %sizes; |
|
57
|
9
|
|
|
|
|
25
|
my ($total_seqs, $total_size) = (0, 0); |
|
58
|
|
|
|
|
|
|
|
|
59
|
9
|
|
|
|
|
62
|
while (my $seq = $reader->getRead()) { |
|
60
|
54
|
|
|
|
|
3506
|
$total_seqs++; |
|
61
|
54
|
|
|
|
|
131
|
my $length = length($seq->{seq}); |
|
62
|
54
|
|
|
|
|
143
|
$sizes{$length}++; |
|
63
|
54
|
|
|
|
|
243
|
$total_size += $length; |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Calculate statistics |
|
67
|
9
|
|
|
|
|
217
|
my ($n50, $min, $max, $auN, $n75, $n90, $nx) = |
|
68
|
|
|
|
|
|
|
_calculateMetrics(\%sizes, $total_size, $customN); |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Populate results |
|
71
|
9
|
|
|
|
|
58
|
$stats->{size} = $total_size; |
|
72
|
9
|
|
|
|
|
26
|
$stats->{seqs} = $total_seqs; |
|
73
|
9
|
|
|
|
|
21
|
$stats->{N50} = $n50; |
|
74
|
9
|
|
|
|
|
21
|
$stats->{N75} = $n75; |
|
75
|
9
|
|
|
|
|
21
|
$stats->{N90} = $n90; |
|
76
|
9
|
|
|
|
|
51
|
$stats->{min} = $min; |
|
77
|
9
|
|
|
|
|
25
|
$stats->{max} = $max; |
|
78
|
9
|
|
|
|
|
111
|
$stats->{auN} = sprintf("%.${DEFAULT_DIGITS}f", $auN); |
|
79
|
9
|
50
|
|
|
|
51
|
$stats->{Nx} = $nx if defined $customN; |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# Generate JSON if requested |
|
82
|
9
|
100
|
|
|
|
28
|
if ($wantJSON) { |
|
83
|
5
|
|
|
|
|
52
|
$stats->{json} = JSON::PP->new->pretty->encode($stats); |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
|
|
86
|
9
|
|
|
|
|
3056
|
return $stats; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub _calculateMetrics { |
|
90
|
9
|
|
|
9
|
|
33
|
my ($sizes, $total_size, $custom_n) = @_; |
|
91
|
|
|
|
|
|
|
|
|
92
|
9
|
|
|
|
|
48
|
my ($n50, $n75, $n90, $nx) = (0, 0, 0, 0); |
|
93
|
9
|
|
|
|
|
51
|
my $progressive_sum = 0; |
|
94
|
9
|
|
|
|
|
26
|
my $auN = 0; |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# Sort lengths in DESCENDING order |
|
97
|
9
|
|
|
|
|
62
|
my @sorted_lengths = sort { $b <=> $a } keys %$sizes; |
|
|
38
|
|
|
|
|
95
|
|
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Get min and max lengths |
|
100
|
9
|
|
|
|
|
21
|
my $min = $sorted_lengths[-1]; |
|
101
|
9
|
|
|
|
|
19
|
my $max = $sorted_lengths[0]; |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Iterate over sorted lengths |
|
104
|
9
|
|
|
|
|
25
|
foreach my $length (@sorted_lengths) { |
|
105
|
34
|
|
|
|
|
79
|
my $count = $sizes->{$length}; # Number of sequences of this length |
|
106
|
34
|
|
|
|
|
54
|
my $total_length = $length * $count; # Total length contributed by these sequences |
|
107
|
34
|
|
|
|
|
49
|
$progressive_sum += $total_length; # Add to cumulative sum |
|
108
|
34
|
|
|
|
|
64
|
$auN += $total_length * ($total_length / $total_size); # auN calculation |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Check thresholds for N50, N75, N90 |
|
111
|
34
|
100
|
66
|
|
|
121
|
if (!$n50 && $progressive_sum >= ($total_size * 0.5)) { |
|
112
|
9
|
|
|
|
|
32
|
$n50 = $length; |
|
113
|
|
|
|
|
|
|
} |
|
114
|
34
|
100
|
100
|
|
|
114
|
if (!$n75 && $progressive_sum >= ($total_size * 0.75)) { |
|
115
|
9
|
|
|
|
|
26
|
$n75 = $length; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
34
|
100
|
100
|
|
|
120
|
if (!$n90 && $progressive_sum >= ($total_size * 0.9)) { |
|
118
|
9
|
|
|
|
|
37
|
$n90 = $length; |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Custom Nx calculation |
|
122
|
34
|
50
|
33
|
|
|
116
|
if (!$nx && defined $custom_n) { |
|
123
|
0
|
|
|
|
|
0
|
my $threshold = $total_size * ($custom_n / 100); |
|
124
|
0
|
0
|
|
|
|
0
|
if ($progressive_sum >= $threshold) { |
|
125
|
0
|
|
|
|
|
0
|
$nx = $length; |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
9
|
|
|
|
|
77
|
return ($n50, $min, $max, $auN, $n75, $n90, $nx); |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub getN50 { |
|
136
|
3
|
|
|
3
|
1
|
433727
|
my ($file) = @_; |
|
137
|
3
|
|
|
|
|
16
|
my $stats = getStats($file); |
|
138
|
3
|
50
|
|
|
|
33
|
return $stats->{status} ? $stats->{N50} : 0; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
sub jsonStats { |
|
143
|
2
|
|
|
2
|
1
|
4367
|
my ($file) = @_; |
|
144
|
2
|
|
|
|
|
12
|
my $stats = getStats($file, 'JSON'); |
|
145
|
2
|
|
|
|
|
10
|
return $stats->{json}; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
1; |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
__END__ |