line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Peptide::Kmers; |
2
|
|
|
|
|
|
|
# Peptide::Kmers - provides log_prop_kmers and kmers methods. |
3
|
|
|
|
|
|
|
|
4
|
2
|
|
|
2
|
|
24136
|
use Carp; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
188
|
|
5
|
2
|
|
|
2
|
|
12
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
64
|
|
6
|
2
|
|
|
2
|
|
11
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
1521
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head2 new |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Args : none |
11
|
|
|
|
|
|
|
Example : my $pk = Peptide::Kmers->new(verbose => 2); |
12
|
|
|
|
|
|
|
Description : bare constructor |
13
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
sub new { |
18
|
1
|
|
|
1
|
1
|
17
|
my $class = shift; |
19
|
1
|
|
33
|
|
|
14
|
my $self = bless { verbose => 1, @_ }, ref($class) || $class; |
20
|
1
|
|
|
|
|
5
|
return $self; |
21
|
|
|
|
|
|
|
} |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head2 log_prop_kmers |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
Args : %named_parameters: |
26
|
|
|
|
|
|
|
k : length of the k-mer. Allowed values: positive integer. Mandatory. |
27
|
|
|
|
|
|
|
in_fh : filehandle for the input file. If missing, STDIN is used. |
28
|
|
|
|
|
|
|
min : min number of occurrences. Allowed values: non-negative real number. Mandatory. Suggested values: 1 (equivalent to assuming that the k-mer occurred once even if it actually did not occur) or 0.5. |
29
|
|
|
|
|
|
|
maxtotal : max total number of occurrences. Allowed values: positive integer. Optional. |
30
|
|
|
|
|
|
|
Example : my %log_prop = $pk->log_prop_kmers(k => 3, min => 1, text => [ qw(aecaecaecd aecd xyz123) ]); |
31
|
|
|
|
|
|
|
Description : For all overlapping k-mers (words of length k) in the input, computes the frequency of occurrence within the input text. Only allowed k-mers (those returned by kmers()) are counted. If the k-mer does not occur, min number of occurrences (eg, 0.5 or 1) is used as default threshold. Computes log10(proportion(k-mer)), where proportion(k-mer) = frequency(k-mer) / sum( all frequencies). For example, if input has 2 words: 'fooo' and 'bar', and k=2, the k-mers are: 'fo', 'oo', 'oo', 'ba', 'ar'. The frequencies are: 2 for 'oo', and 1 for the rest of the k-mers. Prints sum( all frequencies) into STDERR. If maxtotal is defined, input processing is stopped if sum( all frequencies) >= maxtotal. If this prevents any input from being processed, a warning is printed into STDERR. All warning can be suppressed by setting verbose arg to 1 or below). For comparing log10(proportion(k-mer)) from 2 different text sources, for example sequences and non-sequences, it is best to compute using the same sum( all frequencies). Thus, execute the code twice. First time, execute using undefined maxtotal, just to compute the sum( all frequencies) for the entire text from each of the sources. Second time, execute using the same maxtotal = min sum( all frequencies) for each source. |
32
|
|
|
|
|
|
|
Returns : hash with keys = k-mers and values = log10(proportion(k-mer)) if successful, FALSE otherwise. |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=cut |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub log_prop_kmers { |
37
|
3
|
|
|
3
|
1
|
31067
|
my ($self, %args) = @_; |
38
|
3
|
50
|
0
|
|
|
17
|
$args{k} > 0 or carp "not ok: got k = $args{k}, expected > 0" and return; |
39
|
3
|
50
|
0
|
|
|
15
|
$args{min} >= 0 or carp "not ok: got min = $args{min}, expected >= 0" and return; |
40
|
3
|
50
|
0
|
|
|
24
|
not defined $args{maxtotal} or $args{maxtotal} > 0 or |
|
|
|
66
|
|
|
|
|
41
|
|
|
|
|
|
|
carp "not ok: got maxtotal = $args{maxtotal}, expected undefined or > 0" and return; |
42
|
3
|
|
33
|
|
|
13
|
my $in_fh = $args{in_fh} || *STDIN; |
43
|
3
|
|
|
|
|
54
|
my $re = qr/(?=(.{$args{k}}))/; |
44
|
3
|
|
|
|
|
22
|
my %freq = map { $_ => 0 } $self->kmers(%args); |
|
52728
|
|
|
|
|
98716
|
|
45
|
|
|
|
|
|
|
# min $freq_total, if all k-mers did not occur: |
46
|
3
|
|
|
|
|
9658
|
my $num_kmers = keys %freq; |
47
|
3
|
|
|
|
|
18
|
my $freq_total = $args{min} * $num_kmers; |
48
|
3
|
50
|
|
|
|
31
|
if ($self->{verbose} > 1) { |
49
|
0
|
0
|
0
|
|
|
0
|
carp "WARNING: min * num_kmers >= maxtotal ", |
50
|
|
|
|
|
|
|
"($args{min} * $num_kmers >= $args{maxtotal}), ", |
51
|
|
|
|
|
|
|
"no input is processed" |
52
|
|
|
|
|
|
|
if defined $args{maxtotal} and $freq_total >= $args{maxtotal}; |
53
|
|
|
|
|
|
|
} |
54
|
3
|
|
|
|
|
6
|
my $i = 0; |
55
|
3
|
|
|
|
|
1835
|
INPUT: while (<$in_fh>) { |
56
|
4
|
|
|
|
|
12
|
chomp; |
57
|
4
|
50
|
|
|
|
16
|
if ($self->{verbose} > 1) { |
58
|
0
|
0
|
|
|
|
0
|
if (not ++$i % 10_000) { |
59
|
0
|
|
|
|
|
0
|
print STDERR "log_prop_kmers: processing input line $i\n"; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
} |
62
|
4
|
|
|
|
|
127
|
foreach (/$re/g) { |
63
|
23
|
100
|
100
|
|
|
79
|
last INPUT if defined $args{maxtotal} and $freq_total >= $args{maxtotal}; |
64
|
|
|
|
|
|
|
# count only allowed k-mers: |
65
|
21
|
100
|
|
|
|
61
|
next unless defined $freq{$_}; |
66
|
|
|
|
|
|
|
# For the first occurrence of the k-mer, $freq{$_} is changed from $args{min} |
67
|
|
|
|
|
|
|
# to 1. Thus, subtract $args{min} from $freq_total. |
68
|
|
|
|
|
|
|
# $freq{$_} is assigned to $args{min} separately, below, |
69
|
|
|
|
|
|
|
# because testing if $freq{$_} is not ambiguous, and $freq{$_} == $args{min} |
70
|
|
|
|
|
|
|
# is ambiguous for $args{min} = 1 and for k-mer that actually occurred once. |
71
|
|
|
|
|
|
|
# Note that k-mers that occur only once do not increase $freq_total |
72
|
|
|
|
|
|
|
# if $args{min} = 1 |
73
|
15
|
100
|
|
|
|
37
|
$freq_total -= $args{min} unless $freq{$_}; |
74
|
15
|
|
|
|
|
14
|
$freq_total++; |
75
|
15
|
|
|
|
|
29
|
$freq{$_}++; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
} |
78
|
3
|
50
|
|
|
|
16
|
print STDERR "freq_total=$freq_total" if $self->{verbose} > 1; |
79
|
3
|
|
|
|
|
11848
|
foreach (keys %freq) { |
80
|
52728
|
100
|
|
|
|
132604
|
$freq{$_} = $args{min} if $freq{$_} < $args{min}; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
#warn 'log_prop_kmers: all: ', join "; ", map { "$_ => $freq{$_}" } sort keys %freq; |
83
|
|
|
|
|
|
|
#warn 'log_prop_kmers: freq: ', join "; ", map { "$_ => $freq{$_}" } qw(acc ppp 123); |
84
|
3
|
|
50
|
|
|
7247
|
$freq_total ||= 1; # prevent division by 0 |
85
|
|
|
|
|
|
|
#warn "freq_total=$freq_total"; |
86
|
3
|
50
|
|
|
|
18
|
return unless keys %freq; |
87
|
3
|
|
|
|
|
9390
|
my %log_prop = map { ( $_ => |
|
52728
|
|
|
|
|
245743
|
|
88
|
|
|
|
|
|
|
sprintf("%.2f", |
89
|
|
|
|
|
|
|
( log($freq{$_} / $freq_total) / log(10) ) |
90
|
|
|
|
|
|
|
) |
91
|
|
|
|
|
|
|
) |
92
|
|
|
|
|
|
|
} keys %freq; |
93
|
|
|
|
|
|
|
#warn 'log_prop_kmers: log_prop: ', join "; ", map { "$_ => $log_prop{$_}" } qw(acc ppp 123); |
94
|
3
|
|
|
|
|
55630
|
return %log_prop; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head2 kmers |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Args : %named_parameters: |
100
|
|
|
|
|
|
|
mandatory: |
101
|
|
|
|
|
|
|
k : length of the k-mer. Allowed values: positive integer. |
102
|
|
|
|
|
|
|
Example : $pk->kmers(k => 3))[0,1,-1] # returns qw(aaa aab zzz) |
103
|
|
|
|
|
|
|
Description : Creates a list of different allowed k-mers (words of length k, using lowercase chars a-z). |
104
|
|
|
|
|
|
|
Returns : resulting list |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=cut |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
sub kmers { |
109
|
4
|
|
|
4
|
1
|
956
|
my ($self, %args) = @_; |
110
|
4
|
50
|
0
|
|
|
18
|
$args{k} > 0 or carp "not ok: got k = $args{k}, expected > 0" and return; |
111
|
4
|
|
|
|
|
12
|
my @kmers = (''); |
112
|
|
|
|
|
|
|
# grow @kmers by adding to each existing el 1 char from the list of allowed chars |
113
|
4
|
|
|
|
|
54
|
for my $i (1..$args{k}) { |
114
|
12
|
|
|
|
|
16
|
my @kmers_new; |
115
|
12
|
|
|
|
|
22
|
foreach my $kmer (@kmers) { |
116
|
|
|
|
|
|
|
# keep allowed chars in kmers(): 'a'..'z' in sync with |
117
|
|
|
|
|
|
|
# WordPropProtein() args : lc of tr/A-Za-z//cd; |
118
|
2812
|
|
|
|
|
5483
|
foreach my $char ( 'a'..'z' ) { |
119
|
73112
|
|
|
|
|
109200
|
push @kmers_new, "$kmer$char"; |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
} |
122
|
12
|
|
|
|
|
18304
|
@kmers = @kmers_new; |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
#warn "@kmers"; |
125
|
4
|
|
|
|
|
19828
|
return @kmers; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
1; |