line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::MUST::Core::SeqMask::Profiles; |
2
|
|
|
|
|
|
|
# ABSTRACT: Evolutionary profiles for sequence sites |
3
|
|
|
|
|
|
|
$Bio::MUST::Core::SeqMask::Profiles::VERSION = '0.212530'; |
4
|
17
|
|
|
17
|
|
140
|
use Moose; |
|
17
|
|
|
|
|
47
|
|
|
17
|
|
|
|
|
160
|
|
5
|
17
|
|
|
17
|
|
126594
|
use namespace::autoclean; |
|
17
|
|
|
|
|
55
|
|
|
17
|
|
|
|
|
207
|
|
6
|
|
|
|
|
|
|
|
7
|
17
|
|
|
17
|
|
1899
|
use autodie; |
|
17
|
|
|
|
|
52
|
|
|
17
|
|
|
|
|
224
|
|
8
|
17
|
|
|
17
|
|
96021
|
use feature qw(say); |
|
17
|
|
|
|
|
56
|
|
|
17
|
|
|
|
|
1809
|
|
9
|
|
|
|
|
|
|
|
10
|
17
|
|
|
17
|
|
169
|
use Carp; |
|
17
|
|
|
|
|
54
|
|
|
17
|
|
|
|
|
1571
|
|
11
|
17
|
|
|
17
|
|
128
|
use Const::Fast; |
|
17
|
|
|
|
|
38
|
|
|
17
|
|
|
|
|
218
|
|
12
|
17
|
|
|
17
|
|
1162
|
use Tie::IxHash; |
|
17
|
|
|
|
|
43
|
|
|
17
|
|
|
|
|
796
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
extends 'Bio::MUST::Core::SeqMask'; |
15
|
|
|
|
|
|
|
|
16
|
17
|
|
|
17
|
|
137
|
use Bio::MUST::Core::Types; |
|
17
|
|
|
|
|
50
|
|
|
17
|
|
|
|
|
644
|
|
17
|
17
|
|
|
17
|
|
125
|
use Bio::MUST::Core::Constants qw(:files); |
|
17
|
|
|
|
|
38
|
|
|
17
|
|
|
|
|
3925
|
|
18
|
17
|
|
|
17
|
|
161
|
use aliased 'Bio::MUST::Core::SeqMask::Freqs'; |
|
17
|
|
|
|
|
79
|
|
|
17
|
|
|
|
|
173
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# override superclass' Bool type |
22
|
|
|
|
|
|
|
# Note: mask indices are as follow: [site]{full_id}{AA} |
23
|
|
|
|
|
|
|
# mask values are AA freqs (both per seq and averaged over seqs) |
24
|
|
|
|
|
|
|
has '+mask' => ( |
25
|
|
|
|
|
|
|
isa => 'ArrayRef[HashRef[HashRef[Num]]]', |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# TODO: mask non-applicable methods from superclass? (Liskov principle) |
29
|
|
|
|
|
|
|
# TODO: move this under PostPred instead of SeqMask? |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
const my $AVERAGE => '<:AVERAGE:>'; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub ppred_profiles { |
35
|
2
|
|
|
2
|
1
|
36
|
my $class = shift; |
36
|
2
|
|
|
|
|
5
|
my $alis = shift; |
37
|
2
|
|
100
|
|
|
23
|
my $args = shift // {}; # HashRef (should not be empty...) |
38
|
|
|
|
|
|
|
|
39
|
2
|
|
|
|
|
7
|
my $list = $args->{sim_list}; |
40
|
|
|
|
|
|
|
|
41
|
2
|
|
|
|
|
18
|
my @sim_freq_at_for; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
my $regex; |
44
|
2
|
|
|
|
|
0
|
my $width; |
45
|
2
|
|
|
|
|
0
|
my $seq_inc; |
46
|
2
|
|
|
|
|
0
|
my $avg_inc; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# loop through Ali objects to build site profiles |
49
|
|
|
|
|
|
|
# Note: profiles will be available both per seq and averaged over seqs |
50
|
2
|
|
|
|
|
4
|
for my $ali ( @{$alis} ) { |
|
2
|
|
|
|
|
9
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# extract seqs on which to compute freqs (defaults to all seqs) |
53
|
100
|
100
|
|
|
|
538
|
my $sample = $list ? $list->filtered_ali($ali) : $ali; |
54
|
100
|
|
|
|
|
4016
|
my @seqs = $sample->all_seqs; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# setup mask details based on first Ali |
57
|
100
|
100
|
|
|
|
418
|
unless ($regex) { |
58
|
2
|
|
|
|
|
12
|
$regex = $ali->gapmiss_regex; |
59
|
2
|
|
|
|
|
12
|
$width = $ali->width; |
60
|
2
|
|
|
|
|
7
|
$seq_inc = 1.0 / @{$alis}; |
|
2
|
|
|
|
|
12
|
|
61
|
2
|
|
|
|
|
17
|
$avg_inc = $seq_inc / @seqs; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# loop through simulated seqs to store and average ppred state freqs |
65
|
100
|
|
|
|
|
270
|
for my $seq (@seqs) { |
66
|
4000
|
|
|
|
|
18351
|
my $full_id = $seq->full_id; |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# store and average ppred state freq at each site for current seq |
69
|
|
|
|
|
|
|
# Note: all missing/gap states are folded to '*' |
70
|
4000
|
|
|
|
|
16005
|
for my $site (0..$width-1) { |
71
|
740000
|
|
|
|
|
1643827
|
my $sim_state = uc $seq->state_at($site); |
72
|
740000
|
50
|
|
|
|
2455809
|
$sim_state = '*' if $sim_state =~ m/$regex/xms; |
73
|
740000
|
|
|
|
|
2183474
|
$sim_freq_at_for[$site]{$full_id}{$sim_state} += $seq_inc; |
74
|
740000
|
|
|
|
|
1804873
|
$sim_freq_at_for[$site]{$AVERAGE}{$sim_state} += $avg_inc; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
|
79
|
2
|
|
|
|
|
89
|
return $class->new( mask => \@sim_freq_at_for ); |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub ppred_freqs { |
85
|
3
|
|
|
3
|
1
|
37
|
my $self = shift; |
86
|
3
|
|
|
|
|
9
|
my $ali = shift; |
87
|
3
|
|
100
|
|
|
20
|
my $args = shift // {}; # HashRef (should not be empty...) |
88
|
|
|
|
|
|
|
|
89
|
3
|
|
100
|
|
|
18
|
my $by_seq = $args->{by_seq} // 0; |
90
|
3
|
|
|
|
|
8
|
my $list = $args->{obs_list}; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# ppred_freqs_by_seq |
93
|
|
|
|
|
|
|
# input: f_AA(site,seq) |
94
|
|
|
|
|
|
|
# output: f_AAobs(site,seq) |
95
|
|
|
|
|
|
|
# => mask f_AAobs_avg(site) over seq |
96
|
|
|
|
|
|
|
# => seq sort f_AAobs_avg(seq) over sites |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# ppred_freqs |
99
|
|
|
|
|
|
|
# input: f_AA_avg(site) over seq |
100
|
|
|
|
|
|
|
# output: f_AAobs(site,seq) |
101
|
|
|
|
|
|
|
# => mask f_AAobs_avg(site) over seq |
102
|
|
|
|
|
|
|
# => seq sort f_AAobs_avg(seq) over sites |
103
|
|
|
|
|
|
|
|
104
|
3
|
|
|
|
|
44
|
tie my %obs_freq_for_at, 'Tie::IxHash'; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# extract seqs on which to compute freqs (defaults to all seqs) |
107
|
3
|
100
|
|
|
|
81
|
my $sample = $list ? $list->filtered_ali($ali) : $ali; |
108
|
3
|
|
|
|
|
147
|
my @seqs = $sample->all_seqs; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# setup mask details |
111
|
3
|
|
|
|
|
17
|
my $regex = $ali->gapmiss_regex; |
112
|
3
|
|
|
|
|
15
|
my $width = $ali->width; |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# loop through real seqs to store observed state freqs |
115
|
|
|
|
|
|
|
# Note: ppred state freqs are either by seq or averaged over seqs |
116
|
3
|
|
|
|
|
13
|
for my $seq (@seqs) { |
117
|
161
|
|
|
|
|
1533
|
my $full_id = $seq->full_id; |
118
|
161
|
100
|
|
|
|
400
|
my $seq_key = $by_seq ? $full_id : $AVERAGE; |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# store observed state freq at each site for current seq |
121
|
|
|
|
|
|
|
# Note: sites with missing/gap state get a max freq of 1.0 |
122
|
161
|
|
|
|
|
487
|
for my $site (0..$width-1) { |
123
|
29785
|
|
|
|
|
199913
|
my $sim_freq_for = ${ $self->state_at($site) }{$seq_key}; |
|
29785
|
|
|
|
|
930354
|
|
124
|
29785
|
|
|
|
|
63073
|
my $obs_state = uc $seq->state_at($site); |
125
|
|
|
|
|
|
|
my $obs_freq = $obs_state =~ m/$regex/xms ? 1.0 |
126
|
29785
|
100
|
100
|
|
|
124643
|
: $sim_freq_for->{$obs_state} // 0.0; |
127
|
29785
|
|
|
|
|
95661
|
$obs_freq_for_at{$full_id}[$site] = $obs_freq; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
3
|
|
|
|
|
167
|
return Freqs->new( freq_for_at => \%obs_freq_for_at ); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub store { |
137
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
138
|
1
|
|
|
|
|
3
|
my $outfile = shift; |
139
|
|
|
|
|
|
|
|
140
|
1
|
|
|
|
|
8
|
open my $out, '>', $outfile; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# output header |
143
|
1
|
|
|
|
|
2190
|
say {$out} join "\t", qw(site seq aa), 'f(i,j)'; |
|
1
|
|
|
|
|
9
|
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# setup loop |
146
|
1
|
|
|
|
|
3
|
my @ids; |
147
|
1
|
|
|
|
|
12
|
my @aas = split //xms, 'ACDEFGHIKLMNPQRSTVWY*'; # TODO: improve this |
148
|
1
|
|
|
|
|
41
|
my $width = $self->mask_len; |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# loop through sites, ids and AAs to output ppred state freqs |
151
|
1
|
|
|
|
|
6
|
for my $site (0..$width-1) { |
152
|
185
|
|
|
|
|
7722
|
my $sim_freq_for = $self->state_at($site); |
153
|
185
|
100
|
50
|
|
|
398
|
@ids = sort keys %{ $sim_freq_for // {} } unless @ids; |
|
1
|
|
|
|
|
50
|
|
154
|
|
|
|
|
|
|
# get ids from first state |
155
|
|
|
|
|
|
|
ID: |
156
|
185
|
|
|
|
|
331
|
for my $id (@ids) { |
157
|
14800
|
100
|
|
|
|
24850
|
next ID if $id eq $AVERAGE; # skip averaged ppred freqs |
158
|
14615
|
|
|
|
|
19273
|
for my $aa (@aas) { |
159
|
306915
|
|
|
|
|
966634
|
say {$out} join "\t", $site + 1, $id, $aa, |
160
|
306915
|
|
100
|
|
|
365255
|
$sim_freq_for->{$id}{$aa} // 0; |
161
|
|
|
|
|
|
|
} # missing AAs are set to 0 (hence the alphabet) |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
1
|
|
|
|
|
43
|
return; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
169
|
|
|
|
|
|
|
1; |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
__END__ |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=pod |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 NAME |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Bio::MUST::Core::SeqMask::Profiles - Evolutionary profiles for sequence sites |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 VERSION |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
version 0.212530 |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 SYNOPSIS |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# TODO |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head1 DESCRIPTION |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# TODO |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=head1 METHODS |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head2 ppred_profiles |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 ppred_freqs |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 store |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head1 AUTHOR |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Denis BAURAIN <denis.baurain@uliege.be> |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
208
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=cut |