File Coverage

blib/lib/Bio/MUST/Core/SeqMask/Profiles.pm
Criterion Covered Total %
statement 94 94 100.0
branch 15 16 93.7
condition 11 12 91.6
subroutine 13 13 100.0
pod 3 3 100.0
total 136 138 98.5


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::SeqMask::Profiles;
2             # ABSTRACT: Evolutionary profiles for sequence sites
3             $Bio::MUST::Core::SeqMask::Profiles::VERSION = '0.212530';
4 17     17   140 use Moose;
  17         47  
  17         160  
5 17     17   126594 use namespace::autoclean;
  17         55  
  17         207  
6              
7 17     17   1899 use autodie;
  17         52  
  17         224  
8 17     17   96021 use feature qw(say);
  17         56  
  17         1809  
9              
10 17     17   169 use Carp;
  17         54  
  17         1571  
11 17     17   128 use Const::Fast;
  17         38  
  17         218  
12 17     17   1162 use Tie::IxHash;
  17         43  
  17         796  
13              
14             extends 'Bio::MUST::Core::SeqMask';
15              
16 17     17   137 use Bio::MUST::Core::Types;
  17         50  
  17         644  
17 17     17   125 use Bio::MUST::Core::Constants qw(:files);
  17         38  
  17         3925  
18 17     17   161 use aliased 'Bio::MUST::Core::SeqMask::Freqs';
  17         79  
  17         173  
19              
20              
21             # override superclass' Bool type
22             # Note: mask indices are as follow: [site]{full_id}{AA}
23             # mask values are AA freqs (both per seq and averaged over seqs)
24             has '+mask' => (
25             isa => 'ArrayRef[HashRef[HashRef[Num]]]',
26             );
27              
28             # TODO: mask non-applicable methods from superclass? (Liskov principle)
29             # TODO: move this under PostPred instead of SeqMask?
30              
31             const my $AVERAGE => '<:AVERAGE:>';
32              
33              
34             sub ppred_profiles {
35 2     2 1 36 my $class = shift;
36 2         5 my $alis = shift;
37 2   100     23 my $args = shift // {}; # HashRef (should not be empty...)
38              
39 2         7 my $list = $args->{sim_list};
40              
41 2         18 my @sim_freq_at_for;
42              
43             my $regex;
44 2         0 my $width;
45 2         0 my $seq_inc;
46 2         0 my $avg_inc;
47              
48             # loop through Ali objects to build site profiles
49             # Note: profiles will be available both per seq and averaged over seqs
50 2         4 for my $ali ( @{$alis} ) {
  2         9  
51              
52             # extract seqs on which to compute freqs (defaults to all seqs)
53 100 100       538 my $sample = $list ? $list->filtered_ali($ali) : $ali;
54 100         4016 my @seqs = $sample->all_seqs;
55              
56             # setup mask details based on first Ali
57 100 100       418 unless ($regex) {
58 2         12 $regex = $ali->gapmiss_regex;
59 2         12 $width = $ali->width;
60 2         7 $seq_inc = 1.0 / @{$alis};
  2         12  
61 2         17 $avg_inc = $seq_inc / @seqs;
62             }
63              
64             # loop through simulated seqs to store and average ppred state freqs
65 100         270 for my $seq (@seqs) {
66 4000         18351 my $full_id = $seq->full_id;
67              
68             # store and average ppred state freq at each site for current seq
69             # Note: all missing/gap states are folded to '*'
70 4000         16005 for my $site (0..$width-1) {
71 740000         1643827 my $sim_state = uc $seq->state_at($site);
72 740000 50       2455809 $sim_state = '*' if $sim_state =~ m/$regex/xms;
73 740000         2183474 $sim_freq_at_for[$site]{$full_id}{$sim_state} += $seq_inc;
74 740000         1804873 $sim_freq_at_for[$site]{$AVERAGE}{$sim_state} += $avg_inc;
75             }
76             }
77             }
78              
79 2         89 return $class->new( mask => \@sim_freq_at_for );
80             }
81              
82              
83              
84             sub ppred_freqs {
85 3     3 1 37 my $self = shift;
86 3         9 my $ali = shift;
87 3   100     20 my $args = shift // {}; # HashRef (should not be empty...)
88              
89 3   100     18 my $by_seq = $args->{by_seq} // 0;
90 3         8 my $list = $args->{obs_list};
91              
92             # ppred_freqs_by_seq
93             # input: f_AA(site,seq)
94             # output: f_AAobs(site,seq)
95             # => mask f_AAobs_avg(site) over seq
96             # => seq sort f_AAobs_avg(seq) over sites
97              
98             # ppred_freqs
99             # input: f_AA_avg(site) over seq
100             # output: f_AAobs(site,seq)
101             # => mask f_AAobs_avg(site) over seq
102             # => seq sort f_AAobs_avg(seq) over sites
103              
104 3         44 tie my %obs_freq_for_at, 'Tie::IxHash';
105              
106             # extract seqs on which to compute freqs (defaults to all seqs)
107 3 100       81 my $sample = $list ? $list->filtered_ali($ali) : $ali;
108 3         147 my @seqs = $sample->all_seqs;
109              
110             # setup mask details
111 3         17 my $regex = $ali->gapmiss_regex;
112 3         15 my $width = $ali->width;
113              
114             # loop through real seqs to store observed state freqs
115             # Note: ppred state freqs are either by seq or averaged over seqs
116 3         13 for my $seq (@seqs) {
117 161         1533 my $full_id = $seq->full_id;
118 161 100       400 my $seq_key = $by_seq ? $full_id : $AVERAGE;
119              
120             # store observed state freq at each site for current seq
121             # Note: sites with missing/gap state get a max freq of 1.0
122 161         487 for my $site (0..$width-1) {
123 29785         199913 my $sim_freq_for = ${ $self->state_at($site) }{$seq_key};
  29785         930354  
124 29785         63073 my $obs_state = uc $seq->state_at($site);
125             my $obs_freq = $obs_state =~ m/$regex/xms ? 1.0
126 29785 100 100     124643 : $sim_freq_for->{$obs_state} // 0.0;
127 29785         95661 $obs_freq_for_at{$full_id}[$site] = $obs_freq;
128             }
129             }
130              
131 3         167 return Freqs->new( freq_for_at => \%obs_freq_for_at );
132             }
133              
134              
135              
136             sub store {
137 1     1 1 3 my $self = shift;
138 1         3 my $outfile = shift;
139              
140 1         8 open my $out, '>', $outfile;
141              
142             # output header
143 1         2190 say {$out} join "\t", qw(site seq aa), 'f(i,j)';
  1         9  
144              
145             # setup loop
146 1         3 my @ids;
147 1         12 my @aas = split //xms, 'ACDEFGHIKLMNPQRSTVWY*'; # TODO: improve this
148 1         41 my $width = $self->mask_len;
149              
150             # loop through sites, ids and AAs to output ppred state freqs
151 1         6 for my $site (0..$width-1) {
152 185         7722 my $sim_freq_for = $self->state_at($site);
153 185 100 50     398 @ids = sort keys %{ $sim_freq_for // {} } unless @ids;
  1         50  
154             # get ids from first state
155             ID:
156 185         331 for my $id (@ids) {
157 14800 100       24850 next ID if $id eq $AVERAGE; # skip averaged ppred freqs
158 14615         19273 for my $aa (@aas) {
159 306915         966634 say {$out} join "\t", $site + 1, $id, $aa,
160 306915   100     365255 $sim_freq_for->{$id}{$aa} // 0;
161             } # missing AAs are set to 0 (hence the alphabet)
162             }
163             }
164              
165 1         43 return;
166             }
167              
168             __PACKAGE__->meta->make_immutable;
169             1;
170              
171             __END__
172              
173             =pod
174              
175             =head1 NAME
176              
177             Bio::MUST::Core::SeqMask::Profiles - Evolutionary profiles for sequence sites
178              
179             =head1 VERSION
180              
181             version 0.212530
182              
183             =head1 SYNOPSIS
184              
185             # TODO
186              
187             =head1 DESCRIPTION
188              
189             # TODO
190              
191             =head1 METHODS
192              
193             =head2 ppred_profiles
194              
195             =head2 ppred_freqs
196              
197             =head2 store
198              
199             =head1 AUTHOR
200              
201             Denis BAURAIN <denis.baurain@uliege.be>
202              
203             =head1 COPYRIGHT AND LICENSE
204              
205             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
206              
207             This is free software; you can redistribute it and/or modify it under
208             the same terms as the Perl 5 programming language system itself.
209              
210             =cut