File Coverage

blib/lib/Bio/MUST/Core/SeqMask/Profiles.pm
Criterion Covered Total %
statement 94 94 100.0
branch 15 16 93.7
condition 11 12 91.6
subroutine 13 13 100.0
pod 3 3 100.0
total 136 138 98.5


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::SeqMask::Profiles;
2             # ABSTRACT: Evolutionary profiles for sequence sites
3             $Bio::MUST::Core::SeqMask::Profiles::VERSION = '0.212650';
4 17     17   132 use Moose;
  17         46  
  17         151  
5 17     17   125778 use namespace::autoclean;
  17         60  
  17         192  
6              
7 17     17   1813 use autodie;
  17         51  
  17         206  
8 17     17   94265 use feature qw(say);
  17         59  
  17         1700  
9              
10 17     17   137 use Carp;
  17         47  
  17         1645  
11 17     17   134 use Const::Fast;
  17         50  
  17         192  
12 17     17   1119 use Tie::IxHash;
  17         96  
  17         863  
13              
14             extends 'Bio::MUST::Core::SeqMask';
15              
16 17     17   111 use Bio::MUST::Core::Types;
  17         62  
  17         567  
17 17     17   124 use Bio::MUST::Core::Constants qw(:files);
  17         40  
  17         3741  
18 17     17   146 use aliased 'Bio::MUST::Core::SeqMask::Freqs';
  17         42  
  17         151  
19              
20              
21             # override superclass' Bool type
22             # Note: mask indices are as follow: [site]{full_id}{AA}
23             # mask values are AA freqs (both per seq and averaged over seqs)
24             has '+mask' => (
25             isa => 'ArrayRef[HashRef[HashRef[Num]]]',
26             );
27              
28             # TODO: mask non-applicable methods from superclass? (Liskov principle)
29             # TODO: move this under PostPred instead of SeqMask?
30              
31             const my $AVERAGE => '<:AVERAGE:>';
32              
33              
34             sub ppred_profiles {
35 2     2 1 32 my $class = shift;
36 2         6 my $alis = shift;
37 2   100     20 my $args = shift // {}; # HashRef (should not be empty...)
38              
39 2         8 my $list = $args->{sim_list};
40              
41 2         18 my @sim_freq_at_for;
42              
43             my $regex;
44 2         0 my $width;
45 2         0 my $seq_inc;
46 2         0 my $avg_inc;
47              
48             # loop through Ali objects to build site profiles
49             # Note: profiles will be available both per seq and averaged over seqs
50 2         6 for my $ali ( @{$alis} ) {
  2         8  
51              
52             # extract seqs on which to compute freqs (defaults to all seqs)
53 100 100       425 my $sample = $list ? $list->filtered_ali($ali) : $ali;
54 100         3949 my @seqs = $sample->all_seqs;
55              
56             # setup mask details based on first Ali
57 100 100       378 unless ($regex) {
58 2         11 $regex = $ali->gapmiss_regex;
59 2         11 $width = $ali->width;
60 2         7 $seq_inc = 1.0 / @{$alis};
  2         9  
61 2         5 $avg_inc = $seq_inc / @seqs;
62             }
63              
64             # loop through simulated seqs to store and average ppred state freqs
65 100         286 for my $seq (@seqs) {
66 4000         18248 my $full_id = $seq->full_id;
67              
68             # store and average ppred state freq at each site for current seq
69             # Note: all missing/gap states are folded to '*'
70 4000         12992 for my $site (0..$width-1) {
71 740000         1591304 my $sim_state = uc $seq->state_at($site);
72 740000 50       2568080 $sim_state = '*' if $sim_state =~ m/$regex/xms;
73 740000         2327289 $sim_freq_at_for[$site]{$full_id}{$sim_state} += $seq_inc;
74 740000         1846268 $sim_freq_at_for[$site]{$AVERAGE}{$sim_state} += $avg_inc;
75             }
76             }
77             }
78              
79 2         84 return $class->new( mask => \@sim_freq_at_for );
80             }
81              
82              
83              
84             sub ppred_freqs {
85 3     3 1 36 my $self = shift;
86 3         12 my $ali = shift;
87 3   100     25 my $args = shift // {}; # HashRef (should not be empty...)
88              
89 3   100     21 my $by_seq = $args->{by_seq} // 0;
90 3         11 my $list = $args->{obs_list};
91              
92             # ppred_freqs_by_seq
93             # input: f_AA(site,seq)
94             # output: f_AAobs(site,seq)
95             # => mask f_AAobs_avg(site) over seq
96             # => seq sort f_AAobs_avg(seq) over sites
97              
98             # ppred_freqs
99             # input: f_AA_avg(site) over seq
100             # output: f_AAobs(site,seq)
101             # => mask f_AAobs_avg(site) over seq
102             # => seq sort f_AAobs_avg(seq) over sites
103              
104 3         54 tie my %obs_freq_for_at, 'Tie::IxHash';
105              
106             # extract seqs on which to compute freqs (defaults to all seqs)
107 3 100       85 my $sample = $list ? $list->filtered_ali($ali) : $ali;
108 3         155 my @seqs = $sample->all_seqs;
109              
110             # setup mask details
111 3         22 my $regex = $ali->gapmiss_regex;
112 3         22 my $width = $ali->width;
113              
114             # loop through real seqs to store observed state freqs
115             # Note: ppred state freqs are either by seq or averaged over seqs
116 3         16 for my $seq (@seqs) {
117 161         1530 my $full_id = $seq->full_id;
118 161 100       403 my $seq_key = $by_seq ? $full_id : $AVERAGE;
119              
120             # store observed state freq at each site for current seq
121             # Note: sites with missing/gap state get a max freq of 1.0
122 161         433 for my $site (0..$width-1) {
123 29785         235558 my $sim_freq_for = ${ $self->state_at($site) }{$seq_key};
  29785         1095631  
124 29785         73716 my $obs_state = uc $seq->state_at($site);
125             my $obs_freq = $obs_state =~ m/$regex/xms ? 1.0
126 29785 100 100     152396 : $sim_freq_for->{$obs_state} // 0.0;
127 29785         107972 $obs_freq_for_at{$full_id}[$site] = $obs_freq;
128             }
129             }
130              
131 3         203 return Freqs->new( freq_for_at => \%obs_freq_for_at );
132             }
133              
134              
135              
136             sub store {
137 1     1 1 4 my $self = shift;
138 1         2 my $outfile = shift;
139              
140 1         11 open my $out, '>', $outfile;
141              
142             # output header
143 1         4224 say {$out} join "\t", qw(site seq aa), 'f(i,j)';
  1         13  
144              
145             # setup loop
146 1         4 my @ids;
147 1         16 my @aas = split //xms, 'ACDEFGHIKLMNPQRSTVWY*'; # TODO: improve this
148 1         52 my $width = $self->mask_len;
149              
150             # loop through sites, ids and AAs to output ppred state freqs
151 1         7 for my $site (0..$width-1) {
152 185         17941 my $sim_freq_for = $self->state_at($site);
153 185 100 50     802 @ids = sort keys %{ $sim_freq_for // {} } unless @ids;
  1         69  
154             # get ids from first state
155             ID:
156 185         602 for my $id (@ids) {
157 14800 100       29668 next ID if $id eq $AVERAGE; # skip averaged ppred freqs
158 14615         21681 for my $aa (@aas) {
159 306915         1180163 say {$out} join "\t", $site + 1, $id, $aa,
160 306915   100     454803 $sim_freq_for->{$id}{$aa} // 0;
161             } # missing AAs are set to 0 (hence the alphabet)
162             }
163             }
164              
165 1         93 return;
166             }
167              
168             __PACKAGE__->meta->make_immutable;
169             1;
170              
171             __END__
172              
173             =pod
174              
175             =head1 NAME
176              
177             Bio::MUST::Core::SeqMask::Profiles - Evolutionary profiles for sequence sites
178              
179             =head1 VERSION
180              
181             version 0.212650
182              
183             =head1 SYNOPSIS
184              
185             # TODO
186              
187             =head1 DESCRIPTION
188              
189             # TODO
190              
191             =head1 METHODS
192              
193             =head2 ppred_profiles
194              
195             =head2 ppred_freqs
196              
197             =head2 store
198              
199             =head1 AUTHOR
200              
201             Denis BAURAIN <denis.baurain@uliege.be>
202              
203             =head1 COPYRIGHT AND LICENSE
204              
205             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
206              
207             This is free software; you can redistribute it and/or modify it under
208             the same terms as the Perl 5 programming language system itself.
209              
210             =cut