line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# module Bio::PopGen::HtSNP.pm |
2
|
|
|
|
|
|
|
# cared by Pedro M. Gomez-Fabre |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
Bio::PopGen::HtSNP.pm- Select htSNP from a haplotype set |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Bio::PopGen::HtSNP; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
my $obj = Bio::PopGen::HtSNP->new($hap,$snp,$pop); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Select the minimal set of SNP that contains the full information about |
19
|
|
|
|
|
|
|
the haplotype without redundancies. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
Take as input the followin values: |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=over 4 |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=item - the haplotype block (array of array). |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=item - the snp id (array). |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=item - family information and frequency (array of array). |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=back |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
The final haplotype is generated in a numerical format and the SNP's |
34
|
|
|
|
|
|
|
sets can be retrieve from the module. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
B |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
- If you force to include a family with indetermination, the SNP's |
40
|
|
|
|
|
|
|
with indetermination will be removed from the analysis, so consider |
41
|
|
|
|
|
|
|
before to place your data set what do you really want to do. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
- If two families have the same information (identical haplotype), one |
44
|
|
|
|
|
|
|
of them will be removed and the removed files will be stored classify |
45
|
|
|
|
|
|
|
as removed. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
- Only are accepted for calculation A, C, G, T and - (as deletion) and |
48
|
|
|
|
|
|
|
their combinations. Any other value as n or ? will be considered as |
49
|
|
|
|
|
|
|
degenerations due to lack of information. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head2 RATIONALE |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
On a haplotype set is expected that some of the SNP and their |
54
|
|
|
|
|
|
|
variations contribute in the same way to the haplotype. Eliminating |
55
|
|
|
|
|
|
|
redundancies will produce a minimal set of SNP's that can be used as |
56
|
|
|
|
|
|
|
input for a taging selection process. On the process SNP's with the |
57
|
|
|
|
|
|
|
same variation are clustered on the same group. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The idea is that because the tagging haplotype process is |
60
|
|
|
|
|
|
|
exponential. All redundant information we could eliminate on the |
61
|
|
|
|
|
|
|
tagging process will help to find a quick result. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head2 CONSTRUCTORS |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
my $obj = Bio::PopGen::HtSNP->new |
66
|
|
|
|
|
|
|
(-haplotype_block => \@haplotype_patterns, |
67
|
|
|
|
|
|
|
-snp_ids => \@snp_ids, |
68
|
|
|
|
|
|
|
-pattern_freq => \@pattern_name_and_freq); |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
where $hap, $snp and $pop are in the format: |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
my $hap = [ |
73
|
|
|
|
|
|
|
'acgt', |
74
|
|
|
|
|
|
|
'agtc', |
75
|
|
|
|
|
|
|
'cgtc' |
76
|
|
|
|
|
|
|
]; # haplotype patterns' id |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
my $snp = [qw/s1 s2 s3 s4/]; # snps' Id's |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
my $pop = [ |
81
|
|
|
|
|
|
|
[qw/ uno 0.20/], |
82
|
|
|
|
|
|
|
[qw/ dos 0.20/], |
83
|
|
|
|
|
|
|
[qw/ tres 0.15/], |
84
|
|
|
|
|
|
|
]; # haplotype_pattern_id Frequency |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head2 OBJECT METHODS |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
See Below for more detailed summaries. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 DETAILS |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 How the process is working with one example |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Let's begin with one general example of the code. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Input haplotype: |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
acgtcca-t |
100
|
|
|
|
|
|
|
cggtagtgc |
101
|
|
|
|
|
|
|
cccccgtgc |
102
|
|
|
|
|
|
|
cgctcgtgc |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
The first thing to to is to B into characters. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
a c g t c c a - t |
107
|
|
|
|
|
|
|
c g g t a g t g c |
108
|
|
|
|
|
|
|
c c c c c g t g c |
109
|
|
|
|
|
|
|
c g c t c g t g c |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Now we have to B the haplotype to B. This |
112
|
|
|
|
|
|
|
will produce the same SNP if we have input a or A. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
A C G T C C A - T |
115
|
|
|
|
|
|
|
C G G T A G T G C |
116
|
|
|
|
|
|
|
C C C C C G T G C |
117
|
|
|
|
|
|
|
C G C T C G T G C |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
The program admit as values any combination of ACTG and - (deletions). |
120
|
|
|
|
|
|
|
The haplotype is B, considering the first variation |
121
|
|
|
|
|
|
|
as zero and the alternate value as 1 (see expanded description below). |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
0 0 0 0 0 0 0 0 0 |
124
|
|
|
|
|
|
|
1 1 0 0 1 1 1 1 1 |
125
|
|
|
|
|
|
|
1 0 1 1 0 1 1 1 1 |
126
|
|
|
|
|
|
|
1 1 1 0 0 1 1 1 1 |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Once we have the haplotype converted to numbers we have to generate the |
129
|
|
|
|
|
|
|
snp type information for the haplotype. |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
B |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
where: |
135
|
|
|
|
|
|
|
SUM is the sum of the values for the SNP |
136
|
|
|
|
|
|
|
value is the SNP number code (0 [generally for the mayor allele], |
137
|
|
|
|
|
|
|
1 [for the minor allele]. |
138
|
|
|
|
|
|
|
position is the position on the block. |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
For this example the code is: |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
0 0 0 0 0 0 0 0 0 |
143
|
|
|
|
|
|
|
1 1 0 0 1 1 1 1 1 |
144
|
|
|
|
|
|
|
1 0 1 1 0 1 1 1 1 |
145
|
|
|
|
|
|
|
1 1 1 0 0 1 1 1 1 |
146
|
|
|
|
|
|
|
------------------------------------------------------------------ |
147
|
|
|
|
|
|
|
14 10 12 4 2 14 14 14 14 |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
14 = 0*2^0 + 1*2^1 + 1*2^2 + 1*2^3 |
150
|
|
|
|
|
|
|
12 = 0*2^0 + 1*2^1 + 0*2^2 + 1*2^3 |
151
|
|
|
|
|
|
|
.... |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Once we have the families classify. We will B just the SNP's B
|
154
|
|
|
|
|
|
|
redundant>. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
14 10 12 4 2 |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
This information will be B is you want to tag |
159
|
|
|
|
|
|
|
the htSNP. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Whatever it happens to one SNPs of a class will happen to a SNP of |
162
|
|
|
|
|
|
|
the same class. Therefore you don't need to scan redundancies |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head2 Working with fuzzy data. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
This module is designed to work with fuzzy data. As the source of the |
167
|
|
|
|
|
|
|
haplotype is diverse. The program assume that some haplotypes can be |
168
|
|
|
|
|
|
|
generated using different values. If there is any indetermination (? or n) |
169
|
|
|
|
|
|
|
or any other degenerated value or invalid. The program will take away |
170
|
|
|
|
|
|
|
This SNP and will leave that for a further analysis. |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
On a complex situation: |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
a c g t ? c a c t |
175
|
|
|
|
|
|
|
a c g t ? c a - t |
176
|
|
|
|
|
|
|
c g ? t a g ? g c |
177
|
|
|
|
|
|
|
c a c t c g t g c |
178
|
|
|
|
|
|
|
c g c t c g t g c |
179
|
|
|
|
|
|
|
c g g t a g ? g c |
180
|
|
|
|
|
|
|
a c ? t ? c a c t |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
On this haplotype everything is happening. We have a multialelic variance. |
183
|
|
|
|
|
|
|
We have indeterminations. We have deletions and we have even one SNP |
184
|
|
|
|
|
|
|
which is not a real SNP. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
The buiding process will be the same on this situation. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Convert the haplotype to uppercase. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
A C G T ? C A C T |
191
|
|
|
|
|
|
|
A C G T ? C A - T |
192
|
|
|
|
|
|
|
C G ? T A G ? G C |
193
|
|
|
|
|
|
|
C A C T C G T G C |
194
|
|
|
|
|
|
|
C G C T C G T G C |
195
|
|
|
|
|
|
|
C G G T A G ? G C |
196
|
|
|
|
|
|
|
A C ? T ? C A C T |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
All columns that present indeterminations will be removed from the analysis |
199
|
|
|
|
|
|
|
on this Step. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
hapotype after remove columns: |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
A C T C C T |
204
|
|
|
|
|
|
|
A C T C - T |
205
|
|
|
|
|
|
|
C G T G G C |
206
|
|
|
|
|
|
|
C A T G G C |
207
|
|
|
|
|
|
|
C G T G G C |
208
|
|
|
|
|
|
|
C G T G G C |
209
|
|
|
|
|
|
|
A C T C C T |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
All changes made on the haplotype matrix, will be also made on the SNP list. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
snp_id_1 snp_id_2 snp_id_4 snp_id_6 snp_id_8 snp_id_9 |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
now the SNP that is not one SNP will be removed from the analysis. |
216
|
|
|
|
|
|
|
SNP with Id snp_id_4 (the one with all T's). |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
because of the removing. Some of the families will become the same and will |
220
|
|
|
|
|
|
|
be clustered. A posteriori analysis will diference these families. |
221
|
|
|
|
|
|
|
but because of the indetermination can not be distinguish. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
A C C C T |
224
|
|
|
|
|
|
|
A C C - T |
225
|
|
|
|
|
|
|
C G G G C |
226
|
|
|
|
|
|
|
C A G G C |
227
|
|
|
|
|
|
|
C G G G C |
228
|
|
|
|
|
|
|
C G G G C |
229
|
|
|
|
|
|
|
A C C C T |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
The result of the mergering will go like: |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
A C C C T |
234
|
|
|
|
|
|
|
A C C - T |
235
|
|
|
|
|
|
|
C G G G C |
236
|
|
|
|
|
|
|
C A G G C |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Once again the changes made on the families and we merge the frequency (I
|
239
|
|
|
|
|
|
|
implemented>) |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Before to convert the haplotype into numbers we consider how many variations |
242
|
|
|
|
|
|
|
we have on the set. On this case the variations are 3. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
The control code will use on this situation base three as mutiplicity |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
0 0 0 0 0 |
247
|
|
|
|
|
|
|
0 0 0 1 0 |
248
|
|
|
|
|
|
|
1 1 1 2 1 |
249
|
|
|
|
|
|
|
1 2 1 2 1 |
250
|
|
|
|
|
|
|
----------------------------------- |
251
|
|
|
|
|
|
|
36 63 36 75 36 |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
And the minimal set for this combination is |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
0 0 0 |
256
|
|
|
|
|
|
|
0 0 1 |
257
|
|
|
|
|
|
|
1 1 2 |
258
|
|
|
|
|
|
|
1 2 2 |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
B this second example is a remote example an on normal conditions. This |
261
|
|
|
|
|
|
|
conditions makes no sense, but as the haplotypes, can come from many sources |
262
|
|
|
|
|
|
|
we have to be ready for all kind of combinations. |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head1 FEEDBACK |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head2 Mailing Lists |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
270
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
271
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
274
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head2 Support |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
I |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
283
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
284
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
285
|
|
|
|
|
|
|
with code and data examples if at all possible. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=head2 Reporting Bugs |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
290
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via |
291
|
|
|
|
|
|
|
the web: |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head1 AUTHOR - Pedro M. Gomez-Fabre |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Email pgf18872-at-gsk-dot-com |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=head1 APPENDIX |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
303
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# Let the code begin... |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
package Bio::PopGen::HtSNP; |
310
|
1
|
|
|
1
|
|
586
|
use Data::Dumper; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
44
|
|
311
|
1
|
|
|
1
|
|
656
|
use Storable qw(dclone); |
|
1
|
|
|
|
|
3562
|
|
|
1
|
|
|
|
|
79
|
|
312
|
|
|
|
|
|
|
|
313
|
1
|
|
|
1
|
|
7
|
use vars qw (); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
19
|
|
314
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
30
|
|
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
|
317
|
1
|
|
|
1
|
|
4
|
use base qw(Bio::Root::Root); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
497
|
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
my $USAGE = 'Usage: |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
Bio::PopGen::HtSNP->new(-haplotype_block -ids -pattern_freq) |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
'; |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head2 new |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
Title : new |
328
|
|
|
|
|
|
|
Function: constructor of the class. |
329
|
|
|
|
|
|
|
Usage : $obj-> Bio::PopGen::HtSNP->new(-haplotype_block |
330
|
|
|
|
|
|
|
-snp_ids |
331
|
|
|
|
|
|
|
-pattern_freq) |
332
|
|
|
|
|
|
|
Returns : self hash |
333
|
|
|
|
|
|
|
Args : input haplotype (array of array) |
334
|
|
|
|
|
|
|
snp_ids (array) |
335
|
|
|
|
|
|
|
pop_freq (array of array) |
336
|
|
|
|
|
|
|
Status : public |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=cut |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
sub new { |
341
|
1
|
|
|
1
|
1
|
175
|
my($class, @args) = @_; |
342
|
|
|
|
|
|
|
|
343
|
1
|
|
|
|
|
11
|
my $self = $class->SUPER::new(@args); |
344
|
1
|
|
|
|
|
8
|
my ($haplotype_block, |
345
|
|
|
|
|
|
|
$snp_ids, |
346
|
|
|
|
|
|
|
$pattern_freq ) = $self->_rearrange([qw(HAPLOTYPE_BLOCK |
347
|
|
|
|
|
|
|
SNP_IDS |
348
|
|
|
|
|
|
|
PATTERN_FREQ)],@args); |
349
|
|
|
|
|
|
|
|
350
|
1
|
50
|
|
|
|
7
|
if ($haplotype_block){ |
351
|
1
|
|
|
|
|
4
|
$self->haplotype_block($haplotype_block); |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
else{ |
354
|
0
|
|
|
|
|
0
|
$self->throw("Haplotype block has not been defined. |
355
|
|
|
|
|
|
|
\n$USAGE"); |
356
|
|
|
|
|
|
|
} |
357
|
1
|
50
|
|
|
|
3
|
if ($snp_ids){ |
358
|
1
|
|
|
|
|
4
|
$self->snp_ids($snp_ids); |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
else{ |
361
|
0
|
|
|
|
|
0
|
$self->throw("Array with ids has not been defined. |
362
|
|
|
|
|
|
|
\n$USAGE"); |
363
|
|
|
|
|
|
|
} |
364
|
1
|
50
|
|
|
|
3
|
if ($pattern_freq){ |
365
|
1
|
|
|
|
|
4
|
$self->pattern_freq($pattern_freq); |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
else{ |
368
|
0
|
|
|
|
|
0
|
$self->throw("Array with pattern id and frequency has not been defined. |
369
|
|
|
|
|
|
|
\n$USAGE"); |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
# if the input values are not well formed complained and exit. |
373
|
1
|
|
|
|
|
7
|
_check_input($self); |
374
|
|
|
|
|
|
|
|
375
|
1
|
|
|
|
|
4
|
_do_it($self); |
376
|
|
|
|
|
|
|
|
377
|
1
|
|
|
|
|
4
|
return $self; |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=head2 haplotype_block |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
Title : haplotype_block |
383
|
|
|
|
|
|
|
Usage : my $haplotype_block = $HtSNP->haplotype_block(); |
384
|
|
|
|
|
|
|
Function: Get the haplotype block for a haplotype tagging selection |
385
|
|
|
|
|
|
|
Returns : reference of array |
386
|
|
|
|
|
|
|
Args : reference of array with haplotype pattern |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=cut |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
sub haplotype_block{ |
392
|
3
|
|
|
3
|
1
|
4
|
my ($self) =shift; |
393
|
3
|
100
|
|
|
|
10
|
return $self->{'_haplotype_block'} = shift if @_; |
394
|
2
|
|
|
|
|
4
|
return $self->{'_haplotype_block'}; |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=head2 snp_ids |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
Title : snp_ids |
400
|
|
|
|
|
|
|
Usage : my $snp_ids = $HtSNP->$snp_ids(); |
401
|
|
|
|
|
|
|
Function: Get the ids for a haplotype tagging selection |
402
|
|
|
|
|
|
|
Returns : reference of array |
403
|
|
|
|
|
|
|
Args : reference of array with SNP ids |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=cut |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
sub snp_ids{ |
409
|
4
|
|
|
4
|
1
|
6
|
my ($self) =shift; |
410
|
4
|
100
|
|
|
|
11
|
return $self->{'_snp_ids'} = shift if @_; |
411
|
3
|
|
|
|
|
23
|
return $self->{'_snp_ids'}; |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=head2 pattern_freq |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
Title : pattern_freq |
418
|
|
|
|
|
|
|
Usage : my $pattern_freq = $HtSNP->pattern_freq(); |
419
|
|
|
|
|
|
|
Function: Get the pattern id and frequency for a haplotype |
420
|
|
|
|
|
|
|
tagging selection |
421
|
|
|
|
|
|
|
Returns : reference of array |
422
|
|
|
|
|
|
|
Args : reference of array with SNP ids |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=cut |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub pattern_freq{ |
427
|
3
|
|
|
3
|
1
|
4
|
my ($self) =shift; |
428
|
3
|
100
|
|
|
|
6
|
return $self->{'_pattern_freq'} = shift if @_; |
429
|
2
|
|
|
|
|
48
|
return $self->{'_pattern_freq'}; |
430
|
|
|
|
|
|
|
} |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=head2 _check_input |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Title : _check_input |
435
|
|
|
|
|
|
|
Usage : _check_input($self) |
436
|
|
|
|
|
|
|
Function: check for errors on the input |
437
|
|
|
|
|
|
|
Returns : self hash |
438
|
|
|
|
|
|
|
Args : self |
439
|
|
|
|
|
|
|
Status : internal |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=cut |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
#------------------------ |
444
|
|
|
|
|
|
|
sub _check_input{ |
445
|
|
|
|
|
|
|
#------------------------ |
446
|
|
|
|
|
|
|
|
447
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
448
|
|
|
|
|
|
|
|
449
|
1
|
|
|
|
|
3
|
_haplotype_length_error($self); |
450
|
1
|
|
|
|
|
3
|
_population_error($self); |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=head2 _haplotype_length_error |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
Title : _haplotype_length_error |
457
|
|
|
|
|
|
|
Usage : _haplotype_length_error($self) |
458
|
|
|
|
|
|
|
Function: check if the haplotype length is the same that the one on the |
459
|
|
|
|
|
|
|
SNP id list. If not break and exit |
460
|
|
|
|
|
|
|
Returns : self hash |
461
|
|
|
|
|
|
|
Args : self |
462
|
|
|
|
|
|
|
Status : internal |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
=cut |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
#------------------------ |
468
|
|
|
|
|
|
|
sub _haplotype_length_error{ |
469
|
|
|
|
|
|
|
#------------------------ |
470
|
|
|
|
|
|
|
|
471
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
472
|
|
|
|
|
|
|
|
473
|
1
|
|
|
|
|
2
|
my $input_block = $self->haplotype_block(); |
474
|
1
|
|
|
|
|
2
|
my $snp_ids = $self->snp_ids(); |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
############################# |
478
|
|
|
|
|
|
|
# define error list |
479
|
|
|
|
|
|
|
############################# |
480
|
1
|
|
|
|
|
2
|
my $different_haplotype_length = 0; |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
############################## |
483
|
|
|
|
|
|
|
# get parameters used to find |
484
|
|
|
|
|
|
|
# the errors |
485
|
|
|
|
|
|
|
############################## |
486
|
|
|
|
|
|
|
|
487
|
1
|
|
|
|
|
2
|
my $snp_number = scalar @$snp_ids; |
488
|
1
|
|
|
|
|
1
|
my $number_of_families = scalar @$input_block; |
489
|
1
|
|
|
|
|
3
|
my $h = 0; # haplotype position |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
############################ |
493
|
|
|
|
|
|
|
# haplotype length |
494
|
|
|
|
|
|
|
# |
495
|
|
|
|
|
|
|
# if the length differs from the number of ids |
496
|
|
|
|
|
|
|
############################ |
497
|
|
|
|
|
|
|
|
498
|
1
|
|
|
|
|
5
|
for ($h=0; $h<$#$input_block+1 ; $h++){ |
499
|
7
|
50
|
|
|
|
16
|
if (length $input_block->[$h] != $snp_number){ |
500
|
0
|
|
|
|
|
0
|
$different_haplotype_length = 1; |
501
|
0
|
|
|
|
|
0
|
last; |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# haploytypes does not have the same length |
506
|
1
|
50
|
|
|
|
3
|
if ($different_haplotype_length){ |
507
|
0
|
|
|
|
|
0
|
$self->throw("The number of snp ids is $snp_number and ". |
508
|
|
|
|
|
|
|
"the length of the family (". ($h+1) .") [". |
509
|
|
|
|
|
|
|
$input_block->[$h]."] is ". |
510
|
|
|
|
|
|
|
length $input_block->[$h], "\n"); |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=head2 _population_error |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
Title : _population_error |
518
|
|
|
|
|
|
|
Usage : _population_error($self) |
519
|
|
|
|
|
|
|
Function: use input_block and pop_freq test if the number of elements |
520
|
|
|
|
|
|
|
match. If doesn't break and quit. |
521
|
|
|
|
|
|
|
Returns : self hash |
522
|
|
|
|
|
|
|
Args : self |
523
|
|
|
|
|
|
|
Status : internal |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=cut |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
#------------------------ |
529
|
|
|
|
|
|
|
sub _population_error{ |
530
|
|
|
|
|
|
|
#------------------------ |
531
|
|
|
|
|
|
|
|
532
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
533
|
|
|
|
|
|
|
|
534
|
1
|
|
|
|
|
3
|
my $input_block = $self->haplotype_block(); |
535
|
1
|
|
|
|
|
2
|
my $pop_freq = $self->pattern_freq(); |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
############################# |
538
|
|
|
|
|
|
|
# define error list |
539
|
|
|
|
|
|
|
############################# |
540
|
1
|
|
|
|
|
2
|
my $pop_freq_elements_error = 0; # matrix bad formed |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
############################## |
543
|
|
|
|
|
|
|
# get parameters used to find |
544
|
|
|
|
|
|
|
# the errors |
545
|
|
|
|
|
|
|
############################## |
546
|
1
|
|
|
|
|
2
|
my $number_of_families = scalar @$input_block; |
547
|
|
|
|
|
|
|
|
548
|
1
|
|
|
|
|
1
|
my $pf = 0; # number of elements on population frequency |
549
|
1
|
|
|
|
|
1
|
my $frequency = 0; # population frequency |
550
|
1
|
|
|
|
|
1
|
my $p_f_length = 0; |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# check if the pop_freq array is well formed and if the number |
553
|
|
|
|
|
|
|
# of elements fit with the number of families |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
############################# |
556
|
|
|
|
|
|
|
# check population frequency |
557
|
|
|
|
|
|
|
# |
558
|
|
|
|
|
|
|
# - population frequency matrix need to be well formed |
559
|
|
|
|
|
|
|
# - get the frequency |
560
|
|
|
|
|
|
|
# - calculate number of families on pop_freq |
561
|
|
|
|
|
|
|
############################# |
562
|
|
|
|
|
|
|
|
563
|
1
|
|
|
|
|
5
|
for ($pf=0; $pf<$#$pop_freq+1; $pf++){ |
564
|
7
|
|
|
|
|
16
|
$frequency += $pop_freq->[$pf]->[1]; |
565
|
|
|
|
|
|
|
|
566
|
7
|
50
|
|
|
|
4
|
if ( scalar @{$pop_freq->[$pf]} !=2){ |
|
7
|
|
|
|
|
20
|
|
567
|
0
|
|
|
|
|
0
|
$p_f_length = scalar @{$pop_freq->[$pf]}; |
|
0
|
|
|
|
|
0
|
|
568
|
0
|
|
|
|
|
0
|
$pop_freq_elements_error = 1; |
569
|
0
|
|
|
|
|
0
|
last; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
########################### |
574
|
|
|
|
|
|
|
## error processing |
575
|
|
|
|
|
|
|
########################### |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# The frequency shouldn't be greater than 1 |
579
|
1
|
50
|
|
|
|
3
|
if ($frequency >1) { |
580
|
0
|
|
|
|
|
0
|
$self->warn("The frequency for this set is $frequency (greater than 1)\n"); |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
# the haplotype matix is not well formed |
584
|
1
|
50
|
|
|
|
7
|
if ($pop_freq_elements_error){ |
585
|
0
|
|
|
|
|
0
|
$self->throw("the frequency matrix is not well formed\n". |
586
|
|
|
|
|
|
|
"\nThe number of elements for pattern ".($pf+1)." is ". |
587
|
|
|
|
|
|
|
"$p_f_length\n". |
588
|
0
|
|
|
|
|
0
|
"It should be 2 for pattern \"@{$pop_freq->[$pf]}\"\n". |
589
|
|
|
|
|
|
|
"\nFormat should be:\n". |
590
|
|
|
|
|
|
|
"haplotype_id\t frequency\n" |
591
|
|
|
|
|
|
|
); |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
# the size does not fit on pop_freq array |
595
|
|
|
|
|
|
|
# with the one in haplotype (input_block) |
596
|
1
|
50
|
|
|
|
3
|
if ($pf != $number_of_families) { |
597
|
0
|
|
|
|
|
0
|
$self->throw("The number of patterns on frequency array ($pf)\n". |
598
|
|
|
|
|
|
|
"does not fit with the number of haplotype patterns on \n". |
599
|
|
|
|
|
|
|
"haplotype array ($number_of_families)\n"); |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
=head2 _do_it |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
Title : _do_it |
607
|
|
|
|
|
|
|
Usage : _do_it($self) |
608
|
|
|
|
|
|
|
Function: Process the input generating the results. |
609
|
|
|
|
|
|
|
Returns : self hash |
610
|
|
|
|
|
|
|
Args : self |
611
|
|
|
|
|
|
|
Status : internal |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=cut |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
#------------------------ |
616
|
|
|
|
|
|
|
sub _do_it{ |
617
|
|
|
|
|
|
|
#------------------------ |
618
|
|
|
|
|
|
|
|
619
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
# first we are goinf to define here all variables we are going to use |
622
|
1
|
|
|
|
|
2
|
$self -> {'w_hap'} = []; |
623
|
1
|
|
|
|
|
3
|
$self -> {'w_pop_freq'} = dclone ( $self ->pattern_freq() ); |
624
|
1
|
|
|
|
|
3
|
$self -> {'deg_pattern'} = {}; |
625
|
1
|
|
|
|
|
4
|
$self -> {'snp_type'} = {}; # type of snp on the set. see below |
626
|
1
|
|
|
|
|
2
|
$self -> {'alleles_number'} = 0; # number of variations (biallelic,...) |
627
|
1
|
|
|
|
|
3
|
$self -> {'snp_type_code'} = []; |
628
|
1
|
|
|
|
|
3
|
$self -> {'ht_type'} = []; # store the snp type used on the htSet |
629
|
1
|
|
|
|
|
3
|
$self -> {'split_hap'} = []; |
630
|
1
|
|
|
|
|
3
|
$self -> {'snp_and_code'} = []; |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
# we classify the SNP under snp_type |
634
|
1
|
|
|
|
|
4
|
$self->{snp_type}->{useful_snp} = dclone ( $self ->snp_ids() ); |
635
|
1
|
|
|
|
|
3
|
$self->{snp_type}->{deg_snp} = []; # deg snp |
636
|
1
|
|
|
|
|
2
|
$self->{snp_type}->{silent_snp} = []; # not a real snp |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
# split the haplotype |
639
|
1
|
|
|
|
|
3
|
_split_haplo ($self); |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# first we convert to upper case the haplotype |
642
|
|
|
|
|
|
|
# to make A the same as a for comparison |
643
|
1
|
|
|
|
|
5
|
_to_upper_case( $self -> {w_hap} ); |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
####################################################### |
646
|
|
|
|
|
|
|
# check if any SNP has indetermination. If any SNP has |
647
|
|
|
|
|
|
|
# indetermination this value will be removed. |
648
|
|
|
|
|
|
|
####################################################### |
649
|
1
|
|
|
|
|
3
|
_remove_deg ( $self ); |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
####################################################### |
652
|
|
|
|
|
|
|
# depending of the families you use some SNPs can be |
653
|
|
|
|
|
|
|
# silent. This silent SNP's are not used on the |
654
|
|
|
|
|
|
|
# creation of tags and has to be skipped from the |
655
|
|
|
|
|
|
|
# analysis. |
656
|
|
|
|
|
|
|
####################################################### |
657
|
1
|
|
|
|
|
3
|
_rem_silent_snp ( $self ); |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
####################################################### |
660
|
|
|
|
|
|
|
# for the remaining SNP's we have to check if two |
661
|
|
|
|
|
|
|
# families have the same value. If this is true, the families |
662
|
|
|
|
|
|
|
# will produce the same result and therefore we will not find |
663
|
|
|
|
|
|
|
# any pattern. So, the redundant families need to be take |
664
|
|
|
|
|
|
|
# away from the analysis. But also considered for a further |
665
|
|
|
|
|
|
|
# run. |
666
|
|
|
|
|
|
|
# |
667
|
|
|
|
|
|
|
# When we talk about a normal haplotype blocks this situation |
668
|
|
|
|
|
|
|
# makes no sense but if we remove one of the snp because the |
669
|
|
|
|
|
|
|
# degeneration two families can became the same. |
670
|
|
|
|
|
|
|
# these families may be analised on a second round |
671
|
|
|
|
|
|
|
####################################################### |
672
|
|
|
|
|
|
|
|
673
|
1
|
|
|
|
|
3
|
_find_deg_pattern ( $self ); |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
################################################################# |
676
|
|
|
|
|
|
|
# if the pattern list length is different to the lenght of the w_hap |
677
|
|
|
|
|
|
|
# we can tell that tow columns have been considered as the same one |
678
|
|
|
|
|
|
|
# and therefore we have to start to remove the values. |
679
|
|
|
|
|
|
|
# remove all columns with degeneration |
680
|
|
|
|
|
|
|
# |
681
|
|
|
|
|
|
|
# For this calculation we don't use the pattern frequency. |
682
|
|
|
|
|
|
|
# All patterns are the same, This selection makes |
683
|
|
|
|
|
|
|
# sense when you have different frequency. |
684
|
|
|
|
|
|
|
# |
685
|
|
|
|
|
|
|
# Note: on this version we don't classify the haplotype by frequency |
686
|
|
|
|
|
|
|
# but if you need to do it. This is the place to do it!!!! |
687
|
|
|
|
|
|
|
# |
688
|
|
|
|
|
|
|
# In reality you don't need to sort the values because you will remove |
689
|
|
|
|
|
|
|
# the values according to their values. |
690
|
|
|
|
|
|
|
# |
691
|
|
|
|
|
|
|
# But as comes from a hash, the order could be different and as a |
692
|
|
|
|
|
|
|
# consequence the code generate on every run of the same set could |
693
|
|
|
|
|
|
|
# differ. That is not important. In fact, does not matter but could |
694
|
|
|
|
|
|
|
# confuse people. |
695
|
|
|
|
|
|
|
################################################################# |
696
|
|
|
|
|
|
|
|
697
|
4
|
|
|
|
|
6
|
my @tmp =sort { $a <=> $b} |
698
|
1
|
|
|
|
|
2
|
keys %{$self -> {deg_pattern}}; # just count the families |
|
1
|
|
|
|
|
5
|
|
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# if the size of the list is different to the size of the degenerated |
701
|
|
|
|
|
|
|
# family. There is degeneration. And the redundancies will be |
702
|
|
|
|
|
|
|
# removed. |
703
|
1
|
50
|
|
|
|
2
|
if($#tmp != $#{$self -> { w_hap } } ){ |
|
1
|
|
|
|
|
4
|
|
704
|
1
|
|
|
|
|
4
|
_keep_these_patterns($self->{w_hap}, \@tmp); |
705
|
1
|
|
|
|
|
4
|
_keep_these_patterns($self->{w_pop_freq}, \@tmp); |
706
|
|
|
|
|
|
|
} |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
################################################################# |
709
|
|
|
|
|
|
|
# the steps made before about removing snp and cluster families |
710
|
|
|
|
|
|
|
# are just needed pre-process the haplotype before. |
711
|
|
|
|
|
|
|
# |
712
|
|
|
|
|
|
|
# Now is when the fun starts. |
713
|
|
|
|
|
|
|
# |
714
|
|
|
|
|
|
|
# |
715
|
|
|
|
|
|
|
# once we have the this minimal matrix, we have to calculate the |
716
|
|
|
|
|
|
|
# max multipliticy for the values. The max number of alleles found |
717
|
|
|
|
|
|
|
# on the set. A normal haplotype is biallelic but we can not |
718
|
|
|
|
|
|
|
# reject multiple variations. |
719
|
|
|
|
|
|
|
################################################################## |
720
|
|
|
|
|
|
|
|
721
|
1
|
|
|
|
|
4
|
_alleles_number ( $self ); |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
################################################################## |
724
|
|
|
|
|
|
|
# Now we have to convert the haplotype into number |
725
|
|
|
|
|
|
|
# |
726
|
|
|
|
|
|
|
# A C C - T |
727
|
|
|
|
|
|
|
# C A G G C |
728
|
|
|
|
|
|
|
# A C C C T |
729
|
|
|
|
|
|
|
# C G G G C |
730
|
|
|
|
|
|
|
# |
731
|
|
|
|
|
|
|
# one haplotype like this transformed into number produce this result |
732
|
|
|
|
|
|
|
# |
733
|
|
|
|
|
|
|
# 0 0 0 0 0 |
734
|
|
|
|
|
|
|
# 1 1 1 1 1 |
735
|
|
|
|
|
|
|
# 0 0 0 2 0 |
736
|
|
|
|
|
|
|
# 1 2 1 1 1 |
737
|
|
|
|
|
|
|
# |
738
|
|
|
|
|
|
|
################################################################## |
739
|
|
|
|
|
|
|
|
740
|
1
|
|
|
|
|
3
|
_convert_to_numbers( $self ); |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
################################################################### |
743
|
|
|
|
|
|
|
# The next step is to calculate the type of the SNP. |
744
|
|
|
|
|
|
|
# This process is made based on the position of the SNP, the value |
745
|
|
|
|
|
|
|
# and its multiplicity. |
746
|
|
|
|
|
|
|
################################################################### |
747
|
|
|
|
|
|
|
|
748
|
1
|
|
|
|
|
4
|
_snp_type_code( $self ); |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
################################################################### |
751
|
|
|
|
|
|
|
# now we have all information we need to calculate the haplotype |
752
|
|
|
|
|
|
|
# tagging SNP htSNP |
753
|
|
|
|
|
|
|
################################################################### |
754
|
|
|
|
|
|
|
|
755
|
1
|
|
|
|
|
2
|
_htSNP( $self ); |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
################################################################### |
758
|
|
|
|
|
|
|
# patch: |
759
|
|
|
|
|
|
|
# |
760
|
|
|
|
|
|
|
# all SNP have a code. but if the SNP is not used this code must |
761
|
|
|
|
|
|
|
# be zero in case of silent SNP. This looks not to informative |
762
|
|
|
|
|
|
|
# because all the information is already there. But this method |
763
|
|
|
|
|
|
|
# compile the full set. |
764
|
|
|
|
|
|
|
################################################################### |
765
|
|
|
|
|
|
|
|
766
|
1
|
|
|
|
|
3
|
_snp_and_code_summary( $self ); |
767
|
|
|
|
|
|
|
} |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=head2 input_block |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
Title : input_block |
772
|
|
|
|
|
|
|
Usage : $obj->input_block() |
773
|
|
|
|
|
|
|
Function: returns input block |
774
|
|
|
|
|
|
|
Returns : reference to array of array |
775
|
|
|
|
|
|
|
Args : none |
776
|
|
|
|
|
|
|
Status : public |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
=cut |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
#------------------------ |
781
|
|
|
|
|
|
|
sub input_block{ |
782
|
|
|
|
|
|
|
#------------------------ |
783
|
|
|
|
|
|
|
|
784
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
785
|
0
|
|
|
|
|
0
|
return $self -> {input_block}; |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
=head2 hap_length |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
Title : hap_length |
791
|
|
|
|
|
|
|
Usage : $obj->hap_length() |
792
|
|
|
|
|
|
|
Function: get numbers of SNP on the haplotype |
793
|
|
|
|
|
|
|
Returns : scalar |
794
|
|
|
|
|
|
|
Args : none |
795
|
|
|
|
|
|
|
Status : public |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
=cut |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
#------------------------ |
800
|
|
|
|
|
|
|
sub hap_length{ |
801
|
|
|
|
|
|
|
#------------------------ |
802
|
|
|
|
|
|
|
|
803
|
1
|
|
|
1
|
1
|
6
|
my $self = shift; |
804
|
1
|
|
|
|
|
1
|
return scalar @{$self -> {'_snp_ids'}}; |
|
1
|
|
|
|
|
5
|
|
805
|
|
|
|
|
|
|
} |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=head2 pop_freq |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
Title : pop_freq |
811
|
|
|
|
|
|
|
Usage : $obj->pop_freq() |
812
|
|
|
|
|
|
|
Function: returns population frequency |
813
|
|
|
|
|
|
|
Returns : reference to array |
814
|
|
|
|
|
|
|
Args : none |
815
|
|
|
|
|
|
|
Status : public |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
=cut |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
#------------------------ |
820
|
|
|
|
|
|
|
sub pop_freq{ |
821
|
|
|
|
|
|
|
#------------------------ |
822
|
|
|
|
|
|
|
|
823
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
824
|
|
|
|
|
|
|
return $self -> {pop_freq} |
825
|
0
|
|
|
|
|
0
|
} |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
=head2 deg_snp |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
Title : deg_snp |
832
|
|
|
|
|
|
|
Usage : $obj->deg_snp() |
833
|
|
|
|
|
|
|
Function: returns snp_removes due to indetermination on their values |
834
|
|
|
|
|
|
|
Returns : reference to array |
835
|
|
|
|
|
|
|
Args : none |
836
|
|
|
|
|
|
|
Status : public |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
=cut |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
#------------------------ |
841
|
|
|
|
|
|
|
sub deg_snp{ |
842
|
|
|
|
|
|
|
#------------------------ |
843
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
844
|
1
|
|
|
|
|
5
|
return $self -> {snp_type} ->{deg_snp}; |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
=head2 snp_type |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
Title : snp_type |
852
|
|
|
|
|
|
|
Usage : $obj->snp_type() |
853
|
|
|
|
|
|
|
Function: returns hash with SNP type |
854
|
|
|
|
|
|
|
Returns : reference to hash |
855
|
|
|
|
|
|
|
Args : none |
856
|
|
|
|
|
|
|
Status : public |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
=cut |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
#------------------------ |
861
|
|
|
|
|
|
|
sub snp_type{ |
862
|
|
|
|
|
|
|
#------------------------ |
863
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
864
|
0
|
|
|
|
|
0
|
return $self -> {snp_type}; |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
=head2 silent_snp |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
Title : silent_snp |
872
|
|
|
|
|
|
|
Usage : $obj->silent_snp() |
873
|
|
|
|
|
|
|
Function: some SNP's are silent (not contibuting to the haplotype) |
874
|
|
|
|
|
|
|
and are not considering for this analysis |
875
|
|
|
|
|
|
|
Returns : reference to a array |
876
|
|
|
|
|
|
|
Args : none |
877
|
|
|
|
|
|
|
Status : public |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
=cut |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
#------------------------ |
882
|
|
|
|
|
|
|
sub silent_snp{ |
883
|
|
|
|
|
|
|
#------------------------ |
884
|
1
|
|
|
1
|
1
|
1
|
my $self = shift; |
885
|
1
|
|
|
|
|
54
|
return $self -> {snp_type} ->{silent_snp}; |
886
|
|
|
|
|
|
|
} |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
=head2 useful_snp |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
Title : useful_snp |
893
|
|
|
|
|
|
|
Usage : $obj->useful_snp() |
894
|
|
|
|
|
|
|
Function: returns list of SNP's that are can be used as htSNP. Some |
895
|
|
|
|
|
|
|
of them can produce the same information. But this is |
896
|
|
|
|
|
|
|
not considered here. |
897
|
|
|
|
|
|
|
Returns : reference to a array |
898
|
|
|
|
|
|
|
Args : none |
899
|
|
|
|
|
|
|
Status : public |
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
=cut |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
#------------------------ |
904
|
|
|
|
|
|
|
sub useful_snp{ |
905
|
|
|
|
|
|
|
#------------------------ |
906
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
907
|
1
|
|
|
|
|
6
|
return $self -> {snp_type} ->{useful_snp}; |
908
|
|
|
|
|
|
|
} |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
=head2 ht_type |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
Title : ht_type |
915
|
|
|
|
|
|
|
Usage : $obj->ht_type() |
916
|
|
|
|
|
|
|
Function: every useful SNP has a numeric code dependending of its |
917
|
|
|
|
|
|
|
value and position. For a better description see |
918
|
|
|
|
|
|
|
description of the module. |
919
|
|
|
|
|
|
|
Returns : reference to a array |
920
|
|
|
|
|
|
|
Args : none |
921
|
|
|
|
|
|
|
Status : public |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
=cut |
924
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
#------------------------ |
926
|
|
|
|
|
|
|
sub ht_type{ |
927
|
|
|
|
|
|
|
#------------------------ |
928
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
929
|
1
|
|
|
|
|
5
|
return $self -> {ht_type}; |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
=head2 ht_set |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
Title : ht_set |
935
|
|
|
|
|
|
|
Usage : $obj->ht_set() |
936
|
|
|
|
|
|
|
Function: returns the minimal haplotype in numerical format. This |
937
|
|
|
|
|
|
|
haplotype contains the maximal information about the |
938
|
|
|
|
|
|
|
haplotype variations but with no redundancies. It's the |
939
|
|
|
|
|
|
|
minimal set that describes the haplotype. |
940
|
|
|
|
|
|
|
Returns : reference to an array of arrays |
941
|
|
|
|
|
|
|
Args : none |
942
|
|
|
|
|
|
|
Status : public |
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=cut |
945
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
#------------------------ |
947
|
|
|
|
|
|
|
sub ht_set{ |
948
|
|
|
|
|
|
|
#------------------------ |
949
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
950
|
0
|
|
|
|
|
0
|
return $self -> {w_hap}; |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=head2 snp_type_code |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
Title : snp_type_code |
957
|
|
|
|
|
|
|
Usage : $obj->snp_type_code() |
958
|
|
|
|
|
|
|
Function: returns the numeric code of the SNPs that need to be |
959
|
|
|
|
|
|
|
tagged that correspond to the SNP's considered in ht_set. |
960
|
|
|
|
|
|
|
Returns : reference to an array |
961
|
|
|
|
|
|
|
Args : none |
962
|
|
|
|
|
|
|
Status : public |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
=cut |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
#------------------------ |
967
|
|
|
|
|
|
|
sub snp_type_code{ |
968
|
|
|
|
|
|
|
#------------------------ |
969
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
970
|
1
|
|
|
|
|
5
|
return $self -> {snp_type_code}; |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=head2 snp_and_code |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
Title : snp_and_code |
977
|
|
|
|
|
|
|
Usage : $obj->snp_and_code() |
978
|
|
|
|
|
|
|
Function: Returns the full list of SNP's and the code associate to |
979
|
|
|
|
|
|
|
them. If the SNP belongs to the group useful_snp it keep |
980
|
|
|
|
|
|
|
this code. If the SNP is silent the code is 0. And if the |
981
|
|
|
|
|
|
|
SNP is degenerated the code is -1. |
982
|
|
|
|
|
|
|
Returns : reference to an array of array |
983
|
|
|
|
|
|
|
Args : none |
984
|
|
|
|
|
|
|
Status : public |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
=cut |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
#------------------------ |
989
|
|
|
|
|
|
|
sub snp_and_code{ |
990
|
|
|
|
|
|
|
#------------------------ |
991
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
992
|
0
|
|
|
|
|
0
|
return $self -> {'snp_and_code'}; |
993
|
|
|
|
|
|
|
} |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
=head2 deg_pattern |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
Title : deg_pattern |
999
|
|
|
|
|
|
|
Usage : $obj->deg_pattern() |
1000
|
|
|
|
|
|
|
Function: Returns the a list with the degenerated haplotype. |
1001
|
|
|
|
|
|
|
Sometimes due to degeneration some haplotypes looks |
1002
|
|
|
|
|
|
|
the same and if we don't remove them it won't find |
1003
|
|
|
|
|
|
|
any tag. |
1004
|
|
|
|
|
|
|
Returns : reference to a hash of array |
1005
|
|
|
|
|
|
|
Args : none |
1006
|
|
|
|
|
|
|
Status : public |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
=cut |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
#------------------------ |
1011
|
|
|
|
|
|
|
sub deg_pattern{ |
1012
|
|
|
|
|
|
|
#------------------------ |
1013
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
1014
|
|
|
|
|
|
|
|
1015
|
1
|
|
|
|
|
2
|
return $self -> {'deg_pattern'}; |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
=head2 split_hap |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
Title : split_hap |
1022
|
|
|
|
|
|
|
Usage : $obj->split_hap() |
1023
|
|
|
|
|
|
|
Function: simple representation of the haplotype base by base |
1024
|
|
|
|
|
|
|
Same information that input haplotype but base based. |
1025
|
|
|
|
|
|
|
Returns : reference to an array of array |
1026
|
|
|
|
|
|
|
Args : none |
1027
|
|
|
|
|
|
|
Status : public |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
=cut |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
#------------------------ |
1032
|
|
|
|
|
|
|
sub split_hap{ |
1033
|
|
|
|
|
|
|
#------------------------ |
1034
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1035
|
0
|
|
|
|
|
0
|
return $self -> {'split_hap'}; |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
=head2 _split_haplo |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
Title : _split_haplo |
1041
|
|
|
|
|
|
|
Usage : _split_haplo($self) |
1042
|
|
|
|
|
|
|
Function: Take a haplotype and split it into bases |
1043
|
|
|
|
|
|
|
Returns : self |
1044
|
|
|
|
|
|
|
Args : none |
1045
|
|
|
|
|
|
|
Status : internal |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
=cut |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
#------------------------ |
1050
|
|
|
|
|
|
|
sub _split_haplo { |
1051
|
|
|
|
|
|
|
#------------------------ |
1052
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1053
|
|
|
|
|
|
|
|
1054
|
1
|
|
|
|
|
2
|
my $in = $self ->{'_haplotype_block'}; |
1055
|
1
|
|
|
|
|
1
|
my $out = $self ->{'w_hap'}; |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
# split every haplotype and store the result into $out |
1058
|
1
|
|
|
|
|
3
|
foreach (@$in){ |
1059
|
7
|
|
|
|
|
28
|
push @$out, [split (//,$_)]; |
1060
|
|
|
|
|
|
|
} |
1061
|
|
|
|
|
|
|
|
1062
|
1
|
|
|
|
|
30
|
$self -> {'split_hap'} = dclone ($out); |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
# internal method to convert the haplotype to uppercase |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=head2 _to_upper_case |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
Title : _to_upper_case |
1072
|
|
|
|
|
|
|
Usage : _to_upper_case() |
1073
|
|
|
|
|
|
|
Function: make SNP or in-dels Upper case |
1074
|
|
|
|
|
|
|
Returns : self |
1075
|
|
|
|
|
|
|
Args : an AoA ref |
1076
|
|
|
|
|
|
|
Status : private |
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
=cut |
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
#------------------------ |
1081
|
|
|
|
|
|
|
sub _to_upper_case { |
1082
|
|
|
|
|
|
|
#------------------------ |
1083
|
1
|
|
|
1
|
|
2
|
my ($arr) =@_; |
1084
|
|
|
|
|
|
|
|
1085
|
1
|
|
|
|
|
3
|
foreach my $aref (@$arr){ |
1086
|
7
|
|
|
|
|
5
|
foreach my $value (@{$aref} ){ |
|
7
|
|
|
|
|
8
|
|
1087
|
63
|
|
|
|
|
44
|
$value = uc $value; |
1088
|
|
|
|
|
|
|
} |
1089
|
|
|
|
|
|
|
} |
1090
|
|
|
|
|
|
|
} |
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
=head2 _remove_deg |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
Title : _remove_deg |
1097
|
|
|
|
|
|
|
Usage : _remove_deg() |
1098
|
|
|
|
|
|
|
Function: when have a indetermination or strange value this SNP |
1099
|
|
|
|
|
|
|
is removed |
1100
|
|
|
|
|
|
|
Returns : haplotype family set and degeneration list |
1101
|
|
|
|
|
|
|
Args : ref to an AoA and a ref to an array |
1102
|
|
|
|
|
|
|
Status : internal |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
=cut |
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
#------------------------ |
1107
|
|
|
|
|
|
|
sub _remove_deg { |
1108
|
|
|
|
|
|
|
#------------------------ |
1109
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1110
|
|
|
|
|
|
|
|
1111
|
1
|
|
|
|
|
2
|
my $hap = $self->{w_hap}; |
1112
|
1
|
|
|
|
|
1
|
my $snp = $self->{snp_type}->{useful_snp}; |
1113
|
1
|
|
|
|
|
2
|
my $deg_snp = $self->{snp_type}->{deg_snp}; |
1114
|
|
|
|
|
|
|
|
1115
|
1
|
|
|
|
|
1
|
my $rem = []; # take the position of the array to be removed |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
# first we work on the columns we have void values |
1118
|
1
|
|
|
|
|
4
|
$rem = _find_indet($hap,$rem); # find degenerated columns |
1119
|
|
|
|
|
|
|
|
1120
|
1
|
50
|
|
|
|
3
|
if (@$rem){ |
1121
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
# remove column on haplotype |
1123
|
1
|
|
|
|
|
3
|
_remove_col($hap,$rem); # remove list |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
# now remove the values from SNP id |
1126
|
1
|
|
|
|
|
3
|
_remove_snp_id($snp,$deg_snp,$rem); # remove list |
1127
|
|
|
|
|
|
|
} |
1128
|
|
|
|
|
|
|
} |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
=head2 _rem_silent_snp |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
Title : _rem_silent_snp |
1135
|
|
|
|
|
|
|
Usage : _rem_silent_snp() |
1136
|
|
|
|
|
|
|
Function: there is the remote possibilty that one SNP won't be a |
1137
|
|
|
|
|
|
|
real SNP on this situation we have to remove this SNP, |
1138
|
|
|
|
|
|
|
otherwise the program won't find any tag |
1139
|
|
|
|
|
|
|
Returns : nonthing |
1140
|
|
|
|
|
|
|
Args : ref to an AoA and a ref to an array |
1141
|
|
|
|
|
|
|
Status : internal |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
=cut |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
#------------------------ |
1146
|
|
|
|
|
|
|
sub _rem_silent_snp { |
1147
|
|
|
|
|
|
|
#------------------------ |
1148
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
1149
|
|
|
|
|
|
|
|
1150
|
1
|
|
|
|
|
4
|
my $hap = $self->{w_hap}; |
1151
|
1
|
|
|
|
|
2
|
my $snp = $self->{snp_type}->{useful_snp}; |
1152
|
1
|
|
|
|
|
2
|
my $silent_snp = $self->{snp_type}->{silent_snp}; |
1153
|
|
|
|
|
|
|
|
1154
|
1
|
|
|
|
|
1
|
my $rem = []; # store the positions to be removed |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
#find columns with no variation on the SNP, Real snp? |
1157
|
1
|
|
|
|
|
2
|
$rem = _find_silent_snps($hap); |
1158
|
|
|
|
|
|
|
|
1159
|
1
|
50
|
|
|
|
3
|
if (@$rem){ |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
# remove column on haplotype |
1162
|
1
|
|
|
|
|
2
|
_remove_col($hap,$rem); |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
# remove the values from SNP id |
1165
|
1
|
|
|
|
|
2
|
_remove_snp_id($snp,$silent_snp,$rem); |
1166
|
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
} |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
=head2 _find_silent_snps |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
Title : _find_silent_snps |
1174
|
|
|
|
|
|
|
Usage : |
1175
|
|
|
|
|
|
|
Function: list of snps that are not SNPs. All values for that |
1176
|
|
|
|
|
|
|
SNPs on the set is the same one. Look stupid but can |
1177
|
|
|
|
|
|
|
happend and if this happend you will not find any tag |
1178
|
|
|
|
|
|
|
Returns : nothing |
1179
|
|
|
|
|
|
|
Args : |
1180
|
|
|
|
|
|
|
Status : |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
=cut |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
#------------------------ |
1185
|
|
|
|
|
|
|
sub _find_silent_snps{ |
1186
|
|
|
|
|
|
|
#------------------------ |
1187
|
1
|
|
|
1
|
|
2
|
my ($arr)=@_; |
1188
|
|
|
|
|
|
|
|
1189
|
1
|
|
|
|
|
1
|
my $list =[]; # no snp list; |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
# determine the number of snp by the length of the first row. |
1192
|
|
|
|
|
|
|
# we assume that the matrix is squared. |
1193
|
1
|
|
|
|
|
1
|
my $colsn= @{$arr->[0]}; |
|
1
|
|
|
|
|
2
|
|
1194
|
|
|
|
|
|
|
|
1195
|
1
|
|
|
|
|
3
|
for (my $i=0;$i<$colsn;$i++){ |
1196
|
6
|
|
|
|
|
5
|
my $different =0; # check degeneration |
1197
|
|
|
|
|
|
|
|
1198
|
6
|
|
|
|
|
8
|
for my $r (1..$#$arr){ |
1199
|
15
|
100
|
|
|
|
22
|
if($arr->[0][$i] ne $arr->[$r][$i]){ |
1200
|
5
|
|
|
|
|
4
|
$different =1; |
1201
|
5
|
|
|
|
|
6
|
last; |
1202
|
|
|
|
|
|
|
} |
1203
|
|
|
|
|
|
|
} |
1204
|
|
|
|
|
|
|
|
1205
|
6
|
100
|
|
|
|
15
|
if(!$different){ |
1206
|
1
|
|
|
|
|
4
|
push (@$list, $i); |
1207
|
|
|
|
|
|
|
} |
1208
|
|
|
|
|
|
|
} |
1209
|
|
|
|
|
|
|
|
1210
|
1
|
|
|
|
|
4
|
return $list; |
1211
|
|
|
|
|
|
|
} |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
=head2 _find_indet |
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
Title : _find_indet |
1218
|
|
|
|
|
|
|
Usage : |
1219
|
|
|
|
|
|
|
Function: find column (SNP) with invalid or degenerated values |
1220
|
|
|
|
|
|
|
and store this values into the second parameter supplied. |
1221
|
|
|
|
|
|
|
Returns : nothing |
1222
|
|
|
|
|
|
|
Args : ref to AoA and ref to an array |
1223
|
|
|
|
|
|
|
Status : internal |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
=cut |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
#------------------------ |
1228
|
|
|
|
|
|
|
sub _find_indet{ |
1229
|
|
|
|
|
|
|
#------------------------ |
1230
|
1
|
|
|
1
|
|
1
|
my ($arr, $list)=@_; |
1231
|
|
|
|
|
|
|
|
1232
|
1
|
|
|
|
|
4
|
foreach my $i(0..$#$arr){ |
1233
|
7
|
|
|
|
|
5
|
foreach my $j(0..$#{$arr->[$i]}){ |
|
7
|
|
|
|
|
14
|
|
1234
|
63
|
100
|
|
|
|
126
|
unless ($arr->[$i][$j] =~ /[ACTG-]/){ |
1235
|
7
|
100
|
|
|
|
12
|
if ($#$list<0){ |
1236
|
1
|
|
|
|
|
2
|
push(@$list,$j); |
1237
|
|
|
|
|
|
|
} |
1238
|
|
|
|
|
|
|
else{ |
1239
|
6
|
|
|
|
|
6
|
my $found =0; # check if already exist the value |
1240
|
6
|
|
|
|
|
10
|
foreach my $k(0..$#$list){ |
1241
|
10
|
100
|
|
|
|
16
|
$found =1 if ($list->[$k] eq $j); |
1242
|
10
|
100
|
|
|
|
17
|
last if ($found); |
1243
|
|
|
|
|
|
|
} |
1244
|
6
|
100
|
|
|
|
13
|
if(!$found){ |
1245
|
2
|
|
|
|
|
4
|
push(@$list,$j); |
1246
|
|
|
|
|
|
|
} |
1247
|
|
|
|
|
|
|
} |
1248
|
|
|
|
|
|
|
} |
1249
|
|
|
|
|
|
|
} |
1250
|
|
|
|
|
|
|
} |
1251
|
|
|
|
|
|
|
|
1252
|
1
|
|
|
|
|
8
|
@$list = sort { $a <=> $b} @$list; |
|
3
|
|
|
|
|
10
|
|
1253
|
|
|
|
|
|
|
|
1254
|
1
|
|
|
|
|
2
|
return $list; |
1255
|
|
|
|
|
|
|
} |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
=head2 _remove_col |
1258
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
Title : _remove_col |
1260
|
|
|
|
|
|
|
Usage : |
1261
|
|
|
|
|
|
|
Function: remove columns contained on the second array from |
1262
|
|
|
|
|
|
|
the first arr |
1263
|
|
|
|
|
|
|
Returns : nothing |
1264
|
|
|
|
|
|
|
Args : array of array reference and array reference |
1265
|
|
|
|
|
|
|
Status : internal |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
=cut |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
#------------------------ |
1270
|
|
|
|
|
|
|
sub _remove_col{ |
1271
|
|
|
|
|
|
|
#------------------------ |
1272
|
2
|
|
|
2
|
|
3
|
my ($arr,$rem)=@_; |
1273
|
|
|
|
|
|
|
|
1274
|
2
|
|
|
|
|
3
|
foreach my $col (reverse @$rem){ |
1275
|
4
|
|
|
|
|
17
|
splice @$_, $col, 1 for @$arr; |
1276
|
|
|
|
|
|
|
} |
1277
|
|
|
|
|
|
|
} |
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=head2 _remove_snp_id |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
Title : _remove_snp_id |
1283
|
|
|
|
|
|
|
Usage : |
1284
|
|
|
|
|
|
|
Function: remove columns contained on the second array from |
1285
|
|
|
|
|
|
|
the first arr |
1286
|
|
|
|
|
|
|
Returns : nothing |
1287
|
|
|
|
|
|
|
Args : array of array reference and array reference |
1288
|
|
|
|
|
|
|
Status : internal |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
=cut |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
#------------------------ |
1293
|
|
|
|
|
|
|
sub _remove_snp_id{ |
1294
|
|
|
|
|
|
|
#------------------------ |
1295
|
2
|
|
|
2
|
|
2
|
my ($arr,$removed,$rem_list)=@_; |
1296
|
|
|
|
|
|
|
|
1297
|
2
|
|
|
|
|
7
|
push @$removed, splice @$arr, $_, 1 foreach reverse @$rem_list; |
1298
|
|
|
|
|
|
|
} |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
=head2 _find_deg_pattern |
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
Title : _find_deg_pattern |
1304
|
|
|
|
|
|
|
Usage : |
1305
|
|
|
|
|
|
|
Function: create a list with the degenerated patterns |
1306
|
|
|
|
|
|
|
Returns : @array |
1307
|
|
|
|
|
|
|
Args : a ref to AoA |
1308
|
|
|
|
|
|
|
Status : public |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
=cut |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
#------------------------ |
1313
|
|
|
|
|
|
|
sub _find_deg_pattern{ |
1314
|
|
|
|
|
|
|
#------------------------ |
1315
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1316
|
|
|
|
|
|
|
|
1317
|
1
|
|
|
|
|
2
|
my $arr = $self ->{w_hap}; # the working haplotype |
1318
|
1
|
|
|
|
|
1
|
my $list = $self ->{'deg_pattern'}; # degenerated patterns |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
# we have to check all elements |
1321
|
1
|
|
|
|
|
3
|
foreach my $i(0..$#$arr){ |
1322
|
|
|
|
|
|
|
# is the element has not been used create a key |
1323
|
7
|
100
|
|
|
|
11
|
unless ( _is_on_hash ($list,\$i) ) { |
1324
|
4
|
|
|
|
|
10
|
$list->{$i}=[$i]; |
1325
|
|
|
|
|
|
|
}; |
1326
|
|
|
|
|
|
|
|
1327
|
7
|
|
|
|
|
15
|
foreach my $j($i+1..$#$arr){ |
1328
|
18
|
|
|
|
|
26
|
my $comp = compare_arrays($arr->[$i],$arr->[$j]); |
1329
|
|
|
|
|
|
|
|
1330
|
18
|
100
|
|
|
|
30
|
if($comp){ |
1331
|
|
|
|
|
|
|
# as we have no elements we push this into the list |
1332
|
|
|
|
|
|
|
# check for the first element |
1333
|
3
|
|
|
|
|
6
|
my $key = _key_for_value($list,\$i); |
1334
|
|
|
|
|
|
|
|
1335
|
3
|
|
|
|
|
3
|
push (@{$list->{$key}},$j); |
|
3
|
|
|
|
|
5
|
|
1336
|
|
|
|
|
|
|
|
1337
|
3
|
|
|
|
|
4
|
last; |
1338
|
|
|
|
|
|
|
} |
1339
|
|
|
|
|
|
|
} |
1340
|
|
|
|
|
|
|
} |
1341
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
} |
1343
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
#------------------------ |
1345
|
|
|
|
|
|
|
sub _key_for_value{ |
1346
|
|
|
|
|
|
|
#------------------------ |
1347
|
3
|
|
|
3
|
|
3
|
my($hash,$value)=@_; |
1348
|
|
|
|
|
|
|
|
1349
|
3
|
|
|
|
|
6
|
foreach my $key (keys %$hash){ |
1350
|
5
|
100
|
|
|
|
4
|
if( _is_there(\@{$hash->{$key}},$value)){ |
|
5
|
|
|
|
|
10
|
|
1351
|
3
|
|
|
|
|
4
|
return $key; |
1352
|
|
|
|
|
|
|
} |
1353
|
|
|
|
|
|
|
} |
1354
|
|
|
|
|
|
|
} |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
#------------------------ |
1357
|
|
|
|
|
|
|
sub _is_on_hash{ |
1358
|
|
|
|
|
|
|
#------------------------ |
1359
|
7
|
|
|
7
|
|
8
|
my($hash,$value)=@_; |
1360
|
|
|
|
|
|
|
|
1361
|
7
|
|
|
|
|
13
|
foreach my $key (keys %$hash){ |
1362
|
11
|
100
|
|
|
|
6
|
if( _is_there(\@{$hash->{$key}},$value)){ |
|
11
|
|
|
|
|
16
|
|
1363
|
3
|
|
|
|
|
6
|
return 1; |
1364
|
|
|
|
|
|
|
} |
1365
|
|
|
|
|
|
|
} |
1366
|
|
|
|
|
|
|
} |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
#------------------------ |
1369
|
|
|
|
|
|
|
sub _is_there{ |
1370
|
|
|
|
|
|
|
#------------------------ |
1371
|
|
|
|
|
|
|
|
1372
|
41
|
|
|
41
|
|
26
|
my($arr,$value)=@_; |
1373
|
|
|
|
|
|
|
|
1374
|
41
|
|
|
|
|
52
|
foreach my $el (@$arr){ |
1375
|
55
|
100
|
|
|
|
104
|
if ($el eq $$value){ |
1376
|
16
|
|
|
|
|
29
|
return 1; |
1377
|
|
|
|
|
|
|
} |
1378
|
|
|
|
|
|
|
} |
1379
|
|
|
|
|
|
|
} |
1380
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
=head2 _keep_these_patterns |
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
Title : _keep_these_patterns |
1386
|
|
|
|
|
|
|
Usage : |
1387
|
|
|
|
|
|
|
Function: this is a basic approach, take a LoL and a list, |
1388
|
|
|
|
|
|
|
keep just the columns included on the list |
1389
|
|
|
|
|
|
|
Returns : nothing |
1390
|
|
|
|
|
|
|
Args : an AoA and an array |
1391
|
|
|
|
|
|
|
Status : public |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
=cut |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
#------------------------ |
1396
|
|
|
|
|
|
|
sub _keep_these_patterns{ |
1397
|
|
|
|
|
|
|
#------------------------ |
1398
|
2
|
|
|
2
|
|
3
|
my ($arr,$list)=@_; |
1399
|
|
|
|
|
|
|
|
1400
|
|
|
|
|
|
|
# by now we just take one of the repetitions but you can weight |
1401
|
|
|
|
|
|
|
# the values by frequency |
1402
|
|
|
|
|
|
|
|
1403
|
2
|
|
|
|
|
4
|
my @outValues=(); |
1404
|
|
|
|
|
|
|
|
1405
|
2
|
|
|
|
|
3
|
foreach my $k (@$list){ |
1406
|
8
|
|
|
|
|
8
|
push @outValues, $arr->[$k]; |
1407
|
|
|
|
|
|
|
} |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
#make arr to hold the new values |
1410
|
2
|
|
|
|
|
2
|
@$arr= @{dclone(\@outValues)}; |
|
2
|
|
|
|
|
45
|
|
1411
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
} |
1413
|
|
|
|
|
|
|
|
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
=head2 compare_arrays |
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
|
1418
|
|
|
|
|
|
|
Title : compare_arrays |
1419
|
|
|
|
|
|
|
Usage : |
1420
|
|
|
|
|
|
|
Function: take two arrays and compare their values |
1421
|
|
|
|
|
|
|
Returns : 1 if the two values are the same |
1422
|
|
|
|
|
|
|
0 if the values are different |
1423
|
|
|
|
|
|
|
Args : an AoA and an array |
1424
|
|
|
|
|
|
|
Status : public |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
=cut |
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
#------------------------ |
1429
|
|
|
|
|
|
|
sub compare_arrays { |
1430
|
|
|
|
|
|
|
#------------------------ |
1431
|
18
|
|
|
18
|
1
|
17
|
my ($first, $second) = @_; |
1432
|
18
|
50
|
|
|
|
26
|
return 0 unless @$first == @$second; |
1433
|
18
|
|
|
|
|
31
|
for (my $i = 0; $i < @$first; $i++) { |
1434
|
39
|
100
|
|
|
|
82
|
return 0 if $first->[$i] ne $second->[$i]; |
1435
|
|
|
|
|
|
|
} |
1436
|
3
|
|
|
|
|
5
|
return 1; |
1437
|
|
|
|
|
|
|
} |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
=head2 _convert_to_numbers |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
|
1443
|
|
|
|
|
|
|
Title : _convert_to_numbers |
1444
|
|
|
|
|
|
|
Usage : _convert_to_numbers() |
1445
|
|
|
|
|
|
|
Function: tranform the haplotype into numbers. before to do that |
1446
|
|
|
|
|
|
|
we have to consider the variation on the set. |
1447
|
|
|
|
|
|
|
Returns : nonthing |
1448
|
|
|
|
|
|
|
Args : ref to an AoA and a ref to an array |
1449
|
|
|
|
|
|
|
Status : internal |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
=cut |
1452
|
|
|
|
|
|
|
|
1453
|
|
|
|
|
|
|
#------------------------ |
1454
|
|
|
|
|
|
|
sub _convert_to_numbers{ |
1455
|
|
|
|
|
|
|
#------------------------ |
1456
|
1
|
|
|
1
|
|
3
|
my $self = shift; |
1457
|
|
|
|
|
|
|
|
1458
|
1
|
|
|
|
|
1
|
my $hap_ref = $self->{w_hap}; |
1459
|
1
|
|
|
|
|
1
|
my $mm = $self->{alleles_number}; |
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
# the first element is considered as zero. The first modification |
1462
|
|
|
|
|
|
|
# is consider as one and so on. |
1463
|
|
|
|
|
|
|
|
1464
|
1
|
|
|
|
|
1
|
my $length = @{ @$hap_ref[0]}; #length of the haplotype |
|
1
|
|
|
|
|
3
|
|
1465
|
|
|
|
|
|
|
|
1466
|
1
|
|
|
|
|
3
|
for (my $c = 0; $c<$length;$c++){ |
1467
|
|
|
|
|
|
|
|
1468
|
5
|
|
|
|
|
6
|
my @al=(); |
1469
|
|
|
|
|
|
|
|
1470
|
5
|
|
|
|
|
9
|
for my $r (0..$#$hap_ref){ |
1471
|
|
|
|
|
|
|
|
1472
|
20
|
100
|
|
|
|
25
|
push @al,$hap_ref->[$r][$c] |
1473
|
|
|
|
|
|
|
unless _is_there(\@al,\$hap_ref->[$r][$c]); |
1474
|
|
|
|
|
|
|
|
1475
|
20
|
|
|
|
|
29
|
$hap_ref->[$r][$c] = get_position(\@al,\$hap_ref->[$r][$c]); |
1476
|
|
|
|
|
|
|
} |
1477
|
|
|
|
|
|
|
} |
1478
|
|
|
|
|
|
|
} |
1479
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
=head2 _snp_type_code |
1482
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
Title : _snp_type_code |
1485
|
|
|
|
|
|
|
Usage : |
1486
|
|
|
|
|
|
|
Function: |
1487
|
|
|
|
|
|
|
we have to create the snp type code for each version. |
1488
|
|
|
|
|
|
|
The way the snp type is created is the following: |
1489
|
|
|
|
|
|
|
|
1490
|
|
|
|
|
|
|
we take the number value for every SNP and do the |
1491
|
|
|
|
|
|
|
following calculation |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
let be a SNP set as follow: |
1494
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
0 0 |
1496
|
|
|
|
|
|
|
1 1 |
1497
|
|
|
|
|
|
|
1 2 |
1498
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
and multiplicity 3 |
1500
|
|
|
|
|
|
|
on this case the situation is: |
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
sum (value * multiplicity ^ position) for each SNP |
1503
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
0 * 3 ^ 0 + 1 * 3 ^ 1 + 1 * 3 ^ 2 = 12 |
1505
|
|
|
|
|
|
|
0 * 3 ^ 0 + 1 * 3 ^ 1 + 2 * 3 ^ 2 = 21 |
1506
|
|
|
|
|
|
|
Returns : nothing |
1507
|
|
|
|
|
|
|
Args : $self |
1508
|
|
|
|
|
|
|
Status : private |
1509
|
|
|
|
|
|
|
|
1510
|
|
|
|
|
|
|
=cut |
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
#------------------------ |
1513
|
|
|
|
|
|
|
sub _snp_type_code{ |
1514
|
|
|
|
|
|
|
#------------------------ |
1515
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1516
|
|
|
|
|
|
|
|
1517
|
1
|
|
|
|
|
2
|
my $hap = $self->{w_hap}; |
1518
|
1
|
|
|
|
|
2
|
my $arr = $self->{snp_type_code}; |
1519
|
1
|
|
|
|
|
1
|
my $al = $self->{alleles_number}; |
1520
|
|
|
|
|
|
|
|
1521
|
1
|
|
|
|
|
1
|
my $length = @{ $hap->[0]}; #length of the haplotype |
|
1
|
|
|
|
|
1
|
|
1522
|
|
|
|
|
|
|
|
1523
|
1
|
|
|
|
|
4
|
for (my $c=0; $c<$length; $c++){ |
1524
|
5
|
|
|
|
|
6
|
for my $r (0..$#$hap){ |
1525
|
20
|
|
|
|
|
23
|
$arr->[$c] += $hap->[$r][$c] * $al ** $r; |
1526
|
|
|
|
|
|
|
} |
1527
|
|
|
|
|
|
|
} |
1528
|
|
|
|
|
|
|
} |
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
################################################# |
1531
|
|
|
|
|
|
|
# return the position of an element in one array |
1532
|
|
|
|
|
|
|
# The element is always present on the array |
1533
|
|
|
|
|
|
|
################################################# |
1534
|
|
|
|
|
|
|
|
1535
|
|
|
|
|
|
|
#------------------------ |
1536
|
|
|
|
|
|
|
sub get_position{ |
1537
|
|
|
|
|
|
|
#------------------------ |
1538
|
|
|
|
|
|
|
|
1539
|
20
|
|
|
20
|
0
|
19
|
my($array, $value)=@_; |
1540
|
|
|
|
|
|
|
|
1541
|
20
|
|
|
|
|
24
|
for my $i(0..$#$array) { |
1542
|
34
|
100
|
|
|
|
45
|
if ($array->[$i] eq $$value){ |
1543
|
20
|
|
|
|
|
38
|
return $i; |
1544
|
|
|
|
|
|
|
} |
1545
|
|
|
|
|
|
|
} |
1546
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
} |
1548
|
|
|
|
|
|
|
|
1549
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
=head2 _alleles_number |
1551
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
|
1553
|
|
|
|
|
|
|
Title : _alleles_number |
1554
|
|
|
|
|
|
|
Usage : |
1555
|
|
|
|
|
|
|
Function: calculate the max number of alleles for a haplotype and |
1556
|
|
|
|
|
|
|
if the number. For each SNP the number is stored and the |
1557
|
|
|
|
|
|
|
max number of alleles for a SNP on the set is returned |
1558
|
|
|
|
|
|
|
Returns : max number of alleles (a scalar storing a number) |
1559
|
|
|
|
|
|
|
Args : ref to AoA |
1560
|
|
|
|
|
|
|
Status : public |
1561
|
|
|
|
|
|
|
|
1562
|
|
|
|
|
|
|
=cut |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
#------------------------ |
1565
|
|
|
|
|
|
|
sub _alleles_number{ |
1566
|
|
|
|
|
|
|
#------------------------ |
1567
|
|
|
|
|
|
|
|
1568
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
1569
|
|
|
|
|
|
|
|
1570
|
1
|
|
|
|
|
1
|
my $hap_ref = $self ->{w_hap}; # working haplotype |
1571
|
|
|
|
|
|
|
|
1572
|
1
|
|
|
|
|
2
|
my $length = @{ @$hap_ref[0]}; # length of the haplotype |
|
1
|
|
|
|
|
4
|
|
1573
|
|
|
|
|
|
|
|
1574
|
1
|
|
|
|
|
4
|
for (my $c = 0; $c<$length;$c++){ |
1575
|
|
|
|
|
|
|
|
1576
|
5
|
|
|
|
|
7
|
my %alleles=(); |
1577
|
|
|
|
|
|
|
|
1578
|
5
|
|
|
|
|
11
|
for my $r (0..$#$hap_ref){ |
1579
|
20
|
|
|
|
|
26
|
$alleles{ $hap_ref->[$r][$c] } =1; # new key for every new snp |
1580
|
|
|
|
|
|
|
} |
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
# if the number of alleles for this column is |
1583
|
|
|
|
|
|
|
# greater than before set $m value as allele number |
1584
|
5
|
100
|
|
|
|
59
|
if ($self->{alleles_number} < keys %alleles) { |
1585
|
2
|
|
|
|
|
6
|
$self->{alleles_number} = keys %alleles; |
1586
|
|
|
|
|
|
|
} |
1587
|
|
|
|
|
|
|
} |
1588
|
|
|
|
|
|
|
} |
1589
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
|
1591
|
|
|
|
|
|
|
=head2 _htSNP |
1592
|
|
|
|
|
|
|
|
1593
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
Title : _htSNP |
1595
|
|
|
|
|
|
|
Usage : _htSNP() |
1596
|
|
|
|
|
|
|
Function: calculate the minimal set that contains all information of the |
1597
|
|
|
|
|
|
|
haplotype. |
1598
|
|
|
|
|
|
|
Returns : nonthing |
1599
|
|
|
|
|
|
|
Args : ref to an AoA and a ref to an array |
1600
|
|
|
|
|
|
|
Status : internal |
1601
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
=cut |
1603
|
|
|
|
|
|
|
|
1604
|
|
|
|
|
|
|
#------------------------ |
1605
|
|
|
|
|
|
|
sub _htSNP{ |
1606
|
|
|
|
|
|
|
#------------------------ |
1607
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1608
|
|
|
|
|
|
|
|
1609
|
1
|
|
|
|
|
2
|
my $hap = $self->{'w_hap'}; |
1610
|
1
|
|
|
|
|
1
|
my $type = $self->{'snp_type_code'}; |
1611
|
1
|
|
|
|
|
2
|
my $set = $self->{'ht_type'}; |
1612
|
1
|
|
|
|
|
2
|
my $out = []; # store the minimal set |
1613
|
|
|
|
|
|
|
|
1614
|
1
|
|
|
|
|
1
|
my $nc=0; # new column for the output values |
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
# pass for every value of the snp_type_code |
1617
|
1
|
|
|
|
|
2
|
for my $c (0..$#$type){ |
1618
|
|
|
|
|
|
|
|
1619
|
5
|
|
|
|
|
3
|
my $exist =0; |
1620
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
# every new value (not present) is pushed into set |
1622
|
5
|
100
|
|
|
|
8
|
if ( ! _is_there( $set,\$type->[$c] ) ){ |
1623
|
3
|
|
|
|
|
5
|
push @$set, $type->[$c]; |
1624
|
|
|
|
|
|
|
|
1625
|
3
|
|
|
|
|
2
|
$exist =1; |
1626
|
|
|
|
|
|
|
|
1627
|
3
|
|
|
|
|
6
|
for my $r(0..$#$hap){ |
1628
|
|
|
|
|
|
|
#save value of the snp for every SNP |
1629
|
12
|
|
|
|
|
17
|
$out->[$r][$nc]= $hap->[$r][$c]; |
1630
|
|
|
|
|
|
|
} |
1631
|
|
|
|
|
|
|
} |
1632
|
|
|
|
|
|
|
|
1633
|
5
|
100
|
|
|
|
10
|
if ($exist){ $nc++ }; |
|
3
|
|
|
|
|
5
|
|
1634
|
|
|
|
|
|
|
} |
1635
|
|
|
|
|
|
|
|
1636
|
1
|
|
|
|
|
2
|
@$hap = @{dclone $out}; |
|
1
|
|
|
|
|
22
|
|
1637
|
|
|
|
|
|
|
} |
1638
|
|
|
|
|
|
|
|
1639
|
|
|
|
|
|
|
=head2 _snp_and_code_summary |
1640
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
Title : _snp_and_code_summary |
1642
|
|
|
|
|
|
|
Usage : _snp_and_code_summary() |
1643
|
|
|
|
|
|
|
Function: compile on a list all SNP and the code for each. This |
1644
|
|
|
|
|
|
|
information can be also obtained combining snp_type and |
1645
|
|
|
|
|
|
|
snp_type_code but on these results the information about |
1646
|
|
|
|
|
|
|
the rest of SNP's are not compiled as table. |
1647
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
0 will be silent SNPs |
1649
|
|
|
|
|
|
|
-1 are degenerated SNPs |
1650
|
|
|
|
|
|
|
and the rest of positive values are the code for useful SNP |
1651
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
Returns : nonthing |
1653
|
|
|
|
|
|
|
Args : ref to an AoA and a ref to an array |
1654
|
|
|
|
|
|
|
Status : internal |
1655
|
|
|
|
|
|
|
|
1656
|
|
|
|
|
|
|
=cut |
1657
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
#------------------------ |
1659
|
|
|
|
|
|
|
sub _snp_and_code_summary{ |
1660
|
|
|
|
|
|
|
#------------------------ |
1661
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1662
|
|
|
|
|
|
|
|
1663
|
1
|
|
|
|
|
2
|
my $snp_type_code = $self->{'snp_type_code'}; |
1664
|
1
|
|
|
|
|
1
|
my $useful_snp = $self->{'snp_type'}->{'useful_snp'}; |
1665
|
1
|
|
|
|
|
14
|
my $silent_snp = $self->{'snp_type'}->{'silent_snp'}; |
1666
|
1
|
|
|
|
|
2
|
my $deg_snp = $self->{'snp_type'}->{'deg_snp'}; |
1667
|
1
|
|
|
|
|
3
|
my $snp_ids = $self->snp_ids(); |
1668
|
1
|
|
|
|
|
2
|
my $snp_and_code = $self->{'snp_and_code'}; |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
# walk all SNP's and generate code for each |
1671
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
# do a practical thing. Consider all snp silent |
1673
|
1
|
|
|
|
|
3
|
foreach my $i (0..$#$snp_ids){ |
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
# assign zero to silent |
1676
|
9
|
|
|
|
|
6
|
my $value=0; |
1677
|
|
|
|
|
|
|
|
1678
|
|
|
|
|
|
|
# active SNPs |
1679
|
9
|
|
|
|
|
9
|
foreach my $j (0..$#$useful_snp){ |
1680
|
35
|
100
|
|
|
|
49
|
if ($snp_ids->[$i] eq $useful_snp->[$j]){ |
1681
|
5
|
|
|
|
|
5
|
$value = $snp_type_code->[$j]; |
1682
|
5
|
|
|
|
|
5
|
last; |
1683
|
|
|
|
|
|
|
} |
1684
|
|
|
|
|
|
|
} |
1685
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
# assign -1 to degenerated |
1687
|
9
|
|
|
|
|
13
|
foreach my $j (0..$#$deg_snp){ |
1688
|
24
|
100
|
|
|
|
38
|
if ($snp_ids->[$i] eq $deg_snp->[$j]){ |
1689
|
3
|
|
|
|
|
4
|
$value = -1; |
1690
|
3
|
|
|
|
|
2
|
last; |
1691
|
|
|
|
|
|
|
} |
1692
|
|
|
|
|
|
|
} |
1693
|
|
|
|
|
|
|
|
1694
|
9
|
|
|
|
|
20
|
push @$snp_and_code, [$snp_ids->[$i], $value]; |
1695
|
|
|
|
|
|
|
|
1696
|
|
|
|
|
|
|
} |
1697
|
|
|
|
|
|
|
} |
1698
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
1; |
1701
|
|
|
|
|
|
|
|
1702
|
|
|
|
|
|
|
|