line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::Fasops::Common; |
2
|
26
|
|
|
26
|
|
408216
|
use strict; |
|
26
|
|
|
|
|
172
|
|
|
26
|
|
|
|
|
824
|
|
3
|
26
|
|
|
26
|
|
192
|
use warnings; |
|
26
|
|
|
|
|
55
|
|
|
26
|
|
|
|
|
703
|
|
4
|
26
|
|
|
26
|
|
3714
|
use autodie; |
|
26
|
|
|
|
|
109540
|
|
|
26
|
|
|
|
|
178
|
|
5
|
|
|
|
|
|
|
|
6
|
26
|
|
|
26
|
|
150123
|
use 5.018001; |
|
26
|
|
|
|
|
427
|
|
7
|
|
|
|
|
|
|
|
8
|
26
|
|
|
26
|
|
180
|
use Carp qw(); |
|
26
|
|
|
|
|
55
|
|
|
26
|
|
|
|
|
620
|
|
9
|
26
|
|
|
26
|
|
14436
|
use File::ShareDir qw(); |
|
26
|
|
|
|
|
561859
|
|
|
26
|
|
|
|
|
718
|
|
10
|
26
|
|
|
26
|
|
3884
|
use IO::Zlib; |
|
26
|
|
|
|
|
472608
|
|
|
26
|
|
|
|
|
213
|
|
11
|
26
|
|
|
26
|
|
18396
|
use IPC::Cmd qw(); |
|
26
|
|
|
|
|
1271426
|
|
|
26
|
|
|
|
|
916
|
|
12
|
26
|
|
|
26
|
|
248
|
use List::Util; |
|
26
|
|
|
|
|
76
|
|
|
26
|
|
|
|
|
1615
|
|
13
|
26
|
|
|
26
|
|
6438
|
use Path::Tiny qw(); |
|
26
|
|
|
|
|
77734
|
|
|
26
|
|
|
|
|
515
|
|
14
|
26
|
|
|
26
|
|
3847
|
use Tie::IxHash; |
|
26
|
|
|
|
|
20785
|
|
|
26
|
|
|
|
|
715
|
|
15
|
26
|
|
|
26
|
|
3470
|
use YAML::Syck qw(); |
|
26
|
|
|
|
|
13789
|
|
|
26
|
|
|
|
|
499
|
|
16
|
|
|
|
|
|
|
|
17
|
26
|
|
|
26
|
|
3864
|
use AlignDB::IntSpan; |
|
26
|
|
|
|
|
52043
|
|
|
26
|
|
|
|
|
606
|
|
18
|
26
|
|
|
26
|
|
3451
|
use App::RL::Common; |
|
26
|
|
|
|
|
139450
|
|
|
26
|
|
|
|
|
229541
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub mean (@) { |
21
|
24
|
|
|
24
|
0
|
58
|
@_ = grep { defined $_ } @_; |
|
59
|
|
|
|
|
144
|
|
22
|
24
|
50
|
|
|
|
69
|
return 0 unless @_; |
23
|
24
|
100
|
|
|
|
69
|
return $_[0] unless @_ > 1; |
24
|
12
|
|
|
|
|
72
|
return List::Util::sum(@_) / scalar(@_); |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub any (&@) { |
28
|
39682
|
|
|
39682
|
0
|
54663
|
my $f = shift; |
29
|
39682
|
|
|
|
|
59669
|
for (@_) { |
30
|
375847
|
100
|
|
|
|
502672
|
return 1 if $f->(); |
31
|
|
|
|
|
|
|
} |
32
|
38125
|
|
|
|
|
122935
|
return 0; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub all (&@) { |
36
|
42417
|
|
|
42417
|
0
|
58788
|
my $f = shift; |
37
|
42417
|
|
|
|
|
62496
|
for (@_) { |
38
|
384843
|
100
|
|
|
|
558519
|
return 0 unless $f->(); |
39
|
|
|
|
|
|
|
} |
40
|
40895
|
|
|
|
|
75918
|
return 1; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub uniq (@) { |
44
|
2340
|
|
|
2340
|
0
|
3034
|
my %seen = (); |
45
|
2340
|
|
|
|
|
3022
|
my $k; |
46
|
|
|
|
|
|
|
my $seen_undef; |
47
|
2340
|
50
|
|
|
|
3419
|
return grep { defined $_ ? not $seen{ $k = $_ }++ : not $seen_undef++ } @_; |
|
13472
|
|
|
|
|
31718
|
|
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub firstidx (&@) { |
51
|
12
|
|
|
12
|
0
|
28
|
my $f = shift; |
52
|
12
|
|
|
|
|
28
|
for my $i ( 0 .. $#_ ) { |
53
|
19
|
|
|
|
|
33
|
local *_ = \$_[$i]; |
54
|
19
|
100
|
|
|
|
32
|
return $i if $f->(); |
55
|
|
|
|
|
|
|
} |
56
|
0
|
|
|
|
|
0
|
return -1; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub read_replaces { |
60
|
3
|
|
|
3
|
0
|
5
|
my $file = shift; |
61
|
|
|
|
|
|
|
|
62
|
3
|
|
|
|
|
20
|
tie my %replace, "Tie::IxHash"; |
63
|
3
|
|
|
|
|
53
|
my @lines = Path::Tiny::path($file)->lines( { chomp => 1 } ); |
64
|
3
|
|
|
|
|
682
|
for (@lines) { |
65
|
5
|
|
|
|
|
76
|
my @fields = split /\t/; |
66
|
5
|
50
|
|
|
|
15
|
if ( @fields >= 1 ) { |
67
|
5
|
|
|
|
|
10
|
my $ori = shift @fields; |
68
|
5
|
|
|
|
|
29
|
$replace{$ori} = [@fields]; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
3
|
|
|
|
|
50
|
return \%replace; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub parse_block { |
76
|
112
|
|
|
112
|
0
|
481
|
my $block = shift; |
77
|
112
|
|
|
|
|
183
|
my $want_full = shift; |
78
|
|
|
|
|
|
|
|
79
|
112
|
|
|
|
|
2942
|
my @lines = grep {/\S/} split /\n/, $block; |
|
1074
|
|
|
|
|
2473
|
|
80
|
112
|
50
|
|
|
|
405
|
Carp::croak "Numbers of headers not equal to seqs\n" if @lines % 2; |
81
|
|
|
|
|
|
|
|
82
|
112
|
|
|
|
|
753
|
tie my %info_of, "Tie::IxHash"; |
83
|
112
|
|
|
|
|
1996
|
while (@lines) { |
84
|
537
|
|
|
|
|
7160
|
my $header = shift @lines; |
85
|
537
|
|
|
|
|
2064
|
$header =~ s/^\>//; |
86
|
537
|
|
|
|
|
1002
|
chomp $header; |
87
|
|
|
|
|
|
|
|
88
|
537
|
|
|
|
|
823
|
my $seq = shift @lines; |
89
|
537
|
|
|
|
|
705
|
chomp $seq; |
90
|
|
|
|
|
|
|
|
91
|
537
|
|
|
|
|
1270
|
my $info_ref = App::RL::Common::decode_header($header); |
92
|
537
|
|
|
|
|
83746
|
$info_ref->{seq} = $seq; |
93
|
537
|
100
|
66
|
|
|
7750
|
if ( $want_full or !defined $info_ref->{name} ) { |
94
|
364
|
|
|
|
|
843
|
my $ess_header = App::RL::Common::encode_header( $info_ref, 1 ); |
95
|
364
|
|
|
|
|
26787
|
$info_of{$ess_header} = $info_ref; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
else { |
98
|
173
|
|
|
|
|
1448
|
$info_of{ $info_ref->{name} } = $info_ref; |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
112
|
|
|
|
|
2018
|
return \%info_of; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub parse_block_array { |
106
|
10
|
|
|
10
|
0
|
20
|
my $block = shift; |
107
|
|
|
|
|
|
|
|
108
|
10
|
|
|
|
|
53
|
my @lines = grep {/\S/} split /\n/, $block; |
|
78
|
|
|
|
|
194
|
|
109
|
10
|
50
|
|
|
|
36
|
Carp::croak "Numbers of headers not equal to seqs\n" if @lines % 2; |
110
|
|
|
|
|
|
|
|
111
|
10
|
|
|
|
|
17
|
my $info_ary = []; |
112
|
10
|
|
|
|
|
32
|
while (@lines) { |
113
|
39
|
|
|
|
|
72
|
my $header = shift @lines; |
114
|
39
|
|
|
|
|
140
|
$header =~ s/^\>//; |
115
|
39
|
|
|
|
|
74
|
chomp $header; |
116
|
|
|
|
|
|
|
|
117
|
39
|
|
|
|
|
58
|
my $seq = shift @lines; |
118
|
39
|
|
|
|
|
56
|
chomp $seq; |
119
|
|
|
|
|
|
|
|
120
|
39
|
|
|
|
|
89
|
my $info_ref = App::RL::Common::decode_header($header); |
121
|
39
|
|
|
|
|
6556
|
$info_ref->{seq} = $seq; |
122
|
|
|
|
|
|
|
|
123
|
39
|
|
|
|
|
527
|
push @{$info_ary}, $info_ref; |
|
39
|
|
|
|
|
112
|
|
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
10
|
|
|
|
|
26
|
return $info_ary; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub parse_block_header { |
130
|
9
|
|
|
9
|
0
|
20
|
my $block = shift; |
131
|
|
|
|
|
|
|
|
132
|
9
|
|
|
|
|
44
|
my @lines = grep {/\S/} split /\n/, $block; |
|
72
|
|
|
|
|
165
|
|
133
|
9
|
50
|
|
|
|
29
|
Carp::croak "Numbers of headers not equal to seqs\n" if @lines % 2; |
134
|
|
|
|
|
|
|
|
135
|
9
|
|
|
|
|
42
|
tie my %info_of, "Tie::IxHash"; |
136
|
9
|
|
|
|
|
140
|
while (@lines) { |
137
|
36
|
|
|
|
|
401
|
my $header = shift @lines; |
138
|
36
|
|
|
|
|
148
|
$header =~ s/^\>//; |
139
|
36
|
|
|
|
|
64
|
chomp $header; |
140
|
|
|
|
|
|
|
|
141
|
36
|
|
|
|
|
52
|
my $seq = shift @lines; |
142
|
36
|
|
|
|
|
39
|
chomp $seq; |
143
|
|
|
|
|
|
|
|
144
|
36
|
|
|
|
|
84
|
my $info_ref = App::RL::Common::decode_header($header); |
145
|
36
|
|
|
|
|
5939
|
my $ess_header = App::RL::Common::encode_header( $info_ref, 1 ); |
146
|
36
|
|
|
|
|
2693
|
$info_ref->{seq} = $seq; |
147
|
36
|
|
|
|
|
528
|
$info_of{$ess_header} = $info_ref; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
9
|
|
|
|
|
138
|
return \%info_of; |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub parse_axt_block { |
154
|
6
|
|
|
6
|
0
|
12
|
my $block = shift; |
155
|
6
|
|
|
|
|
24
|
my $length_of = shift; |
156
|
|
|
|
|
|
|
|
157
|
6
|
|
|
|
|
27
|
my @lines = grep {/\S/} split /\n/, $block; |
|
18
|
|
|
|
|
63
|
|
158
|
6
|
50
|
|
|
|
15
|
Carp::croak "A block of axt should contain three lines\n" if @lines != 3; |
159
|
|
|
|
|
|
|
|
160
|
6
|
|
|
|
|
57
|
my (undef, $first_chr, $first_start, $first_end, $second_chr, |
161
|
|
|
|
|
|
|
$second_start, $second_end, $query_strand, undef, |
162
|
|
|
|
|
|
|
) = split /\s+/, $lines[0]; |
163
|
|
|
|
|
|
|
|
164
|
6
|
100
|
|
|
|
20
|
if ( $query_strand eq "-" ) { |
165
|
3
|
100
|
66
|
|
|
13
|
if ( defined $length_of and ref $length_of eq "HASH" ) { |
166
|
1
|
50
|
|
|
|
7
|
if ( exists $length_of->{$second_chr} ) { |
167
|
1
|
|
|
|
|
5
|
$second_start = $length_of->{$second_chr} - $second_start + 1; |
168
|
1
|
|
|
|
|
2
|
$second_end = $length_of->{$second_chr} - $second_end + 1; |
169
|
1
|
|
|
|
|
4
|
( $second_start, $second_end ) = ( $second_end, $second_start ); |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
6
|
|
|
|
|
50
|
my $info_refs = [ |
175
|
|
|
|
|
|
|
{ name => "target", |
176
|
|
|
|
|
|
|
chr => $first_chr, |
177
|
|
|
|
|
|
|
start => $first_start, |
178
|
|
|
|
|
|
|
end => $first_end, |
179
|
|
|
|
|
|
|
strand => "+", |
180
|
|
|
|
|
|
|
seq => $lines[1], |
181
|
|
|
|
|
|
|
}, |
182
|
|
|
|
|
|
|
{ name => "query", |
183
|
|
|
|
|
|
|
chr => $second_chr, |
184
|
|
|
|
|
|
|
start => $second_start, |
185
|
|
|
|
|
|
|
end => $second_end, |
186
|
|
|
|
|
|
|
strand => $query_strand, |
187
|
|
|
|
|
|
|
seq => $lines[2], |
188
|
|
|
|
|
|
|
}, |
189
|
|
|
|
|
|
|
]; |
190
|
|
|
|
|
|
|
|
191
|
6
|
|
|
|
|
21
|
return $info_refs; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub parse_maf_block { |
195
|
4
|
|
|
4
|
0
|
8
|
my $block = shift; |
196
|
|
|
|
|
|
|
|
197
|
4
|
|
|
|
|
19
|
my @lines = grep {/\S/} split /\n/, $block; |
|
16
|
|
|
|
|
58
|
|
198
|
4
|
50
|
|
|
|
14
|
Carp::croak "A block of maf should contain s lines\n" unless @lines > 0; |
199
|
|
|
|
|
|
|
|
200
|
4
|
|
|
|
|
27
|
tie my %info_of, "Tie::IxHash"; |
201
|
|
|
|
|
|
|
|
202
|
4
|
|
|
|
|
119
|
for my $sline (@lines) { |
203
|
16
|
|
|
|
|
297
|
my ( undef, $src, $start, $size, $strand, $srcSize, $text ) = split /\s+/, $sline; |
204
|
|
|
|
|
|
|
|
205
|
16
|
|
|
|
|
43
|
my ( $species, $chr_name ) = split /\./, $src; |
206
|
16
|
50
|
|
|
|
33
|
$chr_name = $species if !defined $chr_name; |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# adjust coordinates to be one-based inclusive |
209
|
16
|
|
|
|
|
33
|
$start = $start + 1; |
210
|
|
|
|
|
|
|
|
211
|
16
|
|
|
|
|
29
|
my $end = $start + $size - 1; |
212
|
|
|
|
|
|
|
|
213
|
16
|
100
|
|
|
|
32
|
if ( $strand eq "-" ) { |
214
|
4
|
|
|
|
|
9
|
$start = $srcSize - $start + 1; |
215
|
4
|
|
|
|
|
9
|
$end = $srcSize - $end + 1; |
216
|
4
|
|
|
|
|
7
|
( $start, $end ) = ( $end, $start ); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
16
|
|
|
|
|
109
|
$info_of{$species} = { |
220
|
|
|
|
|
|
|
name => $species, |
221
|
|
|
|
|
|
|
chr => $chr_name, |
222
|
|
|
|
|
|
|
start => $start, |
223
|
|
|
|
|
|
|
end => $end, |
224
|
|
|
|
|
|
|
strand => $strand, |
225
|
|
|
|
|
|
|
seq => $text, |
226
|
|
|
|
|
|
|
}; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
4
|
|
|
|
|
59
|
return \%info_of; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub revcom { |
233
|
10
|
|
|
10
|
0
|
2315
|
my $seq = shift; |
234
|
|
|
|
|
|
|
|
235
|
10
|
|
|
|
|
45
|
$seq =~ tr/ACGTMRWSYKVHDBNacgtmrwsykvhdbn-/TGCAKYWSRMBDHVNtgcakyswrmbdhvn-/; |
236
|
10
|
|
|
|
|
25
|
my $seq_rc = reverse $seq; |
237
|
|
|
|
|
|
|
|
238
|
10
|
|
|
|
|
29
|
return $seq_rc; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
sub seq_length { |
242
|
15
|
|
|
15
|
0
|
2149
|
my $seq = shift; |
243
|
|
|
|
|
|
|
|
244
|
15
|
|
|
|
|
31
|
my $gaps = $seq =~ tr/-/-/; |
245
|
|
|
|
|
|
|
|
246
|
15
|
|
|
|
|
55
|
return length($seq) - $gaps; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
#@returns AlignDB::IntSpan |
250
|
|
|
|
|
|
|
sub indel_intspan { |
251
|
196
|
|
|
196
|
0
|
5110
|
my $seq = shift; |
252
|
|
|
|
|
|
|
|
253
|
196
|
|
|
|
|
506
|
my $intspan = AlignDB::IntSpan->new(); |
254
|
196
|
|
|
|
|
1870
|
my $length = length($seq); |
255
|
|
|
|
|
|
|
|
256
|
196
|
|
|
|
|
278
|
my $offset = 0; |
257
|
196
|
|
|
|
|
268
|
my $start = 0; |
258
|
196
|
|
|
|
|
255
|
my $end = 0; |
259
|
196
|
|
|
|
|
384
|
for my $pos ( 1 .. $length ) { |
260
|
77221
|
|
|
|
|
101413
|
my $base = substr( $seq, $pos - 1, 1 ); |
261
|
77221
|
100
|
|
|
|
106079
|
if ( $base eq '-' ) { |
262
|
2262
|
100
|
|
|
|
3551
|
if ( $offset == 0 ) { |
263
|
528
|
|
|
|
|
663
|
$start = $pos; |
264
|
|
|
|
|
|
|
} |
265
|
2262
|
|
|
|
|
2972
|
$offset++; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
else { |
268
|
74959
|
100
|
|
|
|
111914
|
if ( $offset != 0 ) { |
269
|
517
|
|
|
|
|
627
|
$end = $pos - 1; |
270
|
517
|
|
|
|
|
1097
|
$intspan->add_pair( $start, $end ); |
271
|
|
|
|
|
|
|
} |
272
|
74959
|
|
|
|
|
132766
|
$offset = 0; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
} |
275
|
196
|
100
|
|
|
|
391
|
if ( $offset != 0 ) { |
276
|
11
|
|
|
|
|
23
|
$end = $length; |
277
|
11
|
|
|
|
|
31
|
$intspan->add_pair( $start, $end ); |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
196
|
|
|
|
|
1045
|
return $intspan; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
#@returns AlignDB::IntSpan |
284
|
|
|
|
|
|
|
sub seq_intspan { |
285
|
24
|
|
|
24
|
0
|
10992
|
my $seq = shift; |
286
|
|
|
|
|
|
|
|
287
|
24
|
|
|
|
|
107
|
my $intspan = AlignDB::IntSpan->new(); |
288
|
24
|
|
|
|
|
261
|
my $length = length($seq); |
289
|
24
|
|
|
|
|
77
|
$intspan->add_pair( 1, $length ); |
290
|
|
|
|
|
|
|
|
291
|
24
|
|
|
|
|
1456
|
$intspan->subtract( indel_intspan($seq) ); |
292
|
|
|
|
|
|
|
|
293
|
24
|
|
|
|
|
20142
|
return $intspan; |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub align_seqs { |
297
|
0
|
|
|
0
|
0
|
0
|
my $seq_refs = shift; |
298
|
0
|
|
0
|
|
|
0
|
my $aln_bin = shift // "mafft"; |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# get executable |
301
|
0
|
|
|
|
|
0
|
my $bin; |
302
|
|
|
|
|
|
|
|
303
|
0
|
0
|
|
|
|
0
|
if ( $aln_bin =~ /clus/i ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
304
|
0
|
|
|
|
|
0
|
$aln_bin = 'clustalw'; |
305
|
0
|
|
|
|
|
0
|
for my $e (qw{clustalw clustal-w clustalw2}) { |
306
|
0
|
0
|
|
|
|
0
|
if ( IPC::Cmd::can_run($e) ) { |
307
|
0
|
|
|
|
|
0
|
$bin = $e; |
308
|
0
|
|
|
|
|
0
|
last; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
elsif ( $aln_bin =~ /musc/i ) { |
313
|
0
|
|
|
|
|
0
|
$aln_bin = 'muscle'; |
314
|
0
|
|
|
|
|
0
|
for my $e (qw{muscle}) { |
315
|
0
|
0
|
|
|
|
0
|
if ( IPC::Cmd::can_run($e) ) { |
316
|
0
|
|
|
|
|
0
|
$bin = $e; |
317
|
0
|
|
|
|
|
0
|
last; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
elsif ( $aln_bin =~ /maff/i ) { |
322
|
0
|
|
|
|
|
0
|
$aln_bin = 'mafft'; |
323
|
0
|
|
|
|
|
0
|
for my $e (qw{mafft}) { |
324
|
0
|
0
|
|
|
|
0
|
if ( IPC::Cmd::can_run($e) ) { |
325
|
0
|
|
|
|
|
0
|
$bin = $e; |
326
|
0
|
|
|
|
|
0
|
last; |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
0
|
0
|
|
|
|
0
|
if ( !defined $bin ) { |
332
|
0
|
|
|
|
|
0
|
Carp::confess "Could not find the executable for $aln_bin\n"; |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# temp in and out |
336
|
|
|
|
|
|
|
#@type Path::Tiny |
337
|
0
|
|
|
|
|
0
|
my $temp_in = Path::Tiny->tempfile("seq_in_XXXXXXXX"); |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
#@type Path::Tiny |
340
|
0
|
|
|
|
|
0
|
my $temp_out = Path::Tiny->tempfile("seq_out_XXXXXXXX"); |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
# msa may change the order of sequences |
343
|
0
|
|
|
|
|
0
|
my @indexes = 0 .. scalar( @{$seq_refs} - 1 ); |
|
0
|
|
|
|
|
0
|
|
344
|
|
|
|
|
|
|
{ |
345
|
0
|
|
|
|
|
0
|
my $fh = $temp_in->openw(); |
|
0
|
|
|
|
|
0
|
|
346
|
0
|
|
|
|
|
0
|
for my $i (@indexes) { |
347
|
0
|
|
|
|
|
0
|
printf {$fh} ">seq_%d\n", $i; |
|
0
|
|
|
|
|
0
|
|
348
|
0
|
|
|
|
|
0
|
printf {$fh} "%s\n", $seq_refs->[$i]; |
|
0
|
|
|
|
|
0
|
|
349
|
|
|
|
|
|
|
} |
350
|
0
|
|
|
|
|
0
|
close $fh; |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
0
|
|
|
|
|
0
|
my @args; |
354
|
0
|
0
|
|
|
|
0
|
if ( $aln_bin eq "clustalw" ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
355
|
0
|
|
|
|
|
0
|
push @args, "-align -type=dna -output=fasta -outorder=input -quiet"; |
356
|
0
|
|
|
|
|
0
|
push @args, "-infile=" . $temp_in->absolute->stringify; |
357
|
0
|
|
|
|
|
0
|
push @args, "-outfile=" . $temp_out->absolute->stringify; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
elsif ( $aln_bin eq "muscle" ) { |
360
|
0
|
|
|
|
|
0
|
push @args, "-quiet"; |
361
|
0
|
|
|
|
|
0
|
push @args, "-in " . $temp_in->absolute->stringify; |
362
|
0
|
|
|
|
|
0
|
push @args, "-out " . $temp_out->absolute->stringify; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
elsif ( $aln_bin eq "mafft" ) { |
365
|
0
|
|
|
|
|
0
|
push @args, "--quiet"; |
366
|
0
|
|
|
|
|
0
|
push @args, "--auto"; |
367
|
0
|
|
|
|
|
0
|
push @args, $temp_in->absolute->stringify; |
368
|
0
|
|
|
|
|
0
|
push @args, "> " . $temp_out->absolute->stringify; |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
|
371
|
0
|
|
|
|
|
0
|
my $cmd_line = join " ", ( $bin, @args ); |
372
|
0
|
|
|
|
|
0
|
my $ok = IPC::Cmd::run( command => $cmd_line ); |
373
|
|
|
|
|
|
|
|
374
|
0
|
0
|
|
|
|
0
|
if ( !$ok ) { |
375
|
0
|
|
|
|
|
0
|
Carp::confess("$aln_bin call failed\n"); |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
0
|
|
|
|
|
0
|
my @aligned; |
379
|
0
|
|
|
|
|
0
|
my $seq_of = read_fasta( $temp_out->absolute->stringify ); |
380
|
0
|
|
|
|
|
0
|
for my $i (@indexes) { |
381
|
0
|
|
|
|
|
0
|
push @aligned, $seq_of->{ "seq_" . $i }; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# delete .dnd files created by clustalw |
385
|
|
|
|
|
|
|
#printf STDERR "%s\n", $temp_in->absolute->stringify; |
386
|
0
|
0
|
|
|
|
0
|
if ( $aln_bin eq "clustalw" ) { |
387
|
0
|
|
|
|
|
0
|
my $dnd = $temp_in->absolute->stringify . ".dnd"; |
388
|
0
|
|
|
|
|
0
|
path($dnd)->remove; |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
|
391
|
0
|
|
|
|
|
0
|
undef $temp_in; |
392
|
0
|
|
|
|
|
0
|
undef $temp_out; |
393
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
0
|
return \@aligned; |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
sub align_seqs_quick { |
398
|
0
|
|
|
0
|
0
|
0
|
my $seq_refs = shift; |
399
|
0
|
|
|
|
|
0
|
my $aln_prog = shift; |
400
|
0
|
|
|
|
|
0
|
my $indel_pad = shift; |
401
|
0
|
|
|
|
|
0
|
my $indel_fill = shift; |
402
|
|
|
|
|
|
|
|
403
|
0
|
0
|
|
|
|
0
|
if ( !defined $aln_prog ) { |
404
|
0
|
|
|
|
|
0
|
$aln_prog = "mafft"; |
405
|
|
|
|
|
|
|
} |
406
|
0
|
0
|
|
|
|
0
|
if ( !defined $indel_pad ) { |
407
|
0
|
|
|
|
|
0
|
$indel_pad = 50; |
408
|
|
|
|
|
|
|
} |
409
|
0
|
0
|
|
|
|
0
|
if ( !defined $indel_fill ) { |
410
|
0
|
|
|
|
|
0
|
$indel_fill = 50; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
0
|
|
|
|
|
0
|
my @aligned = @{$seq_refs}; |
|
0
|
|
|
|
|
0
|
|
414
|
0
|
|
|
|
|
0
|
my $seq_count = scalar @aligned; |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
# all indel regions |
417
|
0
|
|
|
|
|
0
|
my $realign_region = AlignDB::IntSpan->new(); |
418
|
0
|
|
|
|
|
0
|
for my $seq (@aligned) { |
419
|
0
|
|
|
|
|
0
|
my $indel_intspan = indel_intspan($seq); |
420
|
0
|
|
|
|
|
0
|
$indel_intspan = $indel_intspan->pad($indel_pad); |
421
|
0
|
|
|
|
|
0
|
$realign_region->merge($indel_intspan); |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# join adjacent realign regions |
425
|
0
|
|
|
|
|
0
|
$realign_region = $realign_region->fill($indel_fill); |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# realign all segments in realign_region |
428
|
0
|
|
|
|
|
0
|
for my $span ( reverse $realign_region->spans ) { |
429
|
0
|
|
|
|
|
0
|
my $seg_start = $span->[0]; |
430
|
0
|
|
|
|
|
0
|
my $seg_end = $span->[1]; |
431
|
|
|
|
|
|
|
|
432
|
0
|
|
|
|
|
0
|
my @segments; |
433
|
0
|
|
|
|
|
0
|
for my $i ( 0 .. $seq_count - 1 ) { |
434
|
0
|
|
|
|
|
0
|
my $seg = substr( $aligned[$i], $seg_start - 1, $seg_end - $seg_start + 1 ); |
435
|
0
|
|
|
|
|
0
|
push @segments, $seg; |
436
|
|
|
|
|
|
|
} |
437
|
0
|
|
|
|
|
0
|
my $realigned_segments = align_seqs( \@segments ); |
438
|
|
|
|
|
|
|
|
439
|
0
|
|
|
|
|
0
|
for my $i ( 0 .. $seq_count - 1 ) { |
440
|
0
|
|
|
|
|
0
|
my $seg = $realigned_segments->[$i]; |
441
|
0
|
|
|
|
|
0
|
$seg = uc $seg; |
442
|
0
|
|
|
|
|
0
|
substr( $aligned[$i], $seg_start - 1, $seg_end - $seg_start + 1, $seg ); |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
0
|
|
|
|
|
0
|
return \@aligned; |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
#----------------------------# |
450
|
|
|
|
|
|
|
# trim pure dash regions |
451
|
|
|
|
|
|
|
#----------------------------# |
452
|
|
|
|
|
|
|
sub trim_pure_dash { |
453
|
10
|
|
|
10
|
0
|
1654
|
my $seq_refs = shift; |
454
|
|
|
|
|
|
|
|
455
|
10
|
|
|
|
|
21
|
my $seq_count = @{$seq_refs}; |
|
10
|
|
|
|
|
22
|
|
456
|
10
|
|
|
|
|
22
|
my $seq_length = length $seq_refs->[0]; |
457
|
|
|
|
|
|
|
|
458
|
10
|
50
|
|
|
|
26
|
return unless $seq_length; |
459
|
|
|
|
|
|
|
|
460
|
10
|
|
|
|
|
72
|
my $trim_region = AlignDB::IntSpan->new(); |
461
|
|
|
|
|
|
|
|
462
|
10
|
|
|
|
|
144
|
for my $pos ( 1 .. $seq_length ) { |
463
|
528
|
|
|
|
|
1004
|
my @bases; |
464
|
528
|
|
|
|
|
822
|
for my $i ( 0 .. $seq_count - 1 ) { |
465
|
2062
|
|
|
|
|
2847
|
my $base = substr( $seq_refs->[$i], $pos - 1, 1 ); |
466
|
2062
|
|
|
|
|
3143
|
push @bases, $base; |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
|
469
|
528
|
100
|
|
544
|
|
1365
|
if ( all { $_ eq '-' } @bases ) { |
|
544
|
|
|
|
|
1973
|
|
470
|
4
|
|
|
|
|
13
|
$trim_region->add($pos); |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
10
|
|
|
|
|
53
|
for my $span ( reverse $trim_region->spans ) { |
475
|
3
|
|
|
|
|
54
|
my $seg_start = $span->[0]; |
476
|
3
|
|
|
|
|
7
|
my $seg_end = $span->[1]; |
477
|
|
|
|
|
|
|
|
478
|
3
|
|
|
|
|
7
|
for my $i ( 0 .. $seq_count - 1 ) { |
479
|
9
|
|
|
|
|
21
|
substr( $seq_refs->[$i], $seg_start - 1, $seg_end - $seg_start + 1, "" ); |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
10
|
|
|
|
|
261
|
return; |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
#----------------------------# |
487
|
|
|
|
|
|
|
# trim outgroup only sequence |
488
|
|
|
|
|
|
|
#----------------------------# |
489
|
|
|
|
|
|
|
# if intersect is superset of union |
490
|
|
|
|
|
|
|
# T G----C |
491
|
|
|
|
|
|
|
# Q G----C |
492
|
|
|
|
|
|
|
# O GAAAAC |
493
|
|
|
|
|
|
|
sub trim_outgroup { |
494
|
11
|
|
|
11
|
0
|
9059
|
my $seq_refs = shift; |
495
|
|
|
|
|
|
|
|
496
|
11
|
|
|
|
|
21
|
my $seq_count = scalar @{$seq_refs}; |
|
11
|
|
|
|
|
15
|
|
497
|
|
|
|
|
|
|
|
498
|
11
|
50
|
|
|
|
35
|
Carp::confess "Need three or more sequences\n" if $seq_count < 3; |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
# Don't expand indel set here. Last seq is outgroup |
501
|
11
|
|
|
|
|
16
|
my @indel_intspans; |
502
|
11
|
|
|
|
|
30
|
for my $i ( 0 .. $seq_count - 2 ) { |
503
|
22
|
|
|
|
|
43
|
my $indel_intspan = indel_intspan( $seq_refs->[$i] ); |
504
|
22
|
|
|
|
|
39
|
push @indel_intspans, $indel_intspan; |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
# find trim_region |
508
|
11
|
|
|
|
|
28
|
my $union_set = AlignDB::IntSpan::union(@indel_intspans); |
509
|
11
|
|
|
|
|
2112
|
my $intersect_set = AlignDB::IntSpan::intersect(@indel_intspans); |
510
|
|
|
|
|
|
|
|
511
|
11
|
|
|
|
|
3905
|
my $trim_region = AlignDB::IntSpan->new(); |
512
|
11
|
|
|
|
|
110
|
for my $span ( $union_set->runlists ) { |
513
|
23
|
100
|
|
|
|
3512
|
if ( $intersect_set->superset($span) ) { |
514
|
15
|
|
|
|
|
5399
|
$trim_region->add($span); |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# trim all segments in trim_region |
519
|
11
|
|
|
|
|
1759
|
for my $span ( reverse $trim_region->spans ) { |
520
|
13
|
|
|
|
|
215
|
my $seg_start = $span->[0]; |
521
|
13
|
|
|
|
|
21
|
my $seg_end = $span->[1]; |
522
|
|
|
|
|
|
|
|
523
|
13
|
|
|
|
|
28
|
for my $i ( 0 .. $seq_count - 1 ) { |
524
|
39
|
|
|
|
|
89
|
substr( $seq_refs->[$i], $seg_start - 1, $seg_end - $seg_start + 1, "" ); |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
|
528
|
11
|
|
|
|
|
78
|
return; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
#----------------------------# |
532
|
|
|
|
|
|
|
# record complex indels and ingroup indels |
533
|
|
|
|
|
|
|
#----------------------------# |
534
|
|
|
|
|
|
|
# All ingroup intersect sets are parts of complex indels after trim_outgroup() |
535
|
|
|
|
|
|
|
# intersect 4-6 |
536
|
|
|
|
|
|
|
# T GGA--C |
537
|
|
|
|
|
|
|
# Q G----C |
538
|
|
|
|
|
|
|
# O GGAGAC |
539
|
|
|
|
|
|
|
# result, complex_region 2-3 |
540
|
|
|
|
|
|
|
# T GGAC |
541
|
|
|
|
|
|
|
# Q G--C |
542
|
|
|
|
|
|
|
# O GGAC |
543
|
|
|
|
|
|
|
sub trim_complex_indel { |
544
|
6
|
|
|
6
|
0
|
27
|
my $seq_refs = shift; |
545
|
|
|
|
|
|
|
|
546
|
6
|
|
|
|
|
10
|
my $seq_count = scalar @{$seq_refs}; |
|
6
|
|
|
|
|
11
|
|
547
|
6
|
|
|
|
|
16
|
my @ingroup_idx = ( 0 .. $seq_count - 2 ); |
548
|
|
|
|
|
|
|
|
549
|
6
|
50
|
|
|
|
13
|
Carp::confess "Need three or more sequences\n" if $seq_count < 3; |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
# Don't expand indel set here. Last seq is outgroup |
552
|
6
|
|
|
|
|
12
|
my @indel_intspans; |
553
|
6
|
|
|
|
|
9
|
for my $i (@ingroup_idx) { |
554
|
12
|
|
|
|
|
28
|
my $indel_intspan = indel_intspan( $seq_refs->[$i] ); |
555
|
12
|
|
|
|
|
29
|
push @indel_intspans, $indel_intspan; |
556
|
|
|
|
|
|
|
} |
557
|
6
|
|
|
|
|
26
|
my $outgroup_indel_intspan = indel_intspan( $seq_refs->[ $seq_count - 1 ] ); |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# find trim_region |
560
|
6
|
|
|
|
|
13
|
my $union_set = AlignDB::IntSpan::union(@indel_intspans); |
561
|
6
|
|
|
|
|
621
|
my $intersect_set = AlignDB::IntSpan::intersect(@indel_intspans); |
562
|
|
|
|
|
|
|
|
563
|
6
|
|
|
|
|
1317
|
my $complex_region = AlignDB::IntSpan->new(); |
564
|
6
|
|
|
|
|
60
|
for ( reverse $intersect_set->spans ) { |
565
|
3
|
|
|
|
|
61
|
my $seg_start = $_->[0]; |
566
|
3
|
|
|
|
|
7
|
my $seg_end = $_->[1]; |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
# trim sequence, including outgroup |
569
|
3
|
|
|
|
|
26
|
for my $i ( 0 .. $seq_count - 1 ) { |
570
|
9
|
|
|
|
|
21
|
substr( $seq_refs->[$i], $seg_start - 1, $seg_end - $seg_start + 1, "" ); |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# add to complex_region |
574
|
3
|
|
|
|
|
13
|
for my $span ( $union_set->runlists ) { |
575
|
4
|
|
|
|
|
209
|
my $sub_union_set = AlignDB::IntSpan->new($span); |
576
|
4
|
100
|
|
|
|
405
|
if ( $sub_union_set->superset("$seg_start-$seg_end") ) { |
577
|
3
|
|
|
|
|
1016
|
$complex_region->merge($sub_union_set); |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
} |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
# modify all related set |
582
|
3
|
|
|
|
|
527
|
$union_set = $union_set->banish_span( $seg_start, $seg_end ); |
583
|
3
|
|
|
|
|
511
|
for (@ingroup_idx) { |
584
|
6
|
|
|
|
|
353
|
$indel_intspans[$_] |
585
|
|
|
|
|
|
|
= $indel_intspans[$_]->banish_span( $seg_start, $seg_end ); |
586
|
|
|
|
|
|
|
} |
587
|
3
|
|
|
|
|
213
|
$outgroup_indel_intspan->banish_span( $seg_start, $seg_end ); |
588
|
3
|
|
|
|
|
339
|
$complex_region = $complex_region->banish_span( $seg_start, $seg_end ); |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
# add ingroup-outgroup complex indels to complex_region |
592
|
6
|
|
|
|
|
386
|
for my $i (@ingroup_idx) { |
593
|
12
|
|
|
|
|
1244
|
my $outgroup_intersect_intspan = $outgroup_indel_intspan->intersect( $indel_intspans[$i] ); |
594
|
12
|
|
|
|
|
2543
|
for my $sub_out_set ( $outgroup_intersect_intspan->sets ) { |
595
|
4
|
|
|
|
|
549
|
for my $sub_union_set ( $union_set->sets ) { |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# union_set > intersect_set |
598
|
5
|
50
|
|
|
|
925
|
if ( $sub_union_set->larger_than($sub_out_set) ) { |
599
|
0
|
|
|
|
|
0
|
$complex_region->merge($sub_union_set); |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
|
605
|
6
|
|
|
|
|
97
|
return $complex_region->runlist; |
606
|
|
|
|
|
|
|
} |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
# read normal fasta files |
609
|
|
|
|
|
|
|
sub read_fasta { |
610
|
1
|
|
|
1
|
0
|
569
|
my $infile = shift; |
611
|
|
|
|
|
|
|
|
612
|
1
|
|
|
|
|
3
|
my ( @names, %seqs ); |
613
|
|
|
|
|
|
|
|
614
|
1
|
|
|
|
|
14
|
my $in_fh = IO::Zlib->new( $infile, "rb" ); |
615
|
|
|
|
|
|
|
|
616
|
1
|
|
|
|
|
2050
|
my $cur_name; |
617
|
1
|
|
|
|
|
5
|
while ( my $line = $in_fh->getline ) { |
618
|
8
|
|
|
|
|
1299
|
chomp $line; |
619
|
8
|
50
|
33
|
|
|
47
|
if ( $line eq '' or substr( $line, 0, 1 ) eq " " ) { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
620
|
0
|
|
|
|
|
0
|
next; |
621
|
|
|
|
|
|
|
} |
622
|
|
|
|
|
|
|
elsif ( substr( $line, 0, 1 ) eq "#" ) { |
623
|
0
|
|
|
|
|
0
|
next; |
624
|
|
|
|
|
|
|
} |
625
|
|
|
|
|
|
|
elsif ( substr( $line, 0, 1 ) eq ">" ) { |
626
|
4
|
|
|
|
|
17
|
($cur_name) = split /\s+/, $line; |
627
|
4
|
|
|
|
|
15
|
$cur_name =~ s/^>//; |
628
|
4
|
|
|
|
|
9
|
push @names, $cur_name; |
629
|
4
|
|
|
|
|
17
|
$seqs{$cur_name} = ''; |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
else { |
632
|
4
|
|
|
|
|
29
|
$seqs{$cur_name} .= $line; |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
|
636
|
1
|
|
|
|
|
282
|
$in_fh->close; |
637
|
|
|
|
|
|
|
|
638
|
1
|
|
|
|
|
209
|
tie my %seq_of, "Tie::IxHash"; |
639
|
1
|
|
|
|
|
23
|
for my $name (@names) { |
640
|
4
|
|
|
|
|
67
|
$seq_of{$name} = $seqs{$name}; |
641
|
|
|
|
|
|
|
} |
642
|
1
|
|
|
|
|
19
|
return \%seq_of; |
643
|
|
|
|
|
|
|
} |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
sub calc_gc_ratio { |
646
|
13
|
|
|
13
|
0
|
5498
|
my $seq_refs = shift; |
647
|
|
|
|
|
|
|
|
648
|
13
|
|
|
|
|
26
|
my $seq_count = scalar @{$seq_refs}; |
|
13
|
|
|
|
|
21
|
|
649
|
|
|
|
|
|
|
|
650
|
13
|
|
|
|
|
19
|
my @ratios; |
651
|
13
|
|
|
|
|
34
|
for my $i ( 0 .. $seq_count - 1 ) { |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# Count all four bases |
654
|
17
|
|
|
|
|
38
|
my $a_count = $seq_refs->[$i] =~ tr/Aa/Aa/; |
655
|
17
|
|
|
|
|
30
|
my $g_count = $seq_refs->[$i] =~ tr/Gg/Gg/; |
656
|
17
|
|
|
|
|
28
|
my $c_count = $seq_refs->[$i] =~ tr/Cc/Cc/; |
657
|
17
|
|
|
|
|
28
|
my $t_count = $seq_refs->[$i] =~ tr/Tt/Tt/; |
658
|
|
|
|
|
|
|
|
659
|
17
|
|
|
|
|
32
|
my $four_count = $a_count + $g_count + $c_count + $t_count; |
660
|
17
|
|
|
|
|
25
|
my $gc_count = $g_count + $c_count; |
661
|
|
|
|
|
|
|
|
662
|
17
|
50
|
|
|
|
29
|
if ( $four_count == 0 ) { |
663
|
0
|
|
|
|
|
0
|
next; |
664
|
|
|
|
|
|
|
} |
665
|
|
|
|
|
|
|
else { |
666
|
17
|
|
|
|
|
28
|
my $gc_ratio = $gc_count / $four_count; |
667
|
17
|
|
|
|
|
37
|
push @ratios, $gc_ratio; |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
|
671
|
13
|
|
|
|
|
34
|
return mean(@ratios); |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
sub pair_D { |
675
|
60
|
|
|
60
|
0
|
446
|
my $seq_refs = shift; |
676
|
|
|
|
|
|
|
|
677
|
60
|
|
|
|
|
80
|
my $seq_count = scalar @{$seq_refs}; |
|
60
|
|
|
|
|
112
|
|
678
|
60
|
50
|
|
|
|
128
|
if ( $seq_count != 2 ) { |
679
|
0
|
|
|
|
|
0
|
Carp::confess "Need two sequences\n"; |
680
|
|
|
|
|
|
|
} |
681
|
|
|
|
|
|
|
|
682
|
60
|
|
|
|
|
119
|
my $legnth = length $seq_refs->[0]; |
683
|
|
|
|
|
|
|
|
684
|
60
|
|
|
|
|
123
|
my ( $comparable_bases, $differences ) = (0) x 2; |
685
|
|
|
|
|
|
|
|
686
|
60
|
|
|
|
|
109
|
for my $pos ( 1 .. $legnth ) { |
687
|
4943
|
|
|
|
|
7884
|
my $base0 = substr $seq_refs->[0], $pos - 1, 1; |
688
|
4943
|
|
|
|
|
7684
|
my $base1 = substr $seq_refs->[1], $pos - 1, 1; |
689
|
|
|
|
|
|
|
|
690
|
4943
|
100
|
100
|
|
|
16664
|
if ( $base0 =~ /[atcg]/i |
691
|
|
|
|
|
|
|
&& $base1 =~ /[atcg]/i ) |
692
|
|
|
|
|
|
|
{ |
693
|
4858
|
|
|
|
|
6135
|
$comparable_bases++; |
694
|
4858
|
100
|
|
|
|
8643
|
if ( $base0 ne $base1 ) { |
695
|
915
|
|
|
|
|
1386
|
$differences++; |
696
|
|
|
|
|
|
|
} |
697
|
|
|
|
|
|
|
} |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
|
700
|
60
|
50
|
|
|
|
140
|
if ( $comparable_bases == 0 ) { |
701
|
0
|
|
|
|
|
0
|
Carp::carp "comparable_bases == 0\n"; |
702
|
0
|
|
|
|
|
0
|
return 0; |
703
|
|
|
|
|
|
|
} |
704
|
|
|
|
|
|
|
else { |
705
|
60
|
|
|
|
|
178
|
return $differences / $comparable_bases; |
706
|
|
|
|
|
|
|
} |
707
|
|
|
|
|
|
|
} |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
# Split D value to D1 (substitutions in first_seq), D2( substitutions in second_seq) and Dcomplex |
710
|
|
|
|
|
|
|
# (substitutions can't be referred) |
711
|
|
|
|
|
|
|
sub ref_pair_D { |
712
|
7
|
|
|
7
|
0
|
2865
|
my $seq_refs = shift; # first, second, outgroup |
713
|
|
|
|
|
|
|
|
714
|
7
|
|
|
|
|
31
|
my $seq_count = scalar @{$seq_refs}; |
|
7
|
|
|
|
|
12
|
|
715
|
7
|
50
|
|
|
|
20
|
if ( $seq_count != 3 ) { |
716
|
0
|
|
|
|
|
0
|
Carp::confess "Need three sequences\n"; |
717
|
|
|
|
|
|
|
} |
718
|
|
|
|
|
|
|
|
719
|
7
|
|
|
|
|
17
|
my ( $d1, $d2, $dc ) = (0) x 3; |
720
|
7
|
|
|
|
|
14
|
my $length = length $seq_refs->[0]; |
721
|
|
|
|
|
|
|
|
722
|
7
|
50
|
|
|
|
16
|
return ( $d1, $d2, $dc ) if $length == 0; |
723
|
|
|
|
|
|
|
|
724
|
7
|
|
|
|
|
14
|
for my $pos ( 1 .. $length ) { |
725
|
70
|
|
|
|
|
107
|
my $base0 = substr $seq_refs->[0], $pos - 1, 1; |
726
|
70
|
|
|
|
|
103
|
my $base1 = substr $seq_refs->[1], $pos - 1, 1; |
727
|
70
|
|
|
|
|
100
|
my $base_og = substr $seq_refs->[2], $pos - 1, 1; |
728
|
70
|
100
|
|
|
|
127
|
if ( $base0 ne $base1 ) { |
729
|
8
|
100
|
33
|
|
|
61
|
if ( $base0 =~ /[atcg]/i |
|
|
|
66
|
|
|
|
|
730
|
|
|
|
|
|
|
&& $base1 =~ /[atcg]/i |
731
|
|
|
|
|
|
|
&& $base_og =~ /[atcg]/i ) |
732
|
|
|
|
|
|
|
{ |
733
|
7
|
100
|
|
|
|
18
|
if ( $base1 eq $base_og ) { |
|
|
100
|
|
|
|
|
|
734
|
3
|
|
|
|
|
7
|
$d1++; |
735
|
|
|
|
|
|
|
} |
736
|
|
|
|
|
|
|
elsif ( $base0 eq $base_og ) { |
737
|
3
|
|
|
|
|
6
|
$d2++; |
738
|
|
|
|
|
|
|
} |
739
|
|
|
|
|
|
|
else { |
740
|
1
|
|
|
|
|
3
|
$dc++; |
741
|
|
|
|
|
|
|
} |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
else { |
744
|
1
|
|
|
|
|
3
|
$dc++; |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
} |
747
|
|
|
|
|
|
|
} |
748
|
|
|
|
|
|
|
|
749
|
7
|
|
|
|
|
14
|
for ( $d1, $d2, $dc ) { |
750
|
21
|
|
|
|
|
34
|
$_ /= $length; |
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
|
753
|
7
|
|
|
|
|
21
|
return ( $d1, $d2, $dc ); |
754
|
|
|
|
|
|
|
} |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
sub multi_seq_stat { |
757
|
11
|
|
|
11
|
0
|
1290
|
my $seq_refs = shift; |
758
|
|
|
|
|
|
|
|
759
|
11
|
|
|
|
|
19
|
my $seq_count = scalar @{$seq_refs}; |
|
11
|
|
|
|
|
22
|
|
760
|
11
|
50
|
|
|
|
31
|
if ( $seq_count < 2 ) { |
761
|
0
|
|
|
|
|
0
|
Carp::confess "Need two or more sequences\n"; |
762
|
|
|
|
|
|
|
} |
763
|
|
|
|
|
|
|
|
764
|
11
|
|
|
|
|
28
|
my $legnth = length $seq_refs->[0]; |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
# For every positions, search for polymorphism_site |
767
|
11
|
|
|
|
|
35
|
my ( $comparable_bases, $identities, $differences, $gaps, $ns, $align_errors, ) = (0) x 6; |
768
|
11
|
|
|
|
|
32
|
for my $pos ( 1 .. $legnth ) { |
769
|
746
|
|
|
|
|
1094
|
my @bases = (); |
770
|
746
|
|
|
|
|
1337
|
for my $i ( 0 .. $seq_count - 1 ) { |
771
|
2645
|
|
|
|
|
4151
|
my $base = substr( $seq_refs->[$i], $pos - 1, 1 ); |
772
|
2645
|
|
|
|
|
4494
|
push @bases, $base; |
773
|
|
|
|
|
|
|
} |
774
|
746
|
|
|
|
|
1187
|
@bases = uniq(@bases); |
775
|
|
|
|
|
|
|
|
776
|
746
|
100
|
|
991
|
|
2184
|
if ( all { $_ =~ /[agct]/i } @bases ) { |
|
991
|
100
|
|
|
|
3390
|
|
777
|
725
|
|
|
|
|
969
|
$comparable_bases++; |
778
|
725
|
100
|
|
936
|
|
1832
|
if ( all { $_ eq $bases[0] } @bases ) { |
|
936
|
|
|
|
|
2832
|
|
779
|
514
|
|
|
|
|
1380
|
$identities++; |
780
|
|
|
|
|
|
|
} |
781
|
|
|
|
|
|
|
else { |
782
|
211
|
|
|
|
|
621
|
$differences++; |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
} |
785
|
41
|
|
|
41
|
|
115
|
elsif ( any { $_ eq '-' } @bases ) { |
786
|
20
|
|
|
|
|
61
|
$gaps++; |
787
|
|
|
|
|
|
|
} |
788
|
|
|
|
|
|
|
else { |
789
|
1
|
|
|
|
|
4
|
$ns++; |
790
|
|
|
|
|
|
|
} |
791
|
|
|
|
|
|
|
} |
792
|
11
|
50
|
|
|
|
29
|
if ( $comparable_bases == 0 ) { |
793
|
0
|
|
|
|
|
0
|
print YAML::Syck::Dump { seqs => $seq_refs, }; |
794
|
0
|
|
|
|
|
0
|
Carp::carp "number_of_comparable_bases == 0!!\n"; |
795
|
0
|
|
|
|
|
0
|
return [ $legnth, $comparable_bases, $identities, $differences, |
796
|
|
|
|
|
|
|
$gaps, $ns, $legnth, undef, ]; |
797
|
|
|
|
|
|
|
} |
798
|
|
|
|
|
|
|
|
799
|
11
|
|
|
|
|
24
|
my @all_Ds; |
800
|
11
|
|
|
|
|
31
|
for ( my $i = 0; $i < $seq_count; $i++ ) { |
801
|
35
|
|
|
|
|
104
|
for ( my $j = $i + 1; $j < $seq_count; $j++ ) { |
802
|
42
|
|
|
|
|
132
|
my $D = pair_D( [ $seq_refs->[$i], $seq_refs->[$j] ] ); |
803
|
42
|
|
|
|
|
137
|
push @all_Ds, $D; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
} |
806
|
|
|
|
|
|
|
|
807
|
11
|
|
|
|
|
32
|
my $D = mean(@all_Ds); |
808
|
|
|
|
|
|
|
|
809
|
11
|
|
|
|
|
65
|
return [ $legnth, $comparable_bases, $identities, $differences, |
810
|
|
|
|
|
|
|
$gaps, $ns, $align_errors, $D, ]; |
811
|
|
|
|
|
|
|
} |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
sub get_snps { |
814
|
37
|
|
|
37
|
0
|
2978
|
my $seq_refs = shift; |
815
|
|
|
|
|
|
|
|
816
|
37
|
|
|
|
|
80
|
my $align_length = length $seq_refs->[0]; |
817
|
37
|
|
|
|
|
61
|
my $seq_count = scalar @{$seq_refs}; |
|
37
|
|
|
|
|
56
|
|
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
# SNPs |
820
|
37
|
|
|
|
|
67
|
my $snp_bases_of = {}; |
821
|
37
|
|
|
|
|
102
|
for my $pos ( 1 .. $align_length ) { |
822
|
40418
|
|
|
|
|
52662
|
my @bases; |
823
|
40418
|
|
|
|
|
66312
|
for my $i ( 0 .. $seq_count - 1 ) { |
824
|
387181
|
|
|
|
|
535967
|
my $base = substr( $seq_refs->[$i], $pos - 1, 1 ); |
825
|
387181
|
|
|
|
|
582042
|
push @bases, $base; |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
|
828
|
40418
|
100
|
|
382372
|
|
96073
|
if ( all { $_ =~ /[agct]/i } @bases ) { |
|
382372
|
|
|
|
|
950578
|
|
829
|
39652
|
100
|
|
375770
|
|
94489
|
if ( any { $_ ne $bases[0] } @bases ) { |
|
375770
|
|
|
|
|
765597
|
|
830
|
1537
|
|
|
|
|
7509
|
$snp_bases_of->{$pos} = \@bases; |
831
|
|
|
|
|
|
|
} |
832
|
|
|
|
|
|
|
} |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
|
835
|
37
|
|
|
|
|
61
|
my @sites; |
836
|
37
|
|
|
|
|
65
|
for my $pos ( sort { $a <=> $b } keys %{$snp_bases_of} ) { |
|
9498
|
|
|
|
|
10991
|
|
|
37
|
|
|
|
|
564
|
|
837
|
|
|
|
|
|
|
|
838
|
1537
|
|
|
|
|
2218
|
my @bases = @{ $snp_bases_of->{$pos} }; |
|
1537
|
|
|
|
|
4146
|
|
839
|
|
|
|
|
|
|
|
840
|
1537
|
|
|
|
|
2131
|
my $target_base = $bases[0]; |
841
|
1537
|
|
|
|
|
2594
|
my $all_bases = join '', @bases; |
842
|
|
|
|
|
|
|
|
843
|
1537
|
|
|
|
|
2083
|
my $query_base; |
844
|
|
|
|
|
|
|
my $mutant_to; |
845
|
1537
|
|
|
|
|
1858
|
my $snp_freq = 0; |
846
|
1537
|
|
|
|
|
1734
|
my $snp_occured; |
847
|
1537
|
|
|
|
|
3794
|
my @class = uniq( sort @bases ); |
848
|
1537
|
50
|
|
|
|
3152
|
if ( scalar @class < 2 ) { |
|
|
100
|
|
|
|
|
|
849
|
0
|
|
|
|
|
0
|
Carp::confess "no snp\n"; |
850
|
|
|
|
|
|
|
} |
851
|
|
|
|
|
|
|
elsif ( scalar @class > 2 ) { |
852
|
77
|
|
|
|
|
98
|
$snp_freq = -1; |
853
|
77
|
|
|
|
|
104
|
$snp_occured = 'unknown'; |
854
|
77
|
|
|
|
|
100
|
$query_base = ""; |
855
|
77
|
|
|
|
|
109
|
$mutant_to = ""; |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
else { |
858
|
1460
|
|
|
|
|
2155
|
for (@bases) { |
859
|
10170
|
100
|
|
|
|
13824
|
if ( $target_base ne $_ ) { |
860
|
3527
|
|
|
|
|
4235
|
$snp_freq++; |
861
|
3527
|
|
|
|
|
4726
|
$snp_occured .= '0'; |
862
|
|
|
|
|
|
|
} |
863
|
|
|
|
|
|
|
else { |
864
|
6643
|
|
|
|
|
8762
|
$snp_occured .= '1'; |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
} |
867
|
1460
|
|
|
|
|
1925
|
($query_base) = grep { $_ ne $target_base } @bases; |
|
10170
|
|
|
|
|
15776
|
|
868
|
1460
|
|
|
|
|
2259
|
$mutant_to = $target_base . '<->' . $query_base; |
869
|
|
|
|
|
|
|
} |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# here freq is the minor allele freq |
872
|
1537
|
|
|
|
|
2829
|
$snp_freq = List::Util::min( $snp_freq, $seq_count - $snp_freq ); |
873
|
|
|
|
|
|
|
|
874
|
1537
|
|
|
|
|
8266
|
push @sites, |
875
|
|
|
|
|
|
|
{ |
876
|
|
|
|
|
|
|
snp_pos => $pos, |
877
|
|
|
|
|
|
|
snp_target_base => $target_base, |
878
|
|
|
|
|
|
|
snp_query_base => $query_base, |
879
|
|
|
|
|
|
|
snp_all_bases => $all_bases, |
880
|
|
|
|
|
|
|
snp_mutant_to => $mutant_to, |
881
|
|
|
|
|
|
|
snp_freq => $snp_freq, |
882
|
|
|
|
|
|
|
snp_occured => $snp_occured, |
883
|
|
|
|
|
|
|
}; |
884
|
|
|
|
|
|
|
} |
885
|
|
|
|
|
|
|
|
886
|
37
|
|
|
|
|
1162
|
return \@sites; |
887
|
|
|
|
|
|
|
} |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
sub polarize_snp { |
890
|
13
|
|
|
13
|
0
|
27
|
my $sites = shift; |
891
|
13
|
|
|
|
|
22
|
my $outgroup_seq = shift; |
892
|
|
|
|
|
|
|
|
893
|
13
|
|
|
|
|
20
|
for my $site ( @{$sites} ) { |
|
13
|
|
|
|
|
28
|
|
894
|
195
|
|
|
|
|
323
|
my $outgroup_base = substr $outgroup_seq, $site->{snp_pos} - 1, 1; |
895
|
|
|
|
|
|
|
|
896
|
195
|
|
|
|
|
386
|
my @nts = split '', $site->{snp_all_bases}; |
897
|
195
|
|
|
|
|
237
|
my @class; |
898
|
195
|
|
|
|
|
269
|
for my $nt (@nts) { |
899
|
582
|
|
|
|
|
688
|
my $class_bool = 0; |
900
|
582
|
|
|
|
|
732
|
for (@class) { |
901
|
571
|
100
|
|
|
|
976
|
if ( $_ eq $nt ) { $class_bool = 1; } |
|
188
|
|
|
|
|
342
|
|
902
|
|
|
|
|
|
|
} |
903
|
582
|
100
|
|
|
|
899
|
unless ($class_bool) { |
904
|
394
|
|
|
|
|
698
|
push @class, $nt; |
905
|
|
|
|
|
|
|
} |
906
|
|
|
|
|
|
|
} |
907
|
|
|
|
|
|
|
|
908
|
195
|
|
|
|
|
254
|
my ( $mutant_to, $snp_freq, $snp_occured ); |
909
|
|
|
|
|
|
|
|
910
|
195
|
50
|
|
|
|
370
|
if ( scalar @class < 2 ) { |
|
|
100
|
|
|
|
|
|
911
|
0
|
|
|
|
|
0
|
Carp::confess "Not a real SNP\n"; |
912
|
|
|
|
|
|
|
} |
913
|
|
|
|
|
|
|
elsif ( scalar @class == 2 ) { |
914
|
191
|
|
|
|
|
241
|
for my $nt (@nts) { |
915
|
570
|
100
|
|
|
|
790
|
if ( $nt eq $outgroup_base ) { |
916
|
255
|
|
|
|
|
355
|
$snp_occured .= '0'; |
917
|
|
|
|
|
|
|
} |
918
|
|
|
|
|
|
|
else { |
919
|
315
|
|
|
|
|
380
|
$snp_occured .= '1'; |
920
|
315
|
|
|
|
|
365
|
$snp_freq++; |
921
|
315
|
|
|
|
|
464
|
$mutant_to = "$outgroup_base->$nt"; |
922
|
|
|
|
|
|
|
} |
923
|
|
|
|
|
|
|
} |
924
|
|
|
|
|
|
|
} |
925
|
|
|
|
|
|
|
else { |
926
|
4
|
|
|
|
|
6
|
$snp_freq = -1; |
927
|
4
|
|
|
|
|
7
|
$mutant_to = 'Complex'; |
928
|
4
|
|
|
|
|
9
|
$snp_occured = 'unknown'; |
929
|
|
|
|
|
|
|
} |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
# outgroup_base is not equal to any nts |
932
|
195
|
100
|
|
|
|
408
|
if ( $snp_occured eq ( '1' x ( length $snp_occured ) ) ) { |
933
|
28
|
|
|
|
|
42
|
$snp_freq = -1; |
934
|
28
|
|
|
|
|
38
|
$mutant_to = 'Complex'; |
935
|
28
|
|
|
|
|
35
|
$snp_occured = 'unknown'; |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
|
938
|
195
|
|
|
|
|
411
|
$site->{snp_outgroup_base} = $outgroup_base; |
939
|
195
|
|
|
|
|
252
|
$site->{snp_mutant_to} = $mutant_to; |
940
|
195
|
|
|
|
|
240
|
$site->{snp_freq} = $snp_freq; |
941
|
195
|
|
|
|
|
380
|
$site->{snp_occured} = $snp_occured; |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
|
944
|
13
|
|
|
|
|
38
|
return $sites; |
945
|
|
|
|
|
|
|
} |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
sub get_indels { |
948
|
24
|
|
|
24
|
0
|
2012
|
my $seq_refs = shift; |
949
|
|
|
|
|
|
|
|
950
|
24
|
|
|
|
|
37
|
my $seq_count = scalar @{$seq_refs}; |
|
24
|
|
|
|
|
39
|
|
951
|
|
|
|
|
|
|
|
952
|
24
|
|
|
|
|
166
|
my $indel_set = AlignDB::IntSpan->new(); |
953
|
24
|
|
|
|
|
332
|
for my $i ( 0 .. $seq_count - 1 ) { |
954
|
87
|
|
|
|
|
3743
|
my $seq_indel_set = indel_intspan( $seq_refs->[$i] ); |
955
|
87
|
|
|
|
|
227
|
$indel_set->merge($seq_indel_set); |
956
|
|
|
|
|
|
|
} |
957
|
|
|
|
|
|
|
|
958
|
24
|
|
|
|
|
3127
|
my @sites; |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# indels |
961
|
24
|
|
|
|
|
83
|
for my $cur_indel ( $indel_set->spans ) { |
962
|
46
|
|
|
|
|
393
|
my ( $indel_start, $indel_end ) = @{$cur_indel}; |
|
46
|
|
|
|
|
86
|
|
963
|
46
|
|
|
|
|
67
|
my $indel_length = $indel_end - $indel_start + 1; |
964
|
|
|
|
|
|
|
|
965
|
46
|
|
|
|
|
63
|
my @indel_seqs; |
966
|
46
|
|
|
|
|
56
|
for my $seq ( @{$seq_refs} ) { |
|
46
|
|
|
|
|
84
|
|
967
|
170
|
|
|
|
|
305
|
push @indel_seqs, ( substr $seq, $indel_start - 1, $indel_length ); |
968
|
|
|
|
|
|
|
} |
969
|
46
|
|
|
|
|
102
|
my $indel_all_seqs = join "|", @indel_seqs; |
970
|
|
|
|
|
|
|
|
971
|
46
|
|
|
|
|
61
|
my $indel_type; |
972
|
46
|
|
|
|
|
101
|
my @uniq_indel_seqs = uniq(@indel_seqs); |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# seqs with least '-' char wins |
975
|
104
|
|
|
|
|
181
|
my ($indel_seq) = map { $_->[0] } |
976
|
70
|
|
|
|
|
151
|
sort { $a->[1] <=> $b->[1] } |
977
|
46
|
|
|
|
|
84
|
map { [ $_, tr/-/-/ ] } @uniq_indel_seqs; |
|
104
|
|
|
|
|
398
|
|
978
|
|
|
|
|
|
|
|
979
|
46
|
50
|
|
|
|
218
|
if ( scalar @uniq_indel_seqs < 2 ) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
980
|
0
|
|
|
|
|
0
|
Carp::confess "no indel!\n"; |
981
|
0
|
|
|
|
|
0
|
next; |
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
elsif ( scalar @uniq_indel_seqs > 2 ) { |
984
|
12
|
|
|
|
|
19
|
$indel_type = 'C'; |
985
|
|
|
|
|
|
|
} |
986
|
|
|
|
|
|
|
elsif ( $indel_seq =~ /-/ ) { |
987
|
0
|
|
|
|
|
0
|
$indel_type = 'C'; |
988
|
|
|
|
|
|
|
} |
989
|
|
|
|
|
|
|
else { |
990
|
|
|
|
|
|
|
# 'D': means deletion relative to target/first seq |
991
|
|
|
|
|
|
|
# target is ---- |
992
|
|
|
|
|
|
|
# 'I': means insertion relative to target/first seq |
993
|
|
|
|
|
|
|
# target is AAAA |
994
|
34
|
100
|
|
|
|
70
|
if ( $indel_seqs[0] eq $indel_seq ) { |
995
|
24
|
|
|
|
|
36
|
$indel_type = 'I'; |
996
|
|
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
else { |
998
|
10
|
|
|
|
|
21
|
$indel_type = 'D'; |
999
|
|
|
|
|
|
|
} |
1000
|
|
|
|
|
|
|
} |
1001
|
|
|
|
|
|
|
|
1002
|
46
|
|
|
|
|
104
|
my $indel_freq = 0; |
1003
|
46
|
|
|
|
|
65
|
my $indel_occured; |
1004
|
46
|
100
|
|
|
|
87
|
if ( $indel_type eq 'C' ) { |
1005
|
12
|
|
|
|
|
18
|
$indel_freq = -1; |
1006
|
12
|
|
|
|
|
16
|
$indel_occured = 'unknown'; |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
else { |
1009
|
34
|
|
|
|
|
60
|
for (@indel_seqs) { |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
# same as target '1', not '0' |
1012
|
122
|
100
|
|
|
|
187
|
if ( $indel_seqs[0] eq $_ ) { |
1013
|
74
|
|
|
|
|
87
|
$indel_freq++; |
1014
|
74
|
|
|
|
|
139
|
$indel_occured .= '1'; |
1015
|
|
|
|
|
|
|
} |
1016
|
|
|
|
|
|
|
else { |
1017
|
48
|
|
|
|
|
70
|
$indel_occured .= '0'; |
1018
|
|
|
|
|
|
|
} |
1019
|
|
|
|
|
|
|
} |
1020
|
|
|
|
|
|
|
} |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
# here freq is the minor allele freq |
1023
|
46
|
|
|
|
|
109
|
$indel_freq = List::Util::min( $indel_freq, $seq_count - $indel_freq ); |
1024
|
|
|
|
|
|
|
|
1025
|
46
|
|
|
|
|
312
|
push @sites, |
1026
|
|
|
|
|
|
|
{ |
1027
|
|
|
|
|
|
|
indel_start => $indel_start, |
1028
|
|
|
|
|
|
|
indel_end => $indel_end, |
1029
|
|
|
|
|
|
|
indel_length => $indel_length, |
1030
|
|
|
|
|
|
|
indel_seq => $indel_seq, |
1031
|
|
|
|
|
|
|
indel_all_seqs => $indel_all_seqs, |
1032
|
|
|
|
|
|
|
indel_freq => $indel_freq, |
1033
|
|
|
|
|
|
|
indel_occured => $indel_occured, |
1034
|
|
|
|
|
|
|
indel_type => $indel_type, |
1035
|
|
|
|
|
|
|
}; |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
|
1038
|
24
|
|
|
|
|
301
|
return \@sites; |
1039
|
|
|
|
|
|
|
} |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
sub polarize_indel { |
1042
|
3
|
|
|
3
|
0
|
10
|
my $sites = shift; |
1043
|
3
|
|
|
|
|
5
|
my $outgroup_seq = shift; |
1044
|
|
|
|
|
|
|
|
1045
|
3
|
|
|
|
|
13
|
my $outgroup_indel_set = indel_intspan($outgroup_seq); |
1046
|
|
|
|
|
|
|
|
1047
|
3
|
|
|
|
|
8
|
for my $site ( @{$sites} ) { |
|
3
|
|
|
|
|
10
|
|
1048
|
5
|
|
|
|
|
18
|
my @indel_seqs = split /\|/, $site->{indel_all_seqs}; |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
my $indel_outgroup_seq = substr $outgroup_seq, $site->{indel_start} - 1, |
1051
|
5
|
|
|
|
|
14
|
$site->{indel_length}; |
1052
|
|
|
|
|
|
|
|
1053
|
5
|
|
|
|
|
10
|
my ( $indel_type, $indel_occured, $indel_freq ); |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
my $indel_set |
1056
|
5
|
|
|
|
|
14
|
= AlignDB::IntSpan->new()->add_pair( $site->{indel_start}, $site->{indel_end} ); |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
# this line is different to previous subroutines |
1059
|
5
|
|
|
|
|
288
|
my @uniq_indel_seqs = uniq( @indel_seqs, $indel_outgroup_seq ); |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
# seqs with least '-' char wins |
1062
|
11
|
|
|
|
|
22
|
my ($indel_seq) = map { $_->[0] } |
1063
|
7
|
|
|
|
|
18
|
sort { $a->[1] <=> $b->[1] } |
1064
|
5
|
|
|
|
|
13
|
map { [ $_, tr/-/-/ ] } @uniq_indel_seqs; |
|
11
|
|
|
|
|
33
|
|
1065
|
|
|
|
|
|
|
|
1066
|
5
|
50
|
|
|
|
43
|
if ( scalar @uniq_indel_seqs < 2 ) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1067
|
0
|
|
|
|
|
0
|
Carp::confess "no indel!\n"; |
1068
|
|
|
|
|
|
|
} |
1069
|
|
|
|
|
|
|
elsif ( scalar @uniq_indel_seqs > 2 ) { |
1070
|
1
|
|
|
|
|
2
|
$indel_type = 'C'; |
1071
|
|
|
|
|
|
|
} |
1072
|
|
|
|
|
|
|
elsif ( $indel_seq =~ /-/ ) { |
1073
|
0
|
|
|
|
|
0
|
$indel_type = 'C'; |
1074
|
|
|
|
|
|
|
} |
1075
|
|
|
|
|
|
|
else { |
1076
|
|
|
|
|
|
|
|
1077
|
4
|
50
|
66
|
|
|
31
|
if ( ( $indel_outgroup_seq !~ /\-/ ) |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
and ( $indel_seq ne $indel_outgroup_seq ) ) |
1079
|
|
|
|
|
|
|
{ |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
# this section should already be judged in previes |
1082
|
|
|
|
|
|
|
# uniq_indel_seqs section, but I keep it here for safe |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
# reference indel content does not contain '-' and is not equal |
1085
|
|
|
|
|
|
|
# to the one of alignment |
1086
|
|
|
|
|
|
|
# AAA |
1087
|
|
|
|
|
|
|
# A-A |
1088
|
|
|
|
|
|
|
# ref ACA |
1089
|
0
|
|
|
|
|
0
|
$indel_type = 'C'; |
1090
|
|
|
|
|
|
|
} |
1091
|
|
|
|
|
|
|
elsif ( $outgroup_indel_set->intersect($indel_set)->is_not_empty ) { |
1092
|
3
|
|
|
|
|
1021
|
my $island = $outgroup_indel_set->find_islands($indel_set); |
1093
|
3
|
100
|
|
|
|
5550
|
if ( $island->equal($indel_set) ) { |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
# NNNN |
1096
|
|
|
|
|
|
|
# N--N |
1097
|
|
|
|
|
|
|
# ref N--N |
1098
|
2
|
|
|
|
|
122
|
$indel_type = 'I'; |
1099
|
|
|
|
|
|
|
} |
1100
|
|
|
|
|
|
|
else { |
1101
|
|
|
|
|
|
|
# reference indel island is larger or smaller |
1102
|
|
|
|
|
|
|
# NNNN |
1103
|
|
|
|
|
|
|
# N-NN |
1104
|
|
|
|
|
|
|
# ref N--N |
1105
|
|
|
|
|
|
|
# or |
1106
|
|
|
|
|
|
|
# NNNN |
1107
|
|
|
|
|
|
|
# N--N |
1108
|
|
|
|
|
|
|
# ref N-NN |
1109
|
1
|
|
|
|
|
70
|
$indel_type = 'C'; |
1110
|
|
|
|
|
|
|
} |
1111
|
|
|
|
|
|
|
} |
1112
|
|
|
|
|
|
|
elsif ( $outgroup_indel_set->intersect($indel_set)->is_empty ) { |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
# NNNN |
1115
|
|
|
|
|
|
|
# N--N |
1116
|
|
|
|
|
|
|
# ref NNNN |
1117
|
1
|
|
|
|
|
722
|
$indel_type = 'D'; |
1118
|
|
|
|
|
|
|
} |
1119
|
|
|
|
|
|
|
else { |
1120
|
0
|
|
|
|
|
0
|
Carp::confess "Errors when polarizing indel!\n"; |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
} |
1123
|
|
|
|
|
|
|
|
1124
|
5
|
100
|
|
|
|
17
|
if ( $indel_type eq 'C' ) { |
1125
|
2
|
|
|
|
|
4
|
$indel_occured = 'unknown'; |
1126
|
2
|
|
|
|
|
4
|
$indel_freq = -1; |
1127
|
|
|
|
|
|
|
} |
1128
|
|
|
|
|
|
|
else { |
1129
|
3
|
|
|
|
|
10
|
for my $seq (@indel_seqs) { |
1130
|
8
|
100
|
|
|
|
15
|
if ( $seq eq $indel_outgroup_seq ) { |
1131
|
3
|
|
|
|
|
7
|
$indel_occured .= '0'; |
1132
|
|
|
|
|
|
|
} |
1133
|
|
|
|
|
|
|
else { |
1134
|
5
|
|
|
|
|
9
|
$indel_occured .= '1'; |
1135
|
5
|
|
|
|
|
9
|
$indel_freq++; |
1136
|
|
|
|
|
|
|
} |
1137
|
|
|
|
|
|
|
} |
1138
|
|
|
|
|
|
|
} |
1139
|
|
|
|
|
|
|
|
1140
|
5
|
|
|
|
|
10
|
$site->{indel_outgroup_seq} = $indel_outgroup_seq; |
1141
|
5
|
|
|
|
|
9
|
$site->{indel_type} = $indel_type; |
1142
|
5
|
|
|
|
|
8
|
$site->{indel_occured} = $indel_occured; |
1143
|
5
|
|
|
|
|
16
|
$site->{indel_freq} = $indel_freq; |
1144
|
|
|
|
|
|
|
} |
1145
|
|
|
|
|
|
|
|
1146
|
3
|
|
|
|
|
12
|
return $sites; |
1147
|
|
|
|
|
|
|
} |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
sub chr_to_align { |
1150
|
8
|
|
|
8
|
1
|
80
|
my AlignDB::IntSpan $intspan = shift; |
1151
|
8
|
|
|
|
|
13
|
my $pos = shift; |
1152
|
|
|
|
|
|
|
|
1153
|
8
|
|
50
|
|
|
22
|
my $chr_start = shift || 1; |
1154
|
8
|
|
50
|
|
|
19
|
my $chr_strand = shift || "+"; |
1155
|
|
|
|
|
|
|
|
1156
|
8
|
|
|
|
|
24
|
my $chr_end = $chr_start + $intspan->size - 1; |
1157
|
|
|
|
|
|
|
|
1158
|
8
|
50
|
33
|
|
|
601
|
if ( $pos < $chr_start || $pos > $chr_end ) { |
1159
|
0
|
|
|
|
|
0
|
Carp::confess "[$pos] out of ranges [$chr_start,$chr_end]\n"; |
1160
|
|
|
|
|
|
|
} |
1161
|
|
|
|
|
|
|
|
1162
|
8
|
|
|
|
|
15
|
my $align_pos; |
1163
|
8
|
100
|
|
|
|
21
|
if ( $chr_strand eq "+" ) { |
1164
|
5
|
|
|
|
|
9
|
$align_pos = $pos - $chr_start + 1; |
1165
|
5
|
|
|
|
|
17
|
$align_pos = $intspan->at($align_pos); |
1166
|
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
else { |
1168
|
3
|
|
|
|
|
6
|
$align_pos = $pos - $chr_start + 1; |
1169
|
3
|
|
|
|
|
8
|
$align_pos = $intspan->at( -$align_pos ); |
1170
|
|
|
|
|
|
|
} |
1171
|
|
|
|
|
|
|
|
1172
|
8
|
|
|
|
|
815
|
return $align_pos; |
1173
|
|
|
|
|
|
|
} |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
sub align_to_chr { |
1176
|
734
|
|
|
734
|
1
|
7807
|
my AlignDB::IntSpan $intspan = shift; |
1177
|
734
|
|
|
|
|
1035
|
my $pos = shift; |
1178
|
|
|
|
|
|
|
|
1179
|
734
|
|
50
|
|
|
1462
|
my $chr_start = shift || 1; |
1180
|
734
|
|
50
|
|
|
4607
|
my $chr_strand = shift || "+"; |
1181
|
|
|
|
|
|
|
|
1182
|
734
|
|
|
|
|
4756
|
my $chr_end = $chr_start + $intspan->size - 1; |
1183
|
|
|
|
|
|
|
|
1184
|
734
|
50
|
|
|
|
97677
|
if ( $pos < 1 ) { |
1185
|
0
|
|
|
|
|
0
|
Carp::confess "align pos out of ranges\n"; |
1186
|
|
|
|
|
|
|
} |
1187
|
|
|
|
|
|
|
|
1188
|
734
|
|
|
|
|
1036
|
my $chr_pos; |
1189
|
734
|
100
|
|
|
|
1591
|
if ( $intspan->contains($pos) ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1190
|
728
|
|
|
|
|
35145
|
$chr_pos = $intspan->index($pos); |
1191
|
|
|
|
|
|
|
} |
1192
|
|
|
|
|
|
|
elsif ( $pos < $intspan->min ) { |
1193
|
2
|
|
|
|
|
117
|
$chr_pos = 1; |
1194
|
|
|
|
|
|
|
} |
1195
|
|
|
|
|
|
|
elsif ( $pos > $intspan->max ) { |
1196
|
2
|
|
|
|
|
149
|
$chr_pos = $intspan->size; |
1197
|
|
|
|
|
|
|
} |
1198
|
|
|
|
|
|
|
else { |
1199
|
|
|
|
|
|
|
# pin to left base |
1200
|
2
|
|
|
|
|
147
|
my @spans = $intspan->spans; |
1201
|
2
|
|
|
|
|
70
|
for my $i ( 0 .. $#spans ) { |
1202
|
4
|
100
|
|
|
|
9
|
if ( $spans[$i]->[1] < $pos ) { |
1203
|
2
|
|
|
|
|
5
|
next; |
1204
|
|
|
|
|
|
|
} |
1205
|
|
|
|
|
|
|
else { |
1206
|
2
|
|
|
|
|
5
|
$pos = $spans[ $i - 1 ]->[1]; |
1207
|
2
|
|
|
|
|
4
|
last; |
1208
|
|
|
|
|
|
|
} |
1209
|
|
|
|
|
|
|
} |
1210
|
2
|
|
|
|
|
6
|
$chr_pos = $intspan->index($pos); |
1211
|
|
|
|
|
|
|
} |
1212
|
|
|
|
|
|
|
|
1213
|
734
|
100
|
|
|
|
81493
|
if ( $chr_strand eq "+" ) { |
1214
|
727
|
|
|
|
|
1136
|
$chr_pos = $chr_pos + $chr_start - 1; |
1215
|
|
|
|
|
|
|
} |
1216
|
|
|
|
|
|
|
else { |
1217
|
7
|
|
|
|
|
12
|
$chr_pos = $chr_end - $chr_pos + 1; |
1218
|
|
|
|
|
|
|
} |
1219
|
|
|
|
|
|
|
|
1220
|
734
|
|
|
|
|
1432
|
return $chr_pos; |
1221
|
|
|
|
|
|
|
} |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
sub calc_ld { |
1224
|
9
|
|
|
9
|
1
|
5247
|
my $strA = shift; |
1225
|
9
|
|
|
|
|
15
|
my $strB = shift; |
1226
|
|
|
|
|
|
|
|
1227
|
9
|
|
|
|
|
14
|
my $size = length $strA; |
1228
|
|
|
|
|
|
|
|
1229
|
9
|
50
|
|
|
|
24
|
if ( $size != length $strB ) { |
1230
|
0
|
|
|
|
|
0
|
Carp::confess "Lengths not equal for [$strA] and [$strA]\n"; |
1231
|
0
|
|
|
|
|
0
|
return; |
1232
|
|
|
|
|
|
|
} |
1233
|
|
|
|
|
|
|
|
1234
|
9
|
|
|
|
|
17
|
for ( $strA, $strB ) { |
1235
|
18
|
50
|
|
|
|
53
|
if (/[^10]/) { |
1236
|
0
|
|
|
|
|
0
|
Carp::confess "[$_] contains illegal chars\n"; |
1237
|
0
|
|
|
|
|
0
|
return; |
1238
|
|
|
|
|
|
|
} |
1239
|
|
|
|
|
|
|
} |
1240
|
|
|
|
|
|
|
|
1241
|
9
|
|
|
|
|
19
|
my $A_count = $strA =~ tr/1/1/; |
1242
|
9
|
|
|
|
|
18
|
my $fA = $A_count / $size; |
1243
|
9
|
|
|
|
|
15
|
my $fa = 1 - $fA; |
1244
|
|
|
|
|
|
|
|
1245
|
9
|
|
|
|
|
13
|
my $B_count = $strB =~ tr/1/1/; |
1246
|
9
|
|
|
|
|
16
|
my $fB = $B_count / $size; |
1247
|
9
|
|
|
|
|
13
|
my $fb = 1 - $fB; |
1248
|
|
|
|
|
|
|
|
1249
|
9
|
50
|
|
36
|
|
45
|
if ( any { $_ == 0 } ( $fA, $fa, $fB, $fb ) ) { |
|
36
|
|
|
|
|
89
|
|
1250
|
0
|
|
|
|
|
0
|
return ( undef, undef ); |
1251
|
|
|
|
|
|
|
} |
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
# '1' in strA and '1' in strB as fAB |
1254
|
9
|
|
|
|
|
29
|
my ( $AB_count, $fAB ) = ( 0, 0 ); |
1255
|
9
|
|
|
|
|
20
|
for my $i ( 1 .. $size ) { |
1256
|
70
|
|
|
|
|
107
|
my $ichar = substr $strA, $i - 1, 1; |
1257
|
70
|
|
|
|
|
96
|
my $schar = substr $strB, $i - 1, 1; |
1258
|
70
|
100
|
100
|
|
|
175
|
if ( $ichar eq '1' and $schar eq '1' ) { |
1259
|
23
|
|
|
|
|
40
|
$AB_count++; |
1260
|
|
|
|
|
|
|
} |
1261
|
|
|
|
|
|
|
} |
1262
|
9
|
|
|
|
|
15
|
$fAB = $AB_count / $size; |
1263
|
|
|
|
|
|
|
|
1264
|
9
|
|
|
|
|
18
|
my $DAB = $fAB - $fA * $fB; |
1265
|
|
|
|
|
|
|
|
1266
|
9
|
|
|
|
|
14
|
my ( $r, $dprime ); |
1267
|
9
|
|
|
|
|
16
|
$r = $DAB / sqrt( $fA * $fa * $fB * $fb ); |
1268
|
|
|
|
|
|
|
|
1269
|
9
|
100
|
|
|
|
20
|
if ( $DAB < 0 ) { |
1270
|
1
|
|
|
|
|
5
|
$dprime = $DAB / List::Util::min( $fA * $fB, $fa * $fb ); |
1271
|
|
|
|
|
|
|
} |
1272
|
|
|
|
|
|
|
else { |
1273
|
8
|
|
|
|
|
27
|
$dprime = $DAB / List::Util::min( $fA * $fb, $fa * $fB ); |
1274
|
|
|
|
|
|
|
} |
1275
|
|
|
|
|
|
|
|
1276
|
9
|
|
|
|
|
29
|
return ( $r, $dprime ); |
1277
|
|
|
|
|
|
|
} |
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
sub poa_consensus { |
1280
|
0
|
|
|
0
|
0
|
|
my $seq_refs = shift; |
1281
|
|
|
|
|
|
|
|
1282
|
0
|
|
|
|
|
|
my $aln_prog = "poa"; |
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
# temp in and out |
1285
|
0
|
|
|
|
|
|
my $temp_in = Path::Tiny->tempfile("seq_in_XXXXXXXX"); |
1286
|
0
|
|
|
|
|
|
my $temp_out = Path::Tiny->tempfile("seq_out_XXXXXXXX"); |
1287
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
# msa may change the order of sequences |
1289
|
0
|
|
|
|
|
|
my @indexes = 0 .. scalar( @{$seq_refs} - 1 ); |
|
0
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
{ |
1291
|
0
|
|
|
|
|
|
my $fh = $temp_in->openw(); |
|
0
|
|
|
|
|
|
|
1292
|
0
|
|
|
|
|
|
for my $i (@indexes) { |
1293
|
0
|
|
|
|
|
|
printf {$fh} ">seq_%d\n", $i; |
|
0
|
|
|
|
|
|
|
1294
|
0
|
|
|
|
|
|
printf {$fh} "%s\n", $seq_refs->[$i]; |
|
0
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
} |
1296
|
0
|
|
|
|
|
|
close $fh; |
1297
|
|
|
|
|
|
|
} |
1298
|
|
|
|
|
|
|
|
1299
|
0
|
|
|
|
|
|
my @args; |
1300
|
|
|
|
|
|
|
{ |
1301
|
0
|
|
|
|
|
|
push @args, "-hb"; |
|
0
|
|
|
|
|
|
|
1302
|
0
|
|
|
|
|
|
push @args, "-read_fasta " . $temp_in->absolute->stringify; |
1303
|
0
|
|
|
|
|
|
push @args, "-clustal " . $temp_out->absolute->stringify; |
1304
|
0
|
|
|
|
|
|
push @args, File::ShareDir::dist_file( 'App-Fasops', 'poa-blosum80.mat' ); |
1305
|
|
|
|
|
|
|
} |
1306
|
|
|
|
|
|
|
|
1307
|
0
|
|
|
|
|
|
my $cmd_line = join " ", ( $aln_prog, @args ); |
1308
|
0
|
|
|
|
|
|
my $ok = IPC::Cmd::run( command => $cmd_line ); |
1309
|
|
|
|
|
|
|
|
1310
|
0
|
0
|
|
|
|
|
if ( !$ok ) { |
1311
|
0
|
|
|
|
|
|
Carp::confess("Calling [$aln_prog] failed\n"); |
1312
|
|
|
|
|
|
|
} |
1313
|
|
|
|
|
|
|
|
1314
|
0
|
|
|
|
|
|
my $consensus = join "", grep {/^CONSENS0/} $temp_out->lines( { chomp => 1, } ); |
|
0
|
|
|
|
|
|
|
1315
|
0
|
|
|
|
|
|
$consensus =~ s/CONSENS0//g; |
1316
|
0
|
|
|
|
|
|
$consensus =~ s/\s//g; |
1317
|
0
|
|
|
|
|
|
$consensus =~ s/-//g; |
1318
|
|
|
|
|
|
|
|
1319
|
0
|
|
|
|
|
|
return $consensus; |
1320
|
|
|
|
|
|
|
} |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
1; |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
__END__ |