line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Gonzales::Seq::Util; |
2
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
122340
|
use warnings; |
|
5
|
|
|
|
|
20
|
|
|
5
|
|
|
|
|
196
|
|
4
|
5
|
|
|
5
|
|
28
|
use strict; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
95
|
|
5
|
5
|
|
|
5
|
|
27
|
use Carp; |
|
5
|
|
|
|
|
7
|
|
|
5
|
|
|
|
|
309
|
|
6
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
31
|
use Scalar::Util qw/blessed/; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
249
|
|
8
|
5
|
|
|
5
|
|
50
|
use Data::Dumper; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
244
|
|
9
|
5
|
|
|
5
|
|
4098
|
use Bio::Gonzales::Matrix::IO; |
|
5
|
|
|
|
|
40
|
|
|
5
|
|
|
|
|
496
|
|
10
|
5
|
|
|
5
|
|
54
|
use File::Which qw/which/; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
263
|
|
11
|
5
|
|
|
5
|
|
2495
|
use Bio::Gonzales::Seq::IO; |
|
5
|
|
|
|
|
17
|
|
|
5
|
|
|
|
|
429
|
|
12
|
5
|
|
|
5
|
|
37
|
use Bio::Gonzales::Seq; |
|
5
|
|
|
|
|
73
|
|
|
5
|
|
|
|
|
110
|
|
13
|
|
|
|
|
|
|
|
14
|
5
|
|
|
5
|
|
72
|
use base 'Exporter'; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
10336
|
|
15
|
|
|
|
|
|
|
our ( @EXPORT, @EXPORT_OK, %EXPORT_TAGS ); |
16
|
|
|
|
|
|
|
our $VERSION = '0.083'; # VERSION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
@EXPORT = qw(); |
19
|
|
|
|
|
|
|
%EXPORT_TAGS = (); |
20
|
|
|
|
|
|
|
@EXPORT_OK = qw( |
21
|
|
|
|
|
|
|
pairwise_identity_l |
22
|
|
|
|
|
|
|
pairwise_identity_s |
23
|
|
|
|
|
|
|
pairwise_identity_gaps_l |
24
|
|
|
|
|
|
|
pairwise_identity_gaps_s |
25
|
|
|
|
|
|
|
pairwise_identities |
26
|
|
|
|
|
|
|
map_seqids |
27
|
|
|
|
|
|
|
seqid_mapper |
28
|
|
|
|
|
|
|
crc64 |
29
|
|
|
|
|
|
|
strand_convert |
30
|
|
|
|
|
|
|
seq_lengths |
31
|
|
|
|
|
|
|
seq_apply |
32
|
|
|
|
|
|
|
revcom_seq_string |
33
|
|
|
|
|
|
|
); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our %STRAND_CHAR_TABLE = ( |
36
|
|
|
|
|
|
|
'+' => 1, |
37
|
|
|
|
|
|
|
'-' => -1, |
38
|
|
|
|
|
|
|
'.' => 0, |
39
|
|
|
|
|
|
|
-1 => '-', |
40
|
|
|
|
|
|
|
1 => '+', |
41
|
|
|
|
|
|
|
0 => '.', |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
our $BLASTDB_CMD = which('blastdbcmd'); |
45
|
|
|
|
|
|
|
our $SAMTOOLS_CMD = which('samtools'); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub strand_convert { |
48
|
142
|
100
|
33
|
142
|
0
|
780
|
if ( @_ && @_ > 0 && $_[-1] && exists( $STRAND_CHAR_TABLE{ $_[-1] } ) ) { |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
49
|
1
|
|
|
|
|
6
|
return $STRAND_CHAR_TABLE{ $_[-1] }; |
50
|
|
|
|
|
|
|
} else { |
51
|
141
|
|
|
|
|
385
|
return '.'; |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub seq_lengths { |
56
|
0
|
|
|
0
|
0
|
0
|
my $f = shift; |
57
|
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
0
|
my $d; |
59
|
0
|
0
|
|
|
|
0
|
if ( -f $f . ".fai" ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
60
|
0
|
|
|
|
|
0
|
$d = mslurp( $f . ".fai", { header => undef } ); |
61
|
|
|
|
|
|
|
} elsif ( -f $f . ".nhr" ) { |
62
|
0
|
0
|
|
|
|
0
|
open my $fh, '-|', $BLASTDB_CMD, '-db', $f, '-entry', 'all', '-outfmt', "%a\t%l" |
63
|
|
|
|
|
|
|
or die "Can't open filehandle: $!"; |
64
|
0
|
|
|
|
|
0
|
$d = mslurp( $fh, { header => undef } ); |
65
|
0
|
|
|
|
|
0
|
close $fh; |
66
|
|
|
|
|
|
|
} elsif ($SAMTOOLS_CMD) { |
67
|
0
|
0
|
|
|
|
0
|
system( 'samtools', 'faidx', $f ) == 0 or die "system failed: $?"; |
68
|
0
|
0
|
|
|
|
0
|
$d = mslurp( $f . ".fai", { header => undef } ) if ( -f $f . ".fai" ); |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# nothing worked so far |
72
|
0
|
0
|
|
|
|
0
|
unless ($d) { |
73
|
0
|
|
|
|
|
0
|
say STDERR "could not use samtools or blast indices, using std method"; |
74
|
0
|
|
|
|
|
0
|
my @lengths; |
75
|
0
|
|
|
|
|
0
|
my $fit = faiterate($f); |
76
|
0
|
|
|
|
|
0
|
while ( my $seq = $fit->() ) { |
77
|
0
|
|
|
|
|
0
|
push @lengths, [ $seq->id, $seq->length ]; |
78
|
|
|
|
|
|
|
} |
79
|
0
|
|
|
|
|
0
|
$d = \@lengths; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
0
|
|
|
|
|
0
|
my %sl; |
83
|
0
|
|
|
|
|
0
|
for my $r (@$d) { |
84
|
0
|
0
|
|
|
|
0
|
die "double ID" if ( $sl{ $r->[0] } ); |
85
|
0
|
|
|
|
|
0
|
$sl{ $r->[0] } = $r->[1]; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
0
|
return \%sl; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
sub revcom_seq_string { |
92
|
0
|
|
|
0
|
0
|
0
|
return Bio::Gonzales::Seq::_revcom_from_string(@_); |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub seq_apply { |
96
|
0
|
|
|
0
|
0
|
0
|
my ( $f, $sub, $pattern ) = @_; |
97
|
|
|
|
|
|
|
|
98
|
0
|
0
|
0
|
|
|
0
|
die "file or sub ref not correct" unless ( ref $sub eq 'CODE' && -f $f ); |
99
|
0
|
|
|
|
|
0
|
my $seq_lengths = seq_lengths($f); |
100
|
0
|
|
|
|
|
0
|
my @res; |
101
|
0
|
|
|
|
|
0
|
my $num = keys %$seq_lengths; |
102
|
0
|
|
|
|
|
0
|
for my $sid ( keys %$seq_lengths ) { |
103
|
0
|
|
|
|
|
0
|
my $spat; |
104
|
0
|
0
|
|
|
|
0
|
if ($pattern) { |
105
|
0
|
|
|
|
|
0
|
$spat = $pattern; |
106
|
0
|
|
|
|
|
0
|
$spat =~ s/\{id\}/$sid/g; |
107
|
0
|
|
|
|
|
0
|
$spat =~ s/\{begin\}/1/g; |
108
|
0
|
|
|
|
|
0
|
$spat =~ s/\{end\}/$seq_lengths->{$sid}/g; |
109
|
|
|
|
|
|
|
} |
110
|
0
|
|
|
|
|
0
|
push @res, $sub->( { id => $sid, begin => 1, end => $seq_lengths->{$sid}, num => $num }, $spat ); |
111
|
|
|
|
|
|
|
} |
112
|
0
|
|
|
|
|
0
|
return \@res; |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub pairwise_identity_l { |
116
|
0
|
|
|
0
|
0
|
0
|
my ( $seq1, $seq2 ) = @_; |
117
|
0
|
|
|
|
|
0
|
return _pairwise_identity_generic( $seq1, $seq2, 1, 0 ); |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub pairwise_identity_s { |
121
|
0
|
|
|
0
|
0
|
0
|
my ( $seq1, $seq2 ) = @_; |
122
|
0
|
|
|
|
|
0
|
return _pairwise_identity_generic( $seq1, $seq2, 0, 0 ); |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
sub pairwise_identity_gaps_l { |
126
|
0
|
|
|
0
|
0
|
0
|
my ( $seq1, $seq2 ) = @_; |
127
|
0
|
|
|
|
|
0
|
return _pairwise_identity_generic( $seq1, $seq2, 1, 1 ); |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub _pairwise_identity_generic { |
131
|
0
|
|
|
0
|
|
0
|
my ( $seq1, $seq2, $use_longest, $include_gaps ) = @_; |
132
|
|
|
|
|
|
|
|
133
|
0
|
0
|
|
|
|
0
|
$seq1 = $seq1->seq if ( blessed $seq1); |
134
|
0
|
0
|
|
|
|
0
|
$seq2 = $seq2->seq if ( blessed $seq2); |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
0
|
my $seq1_gaps = 0; |
137
|
0
|
|
|
|
|
0
|
my $seq2_gaps = 0; |
138
|
0
|
0
|
|
|
|
0
|
if ( !$include_gaps ) { |
139
|
0
|
|
|
|
|
0
|
$seq1_gaps = $seq1 =~ y/-/./; |
140
|
0
|
|
|
|
|
0
|
$seq2_gaps = () = $seq2 =~ /-/g; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
0
|
my $mask = $seq1 ^ $seq2; |
144
|
0
|
|
|
|
|
0
|
my $matches = $mask =~ tr/\x0/\x0/; |
145
|
|
|
|
|
|
|
|
146
|
0
|
|
|
|
|
0
|
my $longest; |
147
|
|
|
|
|
|
|
my $shortest; |
148
|
0
|
0
|
|
|
|
0
|
if ( length($seq2) - $seq2_gaps < length($seq1) - $seq1_gaps ) { |
149
|
0
|
|
|
|
|
0
|
$longest = length($seq1) - $seq1_gaps; |
150
|
0
|
|
|
|
|
0
|
$shortest = length($seq2) - $seq2_gaps; |
151
|
|
|
|
|
|
|
} else { |
152
|
0
|
|
|
|
|
0
|
$shortest = length($seq1) - $seq1_gaps; |
153
|
0
|
|
|
|
|
0
|
$longest = length($seq2) - $seq2_gaps; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
0
|
0
|
|
|
|
0
|
if ($use_longest) { |
157
|
0
|
|
|
|
|
0
|
return ( $matches / $longest ); |
158
|
|
|
|
|
|
|
} else { |
159
|
0
|
|
|
|
|
0
|
return ( $matches / $shortest ); |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub pairwise_identity_gaps_s { |
164
|
0
|
|
|
0
|
0
|
0
|
my ( $seq1, $seq2 ) = @_; |
165
|
|
|
|
|
|
|
|
166
|
0
|
|
|
|
|
0
|
return _pairwise_identity_generic( $seq1, $seq2, 0, 1 ); |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
sub pairwise_identities { |
170
|
0
|
|
|
0
|
0
|
0
|
my ( $sub, @seqs ) = @_; |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
#creating an upper triangular matrix |
173
|
|
|
|
|
|
|
|
174
|
0
|
|
|
|
|
0
|
my @dist; |
175
|
0
|
|
|
|
|
0
|
for ( my $i = 0; $i < @seqs; $i++ ) { |
176
|
0
|
|
|
|
|
0
|
push @dist, []; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
0
|
|
|
|
|
0
|
my $i; |
180
|
0
|
|
|
|
|
0
|
for ( $i = 0; $i < @seqs - 1; $i++ ) { |
181
|
0
|
|
|
|
|
0
|
$dist[$i][$i] = 1; |
182
|
0
|
|
|
|
|
0
|
for ( my $j = $i + 1; $j < @seqs; $j++ ) { |
183
|
0
|
|
|
|
|
0
|
$dist[$j][$i] = $sub->( $seqs[$j], $seqs[$i] ); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
} |
186
|
0
|
|
|
|
|
0
|
$dist[$i][$i] = 1; |
187
|
|
|
|
|
|
|
|
188
|
0
|
|
|
|
|
0
|
return \@dist; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
sub map_seqids { |
192
|
0
|
|
|
0
|
1
|
0
|
my ( $seqs, $pattern ) = @_; |
193
|
|
|
|
|
|
|
|
194
|
0
|
|
|
|
|
0
|
my %map; |
195
|
|
|
|
|
|
|
|
196
|
0
|
|
|
|
|
0
|
my $i = seqid_mapper($pattern); |
197
|
|
|
|
|
|
|
|
198
|
0
|
|
|
|
|
0
|
for my $s (@$seqs) { |
199
|
0
|
|
|
|
|
0
|
my ( $new, $old ) = $i->($s); |
200
|
0
|
|
|
|
|
0
|
$map{$new} = $old; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
0
|
|
|
|
|
0
|
return \%map; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub seqid_mapper { |
207
|
0
|
|
|
0
|
1
|
0
|
my ( $pattern, @extra_args ) = @_; |
208
|
0
|
0
|
|
|
|
0
|
$pattern = 's%08d' unless defined $pattern; |
209
|
|
|
|
|
|
|
|
210
|
0
|
|
|
|
|
0
|
my $handler; |
211
|
0
|
0
|
|
|
|
0
|
if ( ref $pattern eq 'CODE' ) { |
|
|
0
|
|
|
|
|
|
212
|
0
|
|
|
|
|
0
|
$handler = $pattern; |
213
|
|
|
|
|
|
|
} elsif ( ref $pattern eq 'HASH' ) { |
214
|
0
|
|
|
|
|
0
|
my $i = 1; |
215
|
|
|
|
|
|
|
$handler = sub { |
216
|
0
|
|
|
0
|
|
0
|
my $id = shift; |
217
|
0
|
0
|
|
|
|
0
|
return defined $pattern->{$id} ? $pattern->{$id} : 'unknown_' . $i++; |
218
|
0
|
|
|
|
|
0
|
}; |
219
|
|
|
|
|
|
|
} else { |
220
|
0
|
|
|
|
|
0
|
my $i = 1; |
221
|
|
|
|
|
|
|
$handler = sub { |
222
|
0
|
|
|
0
|
|
0
|
return sprintf $pattern, $i++; |
223
|
|
|
|
|
|
|
|
224
|
0
|
|
|
|
|
0
|
}; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
return sub { |
228
|
0
|
|
|
0
|
|
0
|
my ($seq) = @_; |
229
|
0
|
0
|
|
|
|
0
|
return unless ($seq); |
230
|
0
|
0
|
|
|
|
0
|
unless ( blessed($seq) ) { |
231
|
0
|
0
|
|
|
|
0
|
if ( ref $seq eq 'ARRAY' ) { |
232
|
0
|
|
|
|
|
0
|
croak "you supplied an array to the mapper function, use single seq Bio::Gonzales::Seq objects"; |
233
|
|
|
|
|
|
|
} else { |
234
|
0
|
|
|
|
|
0
|
confess Dumper $seq ; |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
0
|
|
|
|
|
0
|
my $orig_id = $seq->id; |
239
|
0
|
|
|
|
|
0
|
my $id = $handler->( $orig_id, @extra_args ); |
240
|
0
|
|
|
|
|
0
|
$seq->id($id); |
241
|
|
|
|
|
|
|
|
242
|
0
|
|
|
|
|
0
|
return ( $id, $orig_id ); |
243
|
0
|
|
|
|
|
0
|
}; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
{ |
247
|
|
|
|
|
|
|
my $POLY64REVh = 0xd8000000; |
248
|
|
|
|
|
|
|
my @CRCTableh = 256; |
249
|
|
|
|
|
|
|
my @CRCTablel = 256; |
250
|
|
|
|
|
|
|
my $initialized; |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
sub crc64 { |
253
|
25
|
|
|
25
|
0
|
35
|
my $sequence = shift; |
254
|
25
|
|
|
|
|
43
|
my $crcl = 0; |
255
|
25
|
|
|
|
|
32
|
my $crch = 0; |
256
|
25
|
100
|
|
|
|
53
|
if ( !$initialized ) { |
257
|
1
|
|
|
|
|
2
|
$initialized = 1; |
258
|
1
|
|
|
|
|
4
|
for ( my $i = 0; $i < 256; $i++ ) { |
259
|
256
|
|
|
|
|
305
|
my $partl = $i; |
260
|
256
|
|
|
|
|
301
|
my $parth = 0; |
261
|
256
|
|
|
|
|
432
|
for ( my $j = 0; $j < 8; $j++ ) { |
262
|
2048
|
|
|
|
|
2510
|
my $rflag = $partl & 1; |
263
|
2048
|
|
|
|
|
2428
|
$partl >>= 1; |
264
|
2048
|
50
|
|
|
|
3211
|
$partl |= ( 1 << 31 ) if $parth & 1; |
265
|
2048
|
|
|
|
|
2432
|
$parth >>= 1; |
266
|
2048
|
100
|
|
|
|
4098
|
$parth ^= $POLY64REVh if $rflag; |
267
|
|
|
|
|
|
|
} |
268
|
256
|
|
|
|
|
346
|
$CRCTableh[$i] = $parth; |
269
|
256
|
|
|
|
|
446
|
$CRCTablel[$i] = $partl; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
25
|
|
|
|
|
1021
|
foreach ( split '', $sequence ) { |
274
|
4581
|
|
|
|
|
6364
|
my $shr = ( $crch & 0xFF ) << 24; |
275
|
4581
|
|
|
|
|
5502
|
my $temp1h = $crch >> 8; |
276
|
4581
|
|
|
|
|
5583
|
my $temp1l = ( $crcl >> 8 ) | $shr; |
277
|
4581
|
|
|
|
|
6582
|
my $tableindex = ( $crcl ^ ( unpack "C", $_ ) ) & 0xFF; |
278
|
4581
|
|
|
|
|
6057
|
$crch = $temp1h ^ $CRCTableh[$tableindex]; |
279
|
4581
|
|
|
|
|
6550
|
$crcl = $temp1l ^ $CRCTablel[$tableindex]; |
280
|
|
|
|
|
|
|
} |
281
|
25
|
50
|
|
|
|
400
|
return wantarray ? ( $crch, $crcl ) : sprintf( "%08X%08X", $crch, $crcl ); |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
1; |
286
|
|
|
|
|
|
|
__END__ |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=head1 NAME |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head1 SYNOPSIS |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
use Bio::Gonzales::Seq::Util qw( |
295
|
|
|
|
|
|
|
pairwise_identity_l |
296
|
|
|
|
|
|
|
pairwise_identity_s |
297
|
|
|
|
|
|
|
pairwise_identity_gaps_l |
298
|
|
|
|
|
|
|
pairwise_identity_gaps_s |
299
|
|
|
|
|
|
|
pairwise_identities |
300
|
|
|
|
|
|
|
map_seqids |
301
|
|
|
|
|
|
|
seqid_mapper |
302
|
|
|
|
|
|
|
overlaps |
303
|
|
|
|
|
|
|
cluster_overlapping_ranges |
304
|
|
|
|
|
|
|
); |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=head1 DESCRIPTION |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=head1 SUBROUTINES |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=over 4 |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=item B<< $map = map_seqids($seqs, I<$pattern>) >> |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Maps all sequence ids of C<$seqs> in situ. If C<$pattern> is not given, C<s%9d> is taken as default. |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=item B<< $clustered_ranges = cluster_overlapping_ranges(\@ranges) >> |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
This function takes some ranges and clusters them by overlap. |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
@ranges = ( |
322
|
|
|
|
|
|
|
[ $start, $stop, $some, $custom, $elements ], |
323
|
|
|
|
|
|
|
[ ... ], |
324
|
|
|
|
|
|
|
... |
325
|
|
|
|
|
|
|
); |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
$clustered_ranges = [ |
328
|
|
|
|
|
|
|
# first cluster |
329
|
|
|
|
|
|
|
[ |
330
|
|
|
|
|
|
|
[ $start, $stop, $some, $custom, $elements ], |
331
|
|
|
|
|
|
|
... |
332
|
|
|
|
|
|
|
], |
333
|
|
|
|
|
|
|
# next cluster |
334
|
|
|
|
|
|
|
[ |
335
|
|
|
|
|
|
|
[ $start, $stop, $some, $custom, $elements ], |
336
|
|
|
|
|
|
|
... |
337
|
|
|
|
|
|
|
], |
338
|
|
|
|
|
|
|
... |
339
|
|
|
|
|
|
|
]; |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item B<< $map = map_seqids(\@seqs!, $pattern) >> |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Wrapper around C<seqid_mapper()>, maps the ids of @seqs and returns the map. |
345
|
|
|
|
|
|
|
This function works directly on the sequences. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=item B<< $mapper = seqid_mapper(\%idmap) >> |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Create a mapper that maps given sequences to new ids generated by the argument |
350
|
|
|
|
|
|
|
given. A hash as argument will be used as mapping base, taking the key as old and the |
351
|
|
|
|
|
|
|
value as new id. In case the sequence id is non-existent in the hash, an artificial id |
352
|
|
|
|
|
|
|
following the pattern C<unknown_$i> with C<$i> running from 0 |
353
|
|
|
|
|
|
|
onwards will be generated.. |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=item B<< $mapper = seqid_mapper(\&handler) >> |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
Create a mapper that maps given sequences to new ids generated by successive |
358
|
|
|
|
|
|
|
calls to the handler. The handler will get the existing/original id as |
359
|
|
|
|
|
|
|
argument and shall return a new id. A simple mapper would be: |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
my $i = 1; |
363
|
|
|
|
|
|
|
my $mapper = seqid_mapper(sub { sprintf "id%d", $i++ }); |
364
|
|
|
|
|
|
|
or |
365
|
|
|
|
|
|
|
my $mapper = seqid_mapper(sub { my $id = shift; $id =~ s/pep/cds/; return $id}); |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
my ($old_id, $new_id) = $mapper->($sequence_object); |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
The sequence object WILL BE ALTERED IN SITU. |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=item B<< $mapper = seqid_mapper($pattern) >> |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Use pattern as basis for sequence id mapping, "%s" or "%d" must be included |
374
|
|
|
|
|
|
|
ONLY ONCE and will be substituted by a couter, running from 0 to INF. |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=item B<< $mapper = seqid_mapper() >> |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Same as C<$mapper = seqid_mapper("s%9d")> |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=item B<< overlaps([$a_begin, $a_end], [$b_begin, $b_end]) >> |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
Returns true if a overlaps with b, false otherwise. |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=back |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head1 SEE ALSO |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=head1 AUTHOR |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
jw bargsten, C<< <joachim.bargsten at wur.nl> >> |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=cut |