| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# My Perl library to deal with short read sequencing data. |
|
3
|
|
|
|
|
|
|
# created by Li Shen, Nov 2009. |
|
4
|
|
|
|
|
|
|
# |
|
5
|
|
|
|
|
|
|
# strGCPercent added March 2010 - Li Shen |
|
6
|
|
|
|
|
|
|
# bin_genome_count,read_chrom_len,sep_chrom_bed,del_chrfile added April 2010 - Li Shen |
|
7
|
|
|
|
|
|
|
# bin_genome_count_direction added June 2010 - Li Shen |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
15
|
use 5.006; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
41
|
|
|
12
|
1
|
|
|
1
|
|
5
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
62
|
|
|
13
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
55
|
|
|
14
|
|
|
|
|
|
|
package MyShortRead::MyShortRead; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
1
|
|
|
1
|
|
5
|
use POSIX; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
9
|
|
|
18
|
1
|
|
|
1
|
|
4522
|
use Time::HiRes qw(time); |
|
|
1
|
|
|
|
|
3622
|
|
|
|
1
|
|
|
|
|
5
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
require Exporter; |
|
21
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export |
|
24
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead. |
|
25
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants. |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# This allows declaration use MyShortRead::MyShortRead ':all'; |
|
28
|
|
|
|
|
|
|
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
|
29
|
|
|
|
|
|
|
# will save memory. |
|
30
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
) ] ); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
our @EXPORT = qw(compare2 overlap2 strGCPercent bin_genome_count bin_genome_count_direction read_chrlen_tbl read_chrlen_ordered sep_chrom_bed del_chrfile order_chr); |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
our $VERSION = '1.00'; |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Preloaded methods go here. |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Comparing two genomic intervals according to start or end position. |
|
44
|
|
|
|
|
|
|
sub compare2{ |
|
45
|
0
|
|
|
0
|
0
|
|
my($a,$b) = @_; |
|
46
|
0
|
|
|
|
|
|
my $cmpby = 'start'; |
|
47
|
0
|
0
|
|
|
|
|
if(@_ > 2) {$cmpby = $_[2];} |
|
|
0
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
|
|
49
|
0
|
|
|
|
|
|
$a->{chrom} =~ /^(chr)?(\w+)$/; |
|
50
|
0
|
|
|
|
|
|
my $chr1 = $2; |
|
51
|
0
|
|
|
|
|
|
$b->{chrom} =~ /^(chr)?(\w+)$/; |
|
52
|
0
|
|
|
|
|
|
my $chr2 = $2; |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# $chr1 = 100 if uc $chr1 eq 'X'; |
|
55
|
|
|
|
|
|
|
# $chr1 = 101 if uc $chr1 eq 'Y'; |
|
56
|
|
|
|
|
|
|
# $chr1 = 102 if uc $chr1 eq 'M'; |
|
57
|
|
|
|
|
|
|
# $chr1 = 199 if uc $chr1 eq 'Z'; |
|
58
|
|
|
|
|
|
|
# $chr2 = 100 if uc $chr2 eq 'X'; |
|
59
|
|
|
|
|
|
|
# $chr2 = 101 if uc $chr2 eq 'Y'; |
|
60
|
|
|
|
|
|
|
# $chr2 = 102 if uc $chr2 eq 'M'; |
|
61
|
|
|
|
|
|
|
# $chr2 = 199 if uc $chr2 eq 'Z'; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# return $chr1 <=> $chr2 unless $chr1 == $chr2; |
|
64
|
|
|
|
|
|
|
|
|
65
|
0
|
0
|
|
|
|
|
return $chr1 cmp $chr2 unless $chr1 eq $chr2; |
|
66
|
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
if($cmpby eq 'end') {return $a->{'end'} <=> $b->{'end'};} |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
else {return $a->{'start'} <=> $b->{'start'};} |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Order chromosome names. |
|
72
|
|
|
|
|
|
|
sub order_chr{ |
|
73
|
0
|
|
|
0
|
0
|
|
my($a,$b) = @_; |
|
74
|
0
|
|
|
|
|
|
my($chr1, $chr2); |
|
75
|
|
|
|
|
|
|
# Find chromosome names. |
|
76
|
0
|
0
|
|
|
|
|
if(exists $a->{'chr'}) {$chr1 = $a->{'chr'};} |
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
|
elsif(exists $a->{chrom}) {$chr1 = $a->{chrom};} |
|
78
|
0
|
|
|
|
|
|
else {warn "Cannot find chromosome name! Order is not determined.\n"; return 0;} |
|
79
|
0
|
0
|
|
|
|
|
if(exists $b->{'chr'}) {$chr2 = $b->{'chr'};} |
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
80
|
0
|
|
|
|
|
|
elsif(exists $b->{chrom}) {$chr2 = $b->{chrom};} |
|
81
|
0
|
|
|
|
|
|
else {warn "Cannot find chromosome name! Order is not determined.\n"; return 0;} |
|
82
|
|
|
|
|
|
|
|
|
83
|
0
|
|
|
|
|
|
return $chr1 cmp $chr2; |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# # Get rid of 'chr' if exists. |
|
86
|
|
|
|
|
|
|
# $chr1 =~ /^(chr)?(\w+)$/; $chr1 = $2; |
|
87
|
|
|
|
|
|
|
# $chr2 =~ /^(chr)?(\w+)$/; $chr2 = $2; |
|
88
|
|
|
|
|
|
|
# # Convert letters to large numbers. |
|
89
|
|
|
|
|
|
|
# $chr1 = 100 if uc $chr1 eq 'X'; |
|
90
|
|
|
|
|
|
|
# $chr1 = 101 if uc $chr1 eq 'Y'; |
|
91
|
|
|
|
|
|
|
# $chr1 = 102 if uc $chr1 eq 'M'; |
|
92
|
|
|
|
|
|
|
# $chr1 = 199 if uc $chr1 eq 'Z'; |
|
93
|
|
|
|
|
|
|
# $chr2 = 100 if uc $chr2 eq 'X'; |
|
94
|
|
|
|
|
|
|
# $chr2 = 101 if uc $chr2 eq 'Y'; |
|
95
|
|
|
|
|
|
|
# $chr2 = 102 if uc $chr2 eq 'M'; |
|
96
|
|
|
|
|
|
|
# $chr2 = 199 if uc $chr2 eq 'Z'; |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# return $chr1 <=> $chr2; |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Find the interval list with the lowest start or end position. |
|
102
|
|
|
|
|
|
|
# return the index of the lowest list and set the position through reference. |
|
103
|
|
|
|
|
|
|
sub find_lowest_list{ |
|
104
|
0
|
|
|
0
|
0
|
|
my($rlists, $rpos) = @_; |
|
105
|
0
|
|
|
|
|
|
my $cmpby = 'start'; |
|
106
|
0
|
0
|
|
|
|
|
if(@_ > 2) {$cmpby = $_[2];} |
|
|
0
|
|
|
|
|
|
|
|
107
|
0
|
|
|
|
|
|
my $ll = 0; # initialize lowest list. |
|
108
|
0
|
|
|
|
|
|
my $lowest_intrv = $rlists->[$ll]{cur_intrv}; # intrv with the lowest end position. |
|
109
|
0
|
|
|
|
|
|
for my $i(1..$#{$rlists}){ |
|
|
0
|
|
|
|
|
|
|
|
110
|
0
|
|
|
|
|
|
my $cur_intrv = $rlists->[$i]{cur_intrv}; |
|
111
|
0
|
0
|
|
|
|
|
if(compare2($cur_intrv, $lowest_intrv, $cmpby) < 0){ |
|
112
|
0
|
|
|
|
|
|
$lowest_intrv = $cur_intrv; |
|
113
|
0
|
|
|
|
|
|
$ll = $i; |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
# return the lowest position through reference. |
|
117
|
0
|
0
|
|
|
|
|
${$rpos} = $cmpby eq 'end'? $lowest_intrv->{'end'} : $lowest_intrv->{'start'}; |
|
|
0
|
|
|
|
|
|
|
|
118
|
0
|
|
|
|
|
|
return $ll; |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Judge whether two intervals overlap. |
|
122
|
|
|
|
|
|
|
sub overlap2{ |
|
123
|
0
|
|
|
0
|
0
|
|
my($refintrv1, $refintrv2) = @_; |
|
124
|
0
|
0
|
|
|
|
|
if($refintrv1->{chrom} eq $refintrv2->{chrom}){ |
|
125
|
0
|
0
|
0
|
|
|
|
if($refintrv1->{'start'} <= $refintrv2->{'end'} and |
|
126
|
|
|
|
|
|
|
$refintrv1->{'end'} >= $refintrv2->{'start'}){ |
|
127
|
0
|
|
|
|
|
|
return 1; |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
} |
|
130
|
0
|
|
|
|
|
|
return 0; |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Calculate the GC percentage given a DNA sequence in string representation. |
|
134
|
|
|
|
|
|
|
sub strGCPercent{ |
|
135
|
0
|
|
|
0
|
0
|
|
my $dnaSeq = shift; |
|
136
|
0
|
|
|
|
|
|
my $c = ($dnaSeq =~ s/[gc]/X/gi); # Count the # of G or C without regarding to case. |
|
137
|
0
|
|
|
|
|
|
my $n = ($dnaSeq =~ s/n/Y/gi); # count the # of 'N's. |
|
138
|
0
|
|
|
|
|
|
return ($c+$n/2) / length($dnaSeq); |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Given a BED file, bin the genome and count the reads in bins. |
|
142
|
|
|
|
|
|
|
sub bin_genome_count{ |
|
143
|
|
|
|
|
|
|
# BED file,bin size,chromosome length,fragment size,reference to bin vector. |
|
144
|
0
|
|
|
0
|
0
|
|
my($bedfile,$binsize,$chrlen,$fragsize,$r_binVec) = @_; |
|
145
|
0
|
0
|
|
|
|
|
open HBED, "<", $bedfile or die "Open bed file error: $!\n"; |
|
146
|
0
|
|
|
|
|
|
$#{$r_binVec} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\ |
|
|
0
|
|
|
|
|
|
|
|
147
|
0
|
|
|
|
|
|
foreach my $i(0..$#{$r_binVec}) {$r_binVec->[$i] = 0;} # Initialize bin vector to zeros. |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
148
|
0
|
|
|
|
|
|
my $line = 0; |
|
149
|
0
|
|
|
|
|
|
while(){ |
|
150
|
0
|
|
|
|
|
|
$line++; |
|
151
|
0
|
|
|
|
|
|
chomp; |
|
152
|
0
|
|
|
|
|
|
my @cells = split /\t/; |
|
153
|
0
|
0
|
|
|
|
|
if(@cells < 6){ |
|
154
|
0
|
|
|
|
|
|
warn "Format error at $bedfile, line:$line. Skip!\n"; |
|
155
|
0
|
|
|
|
|
|
next; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
0
|
|
|
|
|
|
my($chrom,$start,$end,$name,$score,$strand) = @cells; |
|
158
|
|
|
|
|
|
|
# Shift read location by fragment size. |
|
159
|
0
|
|
|
|
|
|
my $loc; |
|
160
|
0
|
0
|
|
|
|
|
if($strand eq '+') {$loc = $start + $fragsize/2;} |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
else {$loc = $end - $fragsize/2;} |
|
162
|
|
|
|
|
|
|
# Increment read count in bin. |
|
163
|
|
|
|
|
|
|
# $loc--; # Decrement location to correctly assign the read to bin vector.(NO! BED is 0-based.) |
|
164
|
0
|
|
|
|
|
|
$r_binVec->[floor($loc/$binsize)]++; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
0
|
|
|
|
|
|
close HBED; |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Bin the genome and count the reads with direction. |
|
170
|
|
|
|
|
|
|
sub bin_genome_count_direction{ |
|
171
|
|
|
|
|
|
|
# BED file,bin size,chromosome length,fragment size,reference to bin vector. |
|
172
|
0
|
|
|
0
|
0
|
|
my($bedfile,$binsize,$chrlen,$fragsize,$r_binVec_F,$r_binVec_R) = @_; |
|
173
|
0
|
0
|
|
|
|
|
open HBED, "<", $bedfile or die "Open bed file error: $!\n"; |
|
174
|
0
|
|
|
|
|
|
$#{$r_binVec_F} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\ |
|
|
0
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
|
$#{$r_binVec_R} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\ |
|
|
0
|
|
|
|
|
|
|
|
176
|
0
|
|
|
|
|
|
foreach my $i(0..$#{$r_binVec_F}) {$r_binVec_F->[$i] = 0;} # Initialize bin vector to zeros. |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
177
|
0
|
|
|
|
|
|
foreach my $i(0..$#{$r_binVec_R}) {$r_binVec_R->[$i] = 0;} # Initialize bin vector to zeros. |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
178
|
0
|
|
|
|
|
|
my $line = 0; |
|
179
|
0
|
|
|
|
|
|
while(){ |
|
180
|
0
|
|
|
|
|
|
$line++; |
|
181
|
0
|
|
|
|
|
|
chomp; |
|
182
|
0
|
|
|
|
|
|
my @cells = split; |
|
183
|
0
|
0
|
|
|
|
|
if(@cells < 6){ |
|
184
|
0
|
|
|
|
|
|
warn "Format error at $bedfile, line:$line. Skip!\n"; |
|
185
|
0
|
|
|
|
|
|
next; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
0
|
|
|
|
|
|
my($chrom,$start,$end,$name,$score,$strand) = @cells; |
|
188
|
|
|
|
|
|
|
# Shift read location by fragment size and then increment the read count at the correct location. |
|
189
|
0
|
|
|
|
|
|
my $loc; |
|
190
|
0
|
0
|
|
|
|
|
if($strand eq '+') { |
|
191
|
0
|
|
|
|
|
|
$loc = $start + $fragsize/2; |
|
192
|
0
|
|
|
|
|
|
$r_binVec_F->[floor($loc/$binsize)]++; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
else { |
|
195
|
0
|
|
|
|
|
|
$loc = $end - $fragsize/2; |
|
196
|
0
|
|
|
|
|
|
$r_binVec_R->[floor($loc/$binsize)]++; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
} |
|
199
|
0
|
|
|
|
|
|
close HBED; |
|
200
|
|
|
|
|
|
|
} |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# Read chromosome name and length into a hash table. |
|
203
|
|
|
|
|
|
|
sub read_chrlen_tbl{ |
|
204
|
0
|
|
|
0
|
0
|
|
my $chrfile = shift; |
|
205
|
0
|
0
|
|
|
|
|
open HCHR, "<", $chrfile or die "Cannot open chromosome length file:$!\n"; |
|
206
|
0
|
|
|
|
|
|
my $line = 0; |
|
207
|
0
|
|
|
|
|
|
my %chrlen; # hash table for chromosome length. |
|
208
|
0
|
|
|
|
|
|
while(){ |
|
209
|
0
|
|
|
|
|
|
$line++; |
|
210
|
0
|
|
|
|
|
|
chomp; |
|
211
|
0
|
0
|
|
|
|
|
next if $_ eq ''; # skip empty line. |
|
212
|
0
|
|
|
|
|
|
my @cells = split; |
|
213
|
0
|
0
|
|
|
|
|
warn "Insufficient chromosome information at $chrfile, line:$line. Skip!\n" if @cells < 2; |
|
214
|
0
|
|
|
|
|
|
my($chrom,$len) = @cells; |
|
215
|
0
|
0
|
0
|
|
|
|
if($chrom =~ /^\S+$/ and $len =~ /^[0-9]+$/){ |
|
|
0
|
|
|
|
|
|
|
|
216
|
0
|
|
|
|
|
|
$chrlen{$chrom} = $len; |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
else {warn "Format error at $chrfile, line:$line. Skip!\n"} |
|
219
|
|
|
|
|
|
|
} |
|
220
|
0
|
|
|
|
|
|
return %chrlen; |
|
221
|
|
|
|
|
|
|
} |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# Read chromosome name and length into an array and order them by name. |
|
224
|
|
|
|
|
|
|
sub read_chrlen_ordered{ |
|
225
|
0
|
|
|
0
|
0
|
|
my $chrfile = shift; |
|
226
|
0
|
0
|
|
|
|
|
open HCHR, "<", $chrfile or die "Cannot open chromosome length file:$!\n"; |
|
227
|
0
|
|
|
|
|
|
my $line = 0; |
|
228
|
0
|
|
|
|
|
|
my @chrlen; # array for chromosome length. |
|
229
|
0
|
|
|
|
|
|
while(){ |
|
230
|
0
|
|
|
|
|
|
$line++; |
|
231
|
0
|
|
|
|
|
|
chomp; |
|
232
|
0
|
0
|
|
|
|
|
next if $_ eq ''; # skip empty line. |
|
233
|
0
|
|
|
|
|
|
my @cells = split; |
|
234
|
0
|
0
|
|
|
|
|
warn "Insufficient chromosome information at $chrfile, line:$line. Skip!\n" if @cells < 2; |
|
235
|
0
|
|
|
|
|
|
my($chrom,$len) = @cells; |
|
236
|
0
|
0
|
0
|
|
|
|
if($chrom =~ /^\S+$/ and $len =~ /^[0-9]+$/){ |
|
|
0
|
|
|
|
|
|
|
|
237
|
0
|
|
|
|
|
|
my %rec = ( |
|
238
|
|
|
|
|
|
|
chrom => $chrom, |
|
239
|
|
|
|
|
|
|
len => $len); |
|
240
|
0
|
|
|
|
|
|
push @chrlen, \%rec; |
|
241
|
|
|
|
|
|
|
} |
|
242
|
|
|
|
|
|
|
else {warn "Format error at $chrfile, line:$line. Skip!\n"} |
|
243
|
|
|
|
|
|
|
} |
|
244
|
0
|
|
|
|
|
|
return sort {order_chr($a,$b)} @chrlen; # return ordered list. |
|
|
0
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# Separate a BED file into small ones by chromosome names. |
|
249
|
|
|
|
|
|
|
sub sep_chrom_bed{ |
|
250
|
|
|
|
|
|
|
# BED file, reference to chromosome length hash table. |
|
251
|
0
|
|
|
0
|
0
|
|
my($bedfile,@chrname) = @_; |
|
252
|
0
|
|
|
|
|
|
my $timestamp = time; |
|
253
|
0
|
|
|
|
|
|
my $timehead = ".$timestamp."; # timestamp file name head. |
|
254
|
0
|
|
|
|
|
|
my %hsh_chrfile; # hash of chromosome file handles. |
|
255
|
|
|
|
|
|
|
my %hsh_chrname; # hash of chromosome file names. |
|
256
|
|
|
|
|
|
|
# Create chromosome files for writing. |
|
257
|
0
|
|
|
|
|
|
foreach my $chrom(@chrname){ |
|
258
|
0
|
|
|
|
|
|
my $chrfile = $timehead . $chrom . ".bed"; |
|
259
|
0
|
|
|
|
|
|
while(-e $chrfile){ # prevent existing files from being overridden. |
|
260
|
0
|
|
|
|
|
|
$chrfile = ($timehead + rand) . $chrom . ".bed"; |
|
261
|
|
|
|
|
|
|
} |
|
262
|
0
|
|
|
|
|
|
my $fh; |
|
263
|
0
|
0
|
|
|
|
|
open $fh, ">", $chrfile or die "Cannot create bed file:$!\n"; |
|
264
|
0
|
|
|
|
|
|
$hsh_chrfile{$chrom} = $fh; |
|
265
|
0
|
|
|
|
|
|
$hsh_chrname{$chrom} = $chrfile; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
0
|
0
|
|
|
|
|
open HBED, "<", $bedfile or die "Cannot open $bedfile:$!\n"; |
|
268
|
0
|
|
|
|
|
|
my $line = 0; # line number in BED file. |
|
269
|
0
|
|
|
|
|
|
my $nread = 0; # actual number of reads splitted. |
|
270
|
|
|
|
|
|
|
# Go through bed file and assign each line to corresponding file. |
|
271
|
0
|
|
|
|
|
|
while(){ |
|
272
|
0
|
|
|
|
|
|
$line++; |
|
273
|
0
|
|
|
|
|
|
chomp; |
|
274
|
0
|
0
|
|
|
|
|
next if $_ eq ''; |
|
275
|
0
|
|
|
|
|
|
my @cells = split; |
|
276
|
0
|
0
|
|
|
|
|
if(@cells < 6){ |
|
277
|
0
|
|
|
|
|
|
warn "Insufficient tag information at $bedfile, line:$line. Skip!\n"; |
|
278
|
0
|
|
|
|
|
|
next; |
|
279
|
|
|
|
|
|
|
} |
|
280
|
0
|
|
|
|
|
|
my($chrom,$start,$end,$name,$score,$strand) = @cells; |
|
281
|
0
|
0
|
|
|
|
|
if(exists $hsh_chrfile{$chrom}){ |
|
282
|
0
|
|
|
|
|
|
$nread++; |
|
283
|
0
|
|
|
|
|
|
print {$hsh_chrfile{$chrom}} $_ . "\n"; |
|
|
0
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
else{ |
|
286
|
0
|
|
|
|
|
|
warn "Unspecified chromosome name at $bedfile, line:$line.\nCheck your chromosome length file. Skip!\n"; |
|
287
|
|
|
|
|
|
|
} |
|
288
|
|
|
|
|
|
|
} |
|
289
|
|
|
|
|
|
|
# Close all files. |
|
290
|
0
|
|
|
|
|
|
close HBED; |
|
291
|
0
|
|
|
|
|
|
foreach my $hdl(values %hsh_chrfile) {close $hdl;} |
|
|
0
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Return chromosome file name vector. |
|
294
|
0
|
|
|
|
|
|
return ($nread, \%hsh_chrname); |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Delete all separated chromosome files. |
|
298
|
|
|
|
|
|
|
sub del_chrfile{ |
|
299
|
0
|
|
|
0
|
0
|
|
foreach my $file(@_) { |
|
300
|
0
|
0
|
|
|
|
|
unlink $file or warn "Cannot delete $file: $!\n"; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
1; |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
__END__ |