line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
6810
|
use 5.006; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
64
|
|
2
|
|
|
|
|
|
|
our $VERSION = '1.11'; |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
# Package to analyze the ChIP-seq differential analysis results. |
5
|
|
|
|
|
|
|
# Contains functions to manipulate and score a cluster of diff sites. |
6
|
|
|
|
|
|
|
# Can be used to identify chromatin modifiation hotspots. |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package diffReps::DiffRes; |
9
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
40
|
|
10
|
|
|
|
|
|
|
#use MyShortRead::MyShortRead qw(compare2); |
11
|
1
|
|
|
1
|
|
3428
|
use Math::CDF qw(ppois); |
|
1
|
|
|
|
|
8597
|
|
|
1
|
|
|
|
|
152
|
|
12
|
1
|
|
|
1
|
|
1241
|
use MyBioinfo::Common qw( min ); |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
108
|
|
13
|
1
|
|
|
1
|
|
7
|
use constant INF => 1e10; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
66
|
|
14
|
1
|
|
|
1
|
|
5
|
use constant WIN1 => 2e5; # 200kb. |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
56
|
|
15
|
1
|
|
|
1
|
|
5
|
use constant WIN2 => 1e6; # 1Mb. |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
3441
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
require Exporter; |
18
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
19
|
|
|
|
|
|
|
our @EXPORT_OK = qw( cmpsite ); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Constructor. |
23
|
|
|
|
|
|
|
# Lambda: defined as the avg site-to-site distance which can be used to |
24
|
|
|
|
|
|
|
# calculate an expected segment number for a cluster. |
25
|
|
|
|
|
|
|
# Expected segments = cluster distance / lambda |
26
|
|
|
|
|
|
|
# P-value is evaluated as Poisson(Observed segment |Expected segment). |
27
|
|
|
|
|
|
|
# With the same Observed / Expected ratio, larger cluster is always favored |
28
|
|
|
|
|
|
|
# vs. smaller one. |
29
|
|
|
|
|
|
|
sub new{ |
30
|
0
|
|
|
0
|
0
|
|
my $class = shift; |
31
|
0
|
|
|
|
|
|
my $self = { |
32
|
|
|
|
|
|
|
siteList => [], # diff site list. |
33
|
|
|
|
|
|
|
# each site is a hash: (chrom, start, end, siteCenter, recLoc). |
34
|
|
|
|
|
|
|
curPos => 0, # current position in site list. |
35
|
|
|
|
|
|
|
glbLambda => undef, # global lambda: avg inter-distance between two sites. |
36
|
|
|
|
|
|
|
clust => { |
37
|
|
|
|
|
|
|
from => undef, # start position (in the list) of cluster. |
38
|
|
|
|
|
|
|
to => undef, # end position (in the list) of cluster. |
39
|
|
|
|
|
|
|
seg => undef, # cluster segments. |
40
|
|
|
|
|
|
|
dist => undef, # left-to-right distance. |
41
|
|
|
|
|
|
|
enrich => undef, # observed / expected segment. |
42
|
|
|
|
|
|
|
pval => undef # P-value based on Poisson distribution. |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
}; |
45
|
0
|
|
|
|
|
|
bless $self, $class; |
46
|
0
|
|
|
|
|
|
return $self; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Evaluate the significance of current cluster. |
50
|
|
|
|
|
|
|
sub sigeval{ |
51
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
52
|
0
|
|
|
|
|
|
$self->dist_seg_clust(); # calc actual distance, segments for cluster. |
53
|
|
|
|
|
|
|
# Trivial case: the new pushed site is on another chromosome or the cluster contains a single site. |
54
|
0
|
0
|
0
|
|
|
|
if($self->{'clust'}{'dist'} == INF or $self->{'clust'}{'dist'} == 0){ |
55
|
0
|
|
|
|
|
|
$self->{'clust'}{'seg'} = undef; |
56
|
0
|
|
|
|
|
|
$self->{'clust'}{'enrich'} = undef; |
57
|
0
|
|
|
|
|
|
$self->{'clust'}{'pval'} = 1.0; |
58
|
0
|
|
|
|
|
|
return 1.0; |
59
|
|
|
|
|
|
|
} |
60
|
0
|
|
|
|
|
|
my($l_d1,$l_s1,$l_d2,$l_s2) = $self->calc_locL(-1); # search left. |
61
|
0
|
|
|
|
|
|
my($r_d1,$r_s1,$r_d2,$r_s2) = $self->calc_locL(+1); # search right. |
62
|
0
|
|
|
|
|
|
my $d1 = $l_d1 + $r_d1; |
63
|
0
|
|
|
|
|
|
my $s1 = $l_s1 + $r_s1; |
64
|
0
|
|
|
|
|
|
my $d2 = $l_d2 + $r_d2; |
65
|
0
|
|
|
|
|
|
my $s2 = $l_s2 + $r_s2; |
66
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
my $L1 = $s1 > 0? $d1 / $s1 : INF; # local lambda 1. |
68
|
0
|
0
|
|
|
|
|
my $L2 = $s2 > 0? $d2 / $s2 : INF; # local lambda 2. |
69
|
|
|
|
|
|
|
|
70
|
0
|
|
|
|
|
|
my $L_min = &min($L1, $L2, $self->{'glbLambda'}); |
71
|
0
|
0
|
0
|
|
|
|
if($L_min != INF and defined $self->{'clust'}{'dist'}){ |
72
|
0
|
|
|
|
|
|
my $exp_seg = $self->{'clust'}{'dist'} / $L_min; |
73
|
0
|
|
|
|
|
|
$self->{'clust'}{'enrich'} = $self->{'clust'}{'seg'} / $exp_seg; |
74
|
0
|
|
|
|
|
|
$self->{'clust'}{'pval'} = 1 - ppois($self->{'clust'}{'seg'}, $exp_seg); |
75
|
|
|
|
|
|
|
}else{ |
76
|
0
|
|
|
|
|
|
$self->{'clust'}{'enrich'} = undef; |
77
|
0
|
|
|
|
|
|
$self->{'clust'}{'pval'} = 1.0; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
0
|
|
|
|
|
|
return $self->{'clust'}{'pval'}; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Search either direction of a differential list to calculate local lambda. |
84
|
|
|
|
|
|
|
sub calc_locL{ |
85
|
0
|
|
|
0
|
0
|
|
my($self,$di) = @_; # di must be: -1 or 1 for left or right. |
86
|
|
|
|
|
|
|
# Calc site distance considering direction. |
87
|
|
|
|
|
|
|
sub site_dist_di{ |
88
|
0
|
|
|
0
|
0
|
|
my($di,$w_site,$c_site) = @_; |
89
|
0
|
0
|
|
|
|
|
return $di < 0? &site_dist($w_site, $c_site) : &site_dist($c_site, $w_site); |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
# Calc segment number considering direction. |
92
|
|
|
|
|
|
|
sub seg_di{ |
93
|
0
|
|
|
0
|
0
|
|
my($di,$w_loc,$c_loc) = @_; |
94
|
0
|
0
|
|
|
|
|
return $di < 0? $c_loc - $w_loc : $w_loc - $c_loc; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
# Move back to the last site within window size and calculate distance and segments |
97
|
|
|
|
|
|
|
# considering direction. |
98
|
|
|
|
|
|
|
sub last_dist_seg_di{ |
99
|
0
|
|
|
0
|
0
|
|
my($self,$di,$w_loc,$c_loc,$c_site) = @_; |
100
|
0
|
|
|
|
|
|
my $w_loc_ = $w_loc - $di; # recover to the last site that is still on the same chromosome. |
101
|
0
|
|
|
|
|
|
my $w_site = $self->{'siteList'}->[$w_loc_]; |
102
|
0
|
|
|
|
|
|
my $d = &site_dist_di($di, $w_site, $c_site); |
103
|
0
|
|
|
|
|
|
my $s = &seg_di($di, $w_loc_, $c_loc); |
104
|
|
|
|
|
|
|
|
105
|
0
|
|
|
|
|
|
return ($d,$s); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# Setup initial parameter values. |
109
|
0
|
0
|
|
|
|
|
my $c_loc = $di < 0? $self->{'clust'}{'from'} : $self->{'clust'}{'to'}; # cluster edge location. |
110
|
0
|
|
|
|
|
|
my $c_site = $self->{'siteList'}->[$c_loc]; # cluster edge site. |
111
|
0
|
|
|
|
|
|
my($dist1,$seg1,$dist2,$seg2); # local distance and #segment 1 and 2. |
112
|
0
|
|
|
|
|
|
$dist1 = $dist2 = $seg1 = $seg2 = 0; |
113
|
0
|
|
|
|
|
|
my $tag1 = 0; # tag for window1. |
114
|
0
|
|
|
|
|
|
my $nsite = @{$self->{'siteList'}}; # number of total sites. |
|
0
|
|
|
|
|
|
|
115
|
0
|
|
|
|
|
|
my $w_loc; |
116
|
0
|
|
0
|
|
|
|
for($w_loc = $c_loc + $di; $w_loc >= 0 and $w_loc < $nsite; $w_loc += $di){ |
117
|
0
|
|
|
|
|
|
my $w_site = $self->{'siteList'}->[$w_loc]; # window site. |
118
|
0
|
|
|
|
|
|
my $d = &site_dist_di($di, $w_site, $c_site); # distance between two sites. |
119
|
0
|
0
|
0
|
|
|
|
if(!$tag1 and $d > WIN1){ # pass window1 for the first time? |
120
|
0
|
|
|
|
|
|
($dist1,$seg1) = $self->last_dist_seg_di($di, $w_loc, $c_loc, $c_site); |
121
|
0
|
|
|
|
|
|
$tag1 = 1; # set tag for window1. |
122
|
|
|
|
|
|
|
} |
123
|
0
|
0
|
|
|
|
|
if($d > WIN2){ # pass window2. |
124
|
0
|
|
|
|
|
|
($dist2,$seg2) = $self->last_dist_seg_di($di, $w_loc, $c_loc, $c_site); |
125
|
0
|
|
|
|
|
|
last; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
} |
128
|
0
|
0
|
0
|
|
|
|
if($w_loc < 0 or $w_loc >= $nsite){ # exceeded the begining or end of the diff list. |
129
|
0
|
|
|
|
|
|
my($d,$s) = $self->last_dist_seg_di($di, $w_loc, $c_loc, $c_site); |
130
|
0
|
0
|
|
|
|
|
if(!$tag1){ |
131
|
0
|
|
|
|
|
|
($dist1,$seg1) = ($d, $s); |
132
|
0
|
|
|
|
|
|
$tag1 = 1; |
133
|
|
|
|
|
|
|
} |
134
|
0
|
|
|
|
|
|
($dist2,$seg2) = ($d, $s); |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
0
|
|
|
|
|
|
return ($dist1, $seg1, $dist2, $seg2); |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Read genomic coordinates of chromatin modification sites from a list |
141
|
|
|
|
|
|
|
# of files. Record the line number for each diff site. |
142
|
|
|
|
|
|
|
sub read_modsite{ |
143
|
0
|
|
|
0
|
0
|
|
my($self,$rfile,$info) = @_; |
144
|
|
|
|
|
|
|
# Figure out which columns are used in output. |
145
|
|
|
|
|
|
|
# Format eg: 1,2,3-6,7-8,9 |
146
|
0
|
|
|
|
|
|
my @icol = (); # columns. |
147
|
0
|
0
|
|
|
|
|
if($info ne '0'){ |
148
|
0
|
|
|
|
|
|
my @iseg = split /,/, $info; # segments. |
149
|
0
|
|
|
|
|
|
foreach my $s(@iseg){ |
150
|
0
|
|
|
|
|
|
my($sta,$end) = split /-/, $s; |
151
|
0
|
0
|
|
|
|
|
$end = $sta unless defined $end; |
152
|
0
|
|
|
|
|
|
for my $c($sta..$end){ |
153
|
0
|
|
|
|
|
|
push @icol, $c; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
# Start extracting sites from inputs. |
158
|
0
|
|
|
|
|
|
$self->{siteList} = []; # deplete site list first. |
159
|
0
|
|
|
|
|
|
foreach my $f(@{$rfile}){ |
|
0
|
|
|
|
|
|
|
160
|
0
|
0
|
|
|
|
|
open DIFF, '<', $f or die "Open diff site file error: $!\n"; |
161
|
0
|
|
|
|
|
|
my $ln = 0; |
162
|
0
|
|
|
|
|
|
my $site; |
163
|
0
|
|
|
|
|
|
while(){ # Get rid of header lines. |
164
|
0
|
|
|
|
|
|
$ln++; |
165
|
0
|
0
|
|
|
|
|
last if ($site = &parse_site($_, $f, $ln, \@icol)); |
166
|
|
|
|
|
|
|
} |
167
|
0
|
0
|
|
|
|
|
push @{$self->{siteList}}, $site if $site; |
|
0
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
|
while(){ |
169
|
0
|
|
|
|
|
|
$ln++; |
170
|
0
|
0
|
|
|
|
|
if($site = &parse_site($_, $f, $ln, \@icol)){ |
|
0
|
|
|
|
|
|
|
171
|
0
|
|
|
|
|
|
push @{$self->{siteList}}, $site; |
|
0
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
else {die "Un-recognized diff site format in file $f: $ln\n";} |
174
|
|
|
|
|
|
|
} |
175
|
0
|
|
|
|
|
|
close DIFF; |
176
|
|
|
|
|
|
|
} |
177
|
0
|
|
|
|
|
|
$self->{curPos} = 0; # reset current position. |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Parse a line of diff site file and store it in hash table. |
182
|
|
|
|
|
|
|
sub parse_site{ |
183
|
0
|
|
|
0
|
0
|
|
my($rec,$file,$ln,$rcol) = @_; |
184
|
0
|
|
|
|
|
|
my $mark; |
185
|
|
|
|
|
|
|
# Try to figure out the histone mark name from the input file... |
186
|
0
|
0
|
|
|
|
|
if($file =~ /((h[0-9]+k[0-9]+((me[0-9]+)|ac))|(polII)|(pol2))/i) {$mark = $1;} |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
else {$mark = $file;} # if not matching pattern, use file name instead. |
188
|
0
|
0
|
|
|
|
|
if($rec =~ /^(\S+)\t([0-9]+)\t([0-9]+)/){ |
|
0
|
|
|
|
|
|
|
189
|
0
|
|
|
|
|
|
my @cells = split /\t/, $rec; |
190
|
|
|
|
|
|
|
# Extract output info from specified columns. |
191
|
0
|
|
|
|
|
|
my @vinfo; |
192
|
0
|
|
|
|
|
|
foreach my $c(@{$rcol}){ |
|
0
|
|
|
|
|
|
|
193
|
0
|
0
|
0
|
|
|
|
if($c > 0 and $c <= @cells){ |
194
|
0
|
|
|
|
|
|
push @vinfo, $cells[$c-1]; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
0
|
|
|
|
|
|
my $sinfo = ''; |
198
|
0
|
0
|
|
|
|
|
if(@vinfo > 0){ |
199
|
0
|
|
|
|
|
|
$sinfo = join(',', @vinfo); |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
# Return record as a hash table. |
202
|
|
|
|
|
|
|
return { |
203
|
0
|
|
|
|
|
|
chrom => $cells[0], # chromosome name. |
204
|
|
|
|
|
|
|
start => $cells[1], |
205
|
|
|
|
|
|
|
end => $cells[2], |
206
|
|
|
|
|
|
|
siteCenter => ($cells[1] + $cells[2]) / 2, # diff site center. |
207
|
|
|
|
|
|
|
recLoc => $file . ':(' . $sinfo . '):' . $ln, # record location in file:(info):ln format. |
208
|
|
|
|
|
|
|
markType => $mark |
209
|
|
|
|
|
|
|
}; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
else {return 0;} |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# Sort diff sites according to genomic coordinates. |
215
|
|
|
|
|
|
|
sub sort_list{ |
216
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
217
|
0
|
0
|
|
|
|
|
if(@{$self->{siteList}} > 1){ |
|
0
|
|
|
|
|
|
|
218
|
0
|
|
|
|
|
|
@{$self->{siteList}} = sort cmpsite @{$self->{siteList}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# Comparing two sites according to their centers. |
223
|
|
|
|
|
|
|
sub cmpsite{ |
224
|
0
|
|
|
0
|
0
|
|
$a->{chrom} =~ /^(chr)?(\w+)$/; |
225
|
0
|
|
|
|
|
|
my $chr1 = $2; |
226
|
0
|
|
|
|
|
|
$b->{chrom} =~ /^(chr)?(\w+)$/; |
227
|
0
|
|
|
|
|
|
my $chr2 = $2; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# $chr1 = 100 if uc $chr1 eq 'X'; |
230
|
|
|
|
|
|
|
# $chr1 = 101 if uc $chr1 eq 'Y'; |
231
|
|
|
|
|
|
|
# $chr1 = 102 if uc $chr1 eq 'M'; |
232
|
|
|
|
|
|
|
# $chr1 = 199 if uc $chr1 eq 'Z'; |
233
|
|
|
|
|
|
|
# $chr2 = 100 if uc $chr2 eq 'X'; |
234
|
|
|
|
|
|
|
# $chr2 = 101 if uc $chr2 eq 'Y'; |
235
|
|
|
|
|
|
|
# $chr2 = 102 if uc $chr2 eq 'M'; |
236
|
|
|
|
|
|
|
# $chr2 = 199 if uc $chr2 eq 'Z'; |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# return $chr1 <=> $chr2 unless $chr1 == $chr2; |
239
|
|
|
|
|
|
|
|
240
|
0
|
0
|
|
|
|
|
return $chr1 cmp $chr2 unless $chr1 eq $chr2; |
241
|
|
|
|
|
|
|
|
242
|
0
|
|
|
|
|
|
return $a->{siteCenter} <=> $b->{siteCenter}; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# Calculate global lambda. Return negative value if it cannot be determined. |
246
|
|
|
|
|
|
|
sub calc_glbL{ |
247
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
248
|
0
|
0
|
|
|
|
|
if(@{$self->{siteList}} < 2){ |
|
0
|
|
|
|
|
|
|
249
|
0
|
|
|
|
|
|
$self->{'glbLambda'} = INF; |
250
|
0
|
|
|
|
|
|
return; |
251
|
|
|
|
|
|
|
} |
252
|
0
|
|
|
|
|
|
my $tot_dist = 0; # total internal distance. |
253
|
0
|
|
|
|
|
|
my $tot_seg = 0; # total distance segments. |
254
|
0
|
|
|
|
|
|
my $left = 0; |
255
|
|
|
|
|
|
|
# Go through each chromosome and do: |
256
|
|
|
|
|
|
|
# identify the left and right-most site, calculate the distance and |
257
|
|
|
|
|
|
|
# record the number of internal segments; add them to global variables. |
258
|
|
|
|
|
|
|
# Finally, calcualte an averaged distance. |
259
|
0
|
|
|
|
|
|
my $l_site = $self->{siteList}->[$left]; |
260
|
0
|
|
|
|
|
|
my $r_site; |
261
|
|
|
|
|
|
|
my $right; |
262
|
0
|
|
|
|
|
|
for($right = 1; $right < @{$self->{siteList}}; $right++){ |
|
0
|
|
|
|
|
|
|
263
|
0
|
|
|
|
|
|
$r_site = $self->{siteList}->[$right]; |
264
|
0
|
0
|
|
|
|
|
if(!same_chr($l_site, $r_site)){ |
265
|
0
|
|
|
|
|
|
$r_site = $self->{siteList}->[$right-1]; # the last site on same chromosome. |
266
|
0
|
|
|
|
|
|
my $d = site_dist($l_site, $r_site); |
267
|
0
|
0
|
|
|
|
|
if($d < INF){ |
268
|
0
|
|
|
|
|
|
$tot_dist += $d; |
269
|
0
|
|
|
|
|
|
$tot_seg += $right - $left -1; |
270
|
|
|
|
|
|
|
} |
271
|
0
|
|
|
|
|
|
$left = $right; # move into new chromosome. |
272
|
0
|
|
|
|
|
|
$l_site = $self->{siteList}->[$left]; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
} # Upon exit: left and right sites are always on the same chromosome. |
275
|
|
|
|
|
|
|
# Add the last chromosome. |
276
|
0
|
|
|
|
|
|
$tot_dist += site_dist($l_site, $r_site); |
277
|
0
|
|
|
|
|
|
$tot_seg += $right - $left -1; |
278
|
0
|
0
|
|
|
|
|
$self->{'glbLambda'} = $tot_seg > 0? $tot_dist / $tot_seg : INF; |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Determine if two sites are on the same chromosome. |
282
|
|
|
|
|
|
|
sub same_chr{ |
283
|
0
|
|
|
0
|
0
|
|
my($l_site,$r_site) = @_; |
284
|
0
|
|
|
|
|
|
return $l_site->{'chrom'} eq $r_site->{'chrom'}; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# Calcualte distance between two sites. Always return a positive value. |
288
|
|
|
|
|
|
|
# If Left > Right, stop the program and issue a fatal error. |
289
|
|
|
|
|
|
|
sub site_dist{ |
290
|
0
|
|
|
0
|
0
|
|
my($l_site,$r_site) = @_; |
291
|
0
|
0
|
|
|
|
|
if(!same_chr($l_site, $r_site)){ |
292
|
0
|
|
|
|
|
|
return INF; |
293
|
|
|
|
|
|
|
} |
294
|
0
|
0
|
|
|
|
|
if($l_site->{'siteCenter'} > $r_site->{'siteCenter'}){ |
295
|
0
|
|
|
|
|
|
die "Left site has larger coordinate than right site when calculating distance. Check program logic. Stop now.\n"; |
296
|
|
|
|
|
|
|
} |
297
|
0
|
|
|
|
|
|
return $r_site->{'siteCenter'} - $l_site->{'siteCenter'}; |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# Initialize a cluster with the current site. |
301
|
|
|
|
|
|
|
sub init_clust{ |
302
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
303
|
0
|
|
|
|
|
|
$self->{clust}{from} = $self->{curPos}; |
304
|
0
|
|
|
|
|
|
$self->{clust}{to} = $self->{curPos}; |
305
|
0
|
|
|
|
|
|
$self->{clust}{seg} = undef; |
306
|
0
|
|
|
|
|
|
$self->{clust}{dist} = undef; |
307
|
0
|
|
|
|
|
|
$self->{clust}{enrich} = undef; |
308
|
0
|
|
|
|
|
|
$self->{clust}{pval} = 1.0; |
309
|
|
|
|
|
|
|
|
310
|
0
|
|
|
|
|
|
return $self->{clust}{pval}; |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# Are we at the end of the site list? |
314
|
|
|
|
|
|
|
sub is_list_end{ |
315
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
316
|
0
|
|
|
|
|
|
return $self->{curPos} >= @{$self->{siteList}} - 1; |
|
0
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# Shift the leftmost site out of the cluster. |
320
|
|
|
|
|
|
|
sub shift_clust{ |
321
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
322
|
0
|
0
|
|
|
|
|
if($self->{clust}{from} < $self->{clust}{to}){ |
323
|
0
|
|
|
|
|
|
$self->{clust}{from}++; |
324
|
0
|
|
|
|
|
|
return $self->sigeval(); |
325
|
|
|
|
|
|
|
}else{ |
326
|
0
|
|
|
|
|
|
warn "Only one site left in cluster. Shift was not performed.\n"; |
327
|
0
|
|
|
|
|
|
return $self->{clust}{pval}; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# Un-shift the leftmost site into the cluster. |
332
|
|
|
|
|
|
|
sub unshift_clust{ |
333
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
334
|
0
|
0
|
|
|
|
|
if($self->{clust}{from} > 0){ |
335
|
0
|
|
|
|
|
|
$self->{clust}{from}--; |
336
|
0
|
|
|
|
|
|
return $self->sigeval(); |
337
|
|
|
|
|
|
|
}else{ |
338
|
0
|
|
|
|
|
|
warn "Leftmost site is already at the beginning of the difflist. Unshift was not performed.\n"; |
339
|
0
|
|
|
|
|
|
return $self->{clust}{pval}; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# Add a site to the rightmost of the cluster. |
344
|
|
|
|
|
|
|
sub push_clust{ |
345
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
346
|
0
|
0
|
|
|
|
|
if($self->{curPos} < @{$self->{siteList}}){ |
|
0
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
|
$self->{curPos}++; # move current position to the right. |
348
|
0
|
|
|
|
|
|
$self->{clust}{to}++; |
349
|
0
|
|
|
|
|
|
return $self->sigeval(); |
350
|
|
|
|
|
|
|
} |
351
|
|
|
|
|
|
|
else { |
352
|
0
|
|
|
|
|
|
warn "Current position is already at the end of the list. Push was not performed.\n"; |
353
|
0
|
|
|
|
|
|
return $self->{clust}{pval}; |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
# Remove the rightmost site from the cluster. |
358
|
|
|
|
|
|
|
sub pop_clust{ |
359
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
360
|
0
|
0
|
|
|
|
|
if($self->{clust}{to} > $self->{clust}{from}){ |
361
|
0
|
|
|
|
|
|
$self->{clust}{to}--; |
362
|
0
|
|
|
|
|
|
return $self->sigeval(); |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
else{ |
365
|
0
|
|
|
|
|
|
warn "Site list cannot be empty. Pop was not performed.\n"; |
366
|
0
|
|
|
|
|
|
return $self->{clust}{pval}; |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Update distance and segments for current cluster. |
371
|
|
|
|
|
|
|
sub dist_seg_clust{ |
372
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
373
|
0
|
|
|
|
|
|
my $l_loc = $self->{clust}{from}; |
374
|
0
|
|
|
|
|
|
my $r_loc = $self->{clust}{to}; |
375
|
0
|
|
|
|
|
|
$self->{'clust'}{'seg'} = $r_loc - $l_loc; |
376
|
0
|
|
|
|
|
|
my $l_site = $self->{'siteList'}->[$l_loc]; |
377
|
0
|
|
|
|
|
|
my $r_site = $self->{'siteList'}->[$r_loc]; |
378
|
0
|
|
|
|
|
|
$self->{'clust'}{'dist'} = &site_dist($l_site, $r_site); |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Extract a cluster's info to external procedure. |
382
|
|
|
|
|
|
|
sub get_clustinfo{ |
383
|
0
|
|
|
0
|
0
|
|
my($self,$h) = @_; |
384
|
0
|
|
|
|
|
|
my $from = $self->{clust}{from}; |
385
|
0
|
|
|
|
|
|
my $to = $self->{clust}{to}; |
386
|
0
|
0
|
|
|
|
|
if($from < $to){ |
387
|
0
|
|
|
|
|
|
my @v_rec; # vector of records of cluster members. |
388
|
0
|
|
|
|
|
|
for my $i($from..$to){ |
389
|
0
|
|
|
|
|
|
push @v_rec, $self->{siteList}[$i]{recLoc}; |
390
|
|
|
|
|
|
|
} |
391
|
0
|
|
|
|
|
|
my %h_mark; # hash of mark types; |
392
|
0
|
|
|
|
|
|
for my $i($from..$to){ |
393
|
0
|
|
|
|
|
|
$h_mark{$self->{siteList}[$i]{markType}} = 1; |
394
|
|
|
|
|
|
|
} |
395
|
0
|
|
|
|
|
|
my @v_mark = sort {$a cmp $b} keys %h_mark; # vector of unique mark types. |
|
0
|
|
|
|
|
|
|
396
|
0
|
|
|
|
|
|
my $chrom = $self->{siteList}[$from]{chrom}; |
397
|
0
|
|
|
|
|
|
my $start = $self->{siteList}[$from]{start}; |
398
|
0
|
|
|
|
|
|
my $end = $self->{siteList}[$to]{end}; |
399
|
|
|
|
|
|
|
return { # return an anonymous hash. |
400
|
0
|
|
|
|
|
|
chrom => $chrom, |
401
|
|
|
|
|
|
|
start => $start, |
402
|
|
|
|
|
|
|
end => $end, |
403
|
|
|
|
|
|
|
len => $end-$start+1, |
404
|
|
|
|
|
|
|
enrich => $self->{clust}{enrich}, |
405
|
|
|
|
|
|
|
pval => $self->{clust}{pval}, |
406
|
|
|
|
|
|
|
v_rec => [@v_rec], |
407
|
|
|
|
|
|
|
v_mark => [@v_mark] |
408
|
|
|
|
|
|
|
}; |
409
|
|
|
|
|
|
|
}else{ |
410
|
0
|
|
|
|
|
|
warn "Cluster contains less than two sites. No print was done.\n"; |
411
|
0
|
|
|
|
|
|
return {}; |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
} |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
# Get number of sites. |
416
|
|
|
|
|
|
|
sub get_siteN{ |
417
|
0
|
|
|
0
|
0
|
|
my($self) = @_; |
418
|
0
|
|
|
|
|
|
return scalar @{$self->{siteList}}; |
|
0
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# Class to manage cluster list. |
423
|
|
|
|
|
|
|
package diffReps::ClustList; |
424
|
1
|
|
|
1
|
|
8
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
48
|
|
425
|
1
|
|
|
1
|
|
5
|
use MyBioinfo::Common qw(padjBH); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
709
|
|
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# Constructor. |
428
|
|
|
|
|
|
|
sub new{ |
429
|
0
|
|
|
0
|
|
|
my $class = shift; |
430
|
0
|
|
|
|
|
|
my $self = { |
431
|
|
|
|
|
|
|
clustList => [], |
432
|
|
|
|
|
|
|
f_handler => undef |
433
|
|
|
|
|
|
|
}; |
434
|
0
|
|
|
|
|
|
bless $self, $class; |
435
|
0
|
|
|
|
|
|
return $self; |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Add a cluster to the list. |
439
|
|
|
|
|
|
|
sub add{ |
440
|
0
|
|
|
0
|
|
|
my($self,$rc) = @_; |
441
|
0
|
|
|
|
|
|
push @{$self->{clustList}}, $rc; |
|
0
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Adjust cluster P-values by BH. |
445
|
|
|
|
|
|
|
sub adjPval{ |
446
|
0
|
|
|
0
|
|
|
my($self,$N) = @_; |
447
|
0
|
|
|
|
|
|
my @p; |
448
|
0
|
|
|
|
|
|
foreach my $c(@{$self->{clustList}}) {push @p, $c->{pval};} |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
449
|
0
|
|
|
|
|
|
my @padj = padjBH(\@p, $N); |
450
|
0
|
|
|
|
|
|
my $i = 0; # iterator for adjusted P-values. |
451
|
0
|
|
|
|
|
|
foreach my $c(@{$self->{clustList}}) {$c->{padj} = $padj[$i++];} |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# Output all clusters' info to a file. |
455
|
|
|
|
|
|
|
sub output{ |
456
|
0
|
|
|
0
|
|
|
my($self) = @_; |
457
|
0
|
0
|
|
|
|
|
if(!defined $self->{f_handler}){ |
458
|
0
|
|
|
|
|
|
warn "File handler has not been opened yet.\n"; |
459
|
0
|
|
|
|
|
|
return; |
460
|
|
|
|
|
|
|
} |
461
|
0
|
|
|
|
|
|
my $h = $self->{f_handler}; |
462
|
0
|
|
|
|
|
|
foreach my $c(@{$self->{clustList}}){ |
|
0
|
|
|
|
|
|
|
463
|
0
|
|
|
|
|
|
print $h join("\t", $c->{chrom}, |
464
|
|
|
|
|
|
|
$c->{start}, $c->{end}, $c->{len}, |
465
|
|
|
|
|
|
|
$c->{enrich}, |
466
|
|
|
|
|
|
|
$c->{pval}, $c->{padj}, |
467
|
0
|
|
|
|
|
|
scalar @{$c->{v_rec}}, join(';', @{$c->{v_rec}}), |
|
0
|
|
|
|
|
|
|
468
|
0
|
|
|
|
|
|
scalar @{$c->{v_mark}}, join(';', @{$c->{v_mark}})), "\n"; |
|
0
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
# Output header line. |
473
|
|
|
|
|
|
|
sub print_header{ |
474
|
0
|
|
|
0
|
|
|
my($self) = @_; |
475
|
0
|
|
|
|
|
|
my $h = $self->{f_handler}; |
476
|
0
|
0
|
|
|
|
|
if(defined $h){ |
477
|
0
|
|
|
|
|
|
print $h "Chrom\tStart\tEnd\tLength\tenrich\tpval\tpadj\tnsite\tSites\tntype\tMarkType\n"; |
478
|
|
|
|
|
|
|
}else{ |
479
|
0
|
|
|
|
|
|
warn "File handler has not been opened yet. Nothing was done.\n"; |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# Spit a line into opened file. |
484
|
|
|
|
|
|
|
sub spit{ |
485
|
0
|
|
|
0
|
|
|
my($self,$line) = @_; |
486
|
0
|
|
|
|
|
|
my $h = $self->{f_handler}; |
487
|
0
|
0
|
|
|
|
|
if(defined $h){ |
488
|
0
|
|
|
|
|
|
print $h "$line\n"; |
489
|
|
|
|
|
|
|
}else{ |
490
|
0
|
|
|
|
|
|
warn "File handler has not been opened yet. Nothing was done.\n"; |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
# Open file handler. |
495
|
|
|
|
|
|
|
sub open_file{ |
496
|
0
|
|
|
0
|
|
|
my($self,$f) = @_; |
497
|
0
|
|
|
|
|
|
my $h; |
498
|
0
|
0
|
|
|
|
|
open $h, '>', $f or die "Open output file error: $!\n"; |
499
|
0
|
|
|
|
|
|
$self->{f_handler} = $h; |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# Close file handler. |
503
|
|
|
|
|
|
|
sub done{ |
504
|
0
|
|
|
0
|
|
|
my($self) = @_; |
505
|
0
|
0
|
|
|
|
|
close $self->{f_handler} if defined $self->{f_handler}; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# Preloaded methods go here. |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
# Autoload methods go after =cut, and are processed by the autosplit program. |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
1; |
516
|
|
|
|
|
|
|
__END__ |