line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/local/bin/perl |
2
|
|
|
|
|
|
|
#The ShatterProof package is copyright (c) 2013 Ontario Institute for Cancer Research (OICR). |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
#This package and its accompanying libraries is free software; you can redistribute it and/or modify it under the terms of the GPL (either version 1, or at your option, any later version) or the Artistic License 2.0. Refer to LICENSE for the full license text. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
#OICR makes no representations whatsoever as to the SOFTWARE contained herein. It is experimental in nature and is provided WITHOUT WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE OR ANY OTHER WARRANTY, EXPRESS OR IMPLIED. CSHL MAKES NO REPRESENTATION OR WARRANTY THAT THE USE OF THIS SOFTWARE WILL NOT INFRINGE ANY PATENT OR OTHER PROPRIETARY RIGHT. |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
#By downloading this SOFTWARE, your Institution hereby indemnifies OICR against any loss, claim, damage or liability, of whatsoever kind or |
9
|
|
|
|
|
|
|
#nature, which may arise from your Institution's respective use, handling or storage of the SOFTWARE. |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#If publications result from research using this SOFTWARE, we ask that the Ontario Institute for Cancer Research be acknowledged and/or credit be given to OICR scientists, as scientifically appropriate. |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
### Shatterproof.pm ############################################################################### |
14
|
|
|
|
|
|
|
# ShatterProof is a tool that can be used to analyze next generation sequencing data for signs |
15
|
|
|
|
|
|
|
# of chromothripsis. |
16
|
|
|
|
|
|
|
# See POD at end of file for more description |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
### INCLUDES ###################################################################################### |
20
|
9
|
|
|
9
|
|
384330
|
use strict; |
|
9
|
|
|
|
|
21
|
|
|
9
|
|
|
|
|
317
|
|
21
|
9
|
|
|
9
|
|
43
|
use warnings; |
|
9
|
|
|
|
|
12
|
|
|
9
|
|
|
|
|
260
|
|
22
|
9
|
|
|
9
|
|
37
|
use Carp; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
801
|
|
23
|
|
|
|
|
|
|
|
24
|
9
|
|
|
9
|
|
38
|
use vars qw($VERSIONS); |
|
9
|
|
|
|
|
12
|
|
|
9
|
|
|
|
|
420
|
|
25
|
|
|
|
|
|
|
|
26
|
9
|
|
|
9
|
|
40
|
use feature 'switch'; |
|
9
|
|
|
|
|
11
|
|
|
9
|
|
|
|
|
873
|
|
27
|
|
|
|
|
|
|
#use Switch; |
28
|
9
|
|
|
9
|
|
38
|
use File::Basename; |
|
9
|
|
|
|
|
14
|
|
|
9
|
|
|
|
|
532
|
|
29
|
9
|
|
|
9
|
|
41
|
use List::Util qw[min max]; |
|
9
|
|
|
|
|
12
|
|
|
9
|
|
|
|
|
759
|
|
30
|
9
|
|
|
9
|
|
5065
|
use Statistics::Distributions; |
|
9
|
|
|
|
|
29320
|
|
|
9
|
|
|
|
|
620
|
|
31
|
9
|
|
|
9
|
|
6056
|
use POSIX; |
|
9
|
|
|
|
|
62493
|
|
|
9
|
|
|
|
|
61
|
|
32
|
|
|
|
|
|
|
################################################################################################### |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
### HISTORY ####################################################################################### |
35
|
|
|
|
|
|
|
# Version Date Coder Comments |
36
|
|
|
|
|
|
|
# 0.001 2012/03/19 sgovind Versioning start point |
37
|
|
|
|
|
|
|
# 0.002 2012/04/03 sgovind moved input validation methods and run methods |
38
|
|
|
|
|
|
|
# from shatterproof.pl to here |
39
|
|
|
|
|
|
|
# 0.003 2012/10/08 sgovind Updated POD |
40
|
|
|
|
|
|
|
# 0.04 2012/11/25 sgovind Stable build before changing translocation scoring equation |
41
|
|
|
|
|
|
|
# 0.05 2012/12/26 sgovind See change log for details |
42
|
|
|
|
|
|
|
# 0.06 2012/12/27 sgovind Added additional documentation for new config variable |
43
|
|
|
|
|
|
|
# 0.07 2012/12/27 sgovind Added example guide for provided sample data |
44
|
|
|
|
|
|
|
# 0.08 2013/05/22 sgovind Added EXPORT code for test case, added minor error checking |
45
|
|
|
|
|
|
|
# 0.09 2013/06/10 sgovind Minor changes to accomodate testing |
46
|
|
|
|
|
|
|
# 0.10 2013/06/19 sgovind Minor changes to accomodate testing |
47
|
|
|
|
|
|
|
# 0.11 2013/06/24 sgovind Changed all sorts to stable sorts. Reduced number of posix |
48
|
|
|
|
|
|
|
# calculations. |
49
|
|
|
|
|
|
|
# 0.12 2013/06/28 sgovind Changed sort order of interchromosomal translocation output |
50
|
|
|
|
|
|
|
# 0.13 2013/0716 sgovind Corrected logical error in calculate_loh_score |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
our $VERSION = '0.14'; |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
package Shatterproof; |
55
|
9
|
|
|
9
|
|
24114
|
use Exporter; |
|
9
|
|
|
|
|
62
|
|
|
9
|
|
|
|
|
78034
|
|
56
|
|
|
|
|
|
|
our @ISA = 'Exporter'; |
57
|
|
|
|
|
|
|
our @EXPORT = qw( |
58
|
|
|
|
|
|
|
$bin_size |
59
|
|
|
|
|
|
|
$genome_cnv_data_hash_ref |
60
|
|
|
|
|
|
|
$chromosome_copy_number_count_hash_ref |
61
|
|
|
|
|
|
|
$chromosome_cnv_breakpoints_hash_ref |
62
|
|
|
|
|
|
|
$tp53_mutation_found |
63
|
|
|
|
|
|
|
$genome_trans_data_hash_ref |
64
|
|
|
|
|
|
|
$chromosome_translocation_count_hash_ref |
65
|
|
|
|
|
|
|
$genome_trans_breakpoints_hash_ref |
66
|
|
|
|
|
|
|
$genome_mutation_density_hash_ref |
67
|
|
|
|
|
|
|
$suspect_regions_array_ref |
68
|
|
|
|
|
|
|
$likely_regions_array_ref |
69
|
|
|
|
|
|
|
$genome_cnv_data_windows_hash_ref |
70
|
|
|
|
|
|
|
$genome_trans_data_windows_hash_ref |
71
|
|
|
|
|
|
|
$genome_mutation_data_windows_hash_ref |
72
|
|
|
|
|
|
|
$localization_window_size |
73
|
|
|
|
|
|
|
); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
### Global Variables ############################################################################## |
76
|
|
|
|
|
|
|
my $pos = 0; #used to parse command line variables |
77
|
|
|
|
|
|
|
my $ARGC; #stores the number of command line arguments provided |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
my %chromosome_length = ( #stores the sequence length of each chromosome |
80
|
|
|
|
|
|
|
X => 154913754, |
81
|
|
|
|
|
|
|
Y => 57741652, |
82
|
|
|
|
|
|
|
1 => 247199719, |
83
|
|
|
|
|
|
|
2 => 242751149, |
84
|
|
|
|
|
|
|
3 => 199446827, |
85
|
|
|
|
|
|
|
4 => 191263063, |
86
|
|
|
|
|
|
|
5 => 180837866, |
87
|
|
|
|
|
|
|
6 => 170896993, |
88
|
|
|
|
|
|
|
7 => 158821424, |
89
|
|
|
|
|
|
|
8 => 146274826, |
90
|
|
|
|
|
|
|
9 => 140442298, |
91
|
|
|
|
|
|
|
10 => 135374737, |
92
|
|
|
|
|
|
|
11 => 134452384, |
93
|
|
|
|
|
|
|
12 => 132289534, |
94
|
|
|
|
|
|
|
13 => 114127980, |
95
|
|
|
|
|
|
|
14 => 106360585, |
96
|
|
|
|
|
|
|
15 => 100338915, |
97
|
|
|
|
|
|
|
16 => 88822254, |
98
|
|
|
|
|
|
|
17 => 78654742, |
99
|
|
|
|
|
|
|
18 => 76117153, |
100
|
|
|
|
|
|
|
19 => 63806651, |
101
|
|
|
|
|
|
|
20 => 62435965, |
102
|
|
|
|
|
|
|
21 => 46944323, |
103
|
|
|
|
|
|
|
22 => 49528953 |
104
|
|
|
|
|
|
|
); |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
my $TP53_start = 1000000*7.57; #start base pair of the TP53 gene |
107
|
|
|
|
|
|
|
my $TP53_end = 1000000*7.59; #end base pair of the TP53 gene |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
my $insertion_data_present = 0; |
110
|
|
|
|
|
|
|
my $LOH_data_present = 0; |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
#The values for the following 13 variables are defined in the config.pl file |
114
|
|
|
|
|
|
|
our $bin_size; #number of bases pairs that will be compressed into 1 region when analyzing the genome |
115
|
|
|
|
|
|
|
#this value defines how many base pairs are included in one array element in the data_hash_ref varaibles |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
our $localization_window_size; #number of regions to sum together when performing sliding window analysis of the genome |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
our $expected_mutation_density; #the expected mutation density of translocations in a highly mutated region |
120
|
|
|
|
|
|
|
#used to calculate spread factor of translocations |
121
|
|
|
|
|
|
|
our $low_mutation_density_threshold; #the mutation density that will be used to call likely regions |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
our $collapse_regions; #flag variable |
124
|
|
|
|
|
|
|
#value 1: merge overlapping CNV regions that have the same copy number |
125
|
|
|
|
|
|
|
#value 0: do not merge overlapping CNV regions that have the same copy number. If such |
126
|
|
|
|
|
|
|
# regions are found an error is thrown |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
our $outlier_deviation; #the number of standard deviations away from the mean a value has to be in order to be considered non-significant |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
our $translocation_cut_off_count; #the max number of translocation chromosomes that will be tolerated before translocation score is set to 0 |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
our $chromosome_localization_weight; #the scoring formula weight given to the localization of mutations to a specific region on the chromosome |
133
|
|
|
|
|
|
|
our $genome_localization_weight; #the scoring formula weight given to the localization of mutations to a specific chromosome |
134
|
|
|
|
|
|
|
our $cnv_weight; #the scoring formula weight given to the aberrant CNV hallmark |
135
|
|
|
|
|
|
|
our $translocation_weight; #the scoring formula weight given to the localization of translocations |
136
|
|
|
|
|
|
|
our $insertion_breakpoint_weight; #the scoring formula weight given to the number of insertions found at translocation breakpoints |
137
|
|
|
|
|
|
|
our $loh_weight; #the scoring formula weight given to the amount of heterozygosity that is retained in a mutated region |
138
|
|
|
|
|
|
|
our $tp53_mutated_weight; #the scoring formula weight given to the presents or absence of a TP53 mutation |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
### SUB METHODS ################################################################################### |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
#=head2 Sub-Method: run |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
### run ########################################################################################### |
145
|
|
|
|
|
|
|
# Description: |
146
|
|
|
|
|
|
|
# Main method called by shatterproof.pl |
147
|
|
|
|
|
|
|
# Calls primary sub methods |
148
|
|
|
|
|
|
|
# |
149
|
|
|
|
|
|
|
# Input variables: |
150
|
|
|
|
|
|
|
# $argv_ref: reference to @ARGV |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
#=cut |
153
|
|
|
|
|
|
|
sub run { |
154
|
|
|
|
|
|
|
|
155
|
0
|
|
|
0
|
0
|
0
|
my $argv_ref = shift; #parse parameters |
156
|
0
|
|
|
|
|
0
|
my @argv = @{$argv_ref}; #dereference array reference |
|
0
|
|
|
|
|
0
|
|
157
|
|
|
|
|
|
|
|
158
|
0
|
|
|
|
|
0
|
my $cnv_directory; #stores the path to the directory where the CNV input files are found |
159
|
|
|
|
|
|
|
my $trans_directory; #stores the path to the directory where the translocation input files are found |
160
|
0
|
|
|
|
|
0
|
my $insertion_directory; #stores the path to the directory where the insertion input files are found |
161
|
0
|
|
|
|
|
0
|
my $loh_directory; #stores the path to the directory where the loss of hetrozygosity input files are found |
162
|
0
|
|
|
|
|
0
|
my $output_directory; #stores the path to the directory where output files will be placed |
163
|
|
|
|
|
|
|
|
164
|
0
|
|
|
|
|
0
|
my $config_file_path; #stores the path to the configuration file |
165
|
|
|
|
|
|
|
|
166
|
0
|
|
|
|
|
0
|
my $tp53_mutated = 0; #flag variable to indicate if the TP53 gene should be considered to be mutated |
167
|
0
|
|
|
|
|
0
|
my $tp53_mutation_found = 0; #flag variable to indicate if a mutation was found in the TP53 region. This does not affect scoring |
168
|
|
|
|
|
|
|
|
169
|
0
|
|
|
|
|
0
|
my @cnv_files; #list of CNV input files |
170
|
|
|
|
|
|
|
my @trans_files; #list of translocation input files |
171
|
0
|
|
|
|
|
0
|
my @insertion_files; #list of insertion input files |
172
|
0
|
|
|
|
|
0
|
my @loh_files; #list of LOH input files |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
0
|
my $chromosome_copy_number_count_hash_ref; #hash |
176
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
177
|
|
|
|
|
|
|
#key2: copy-number state eg. 0,1,3,20 |
178
|
|
|
|
|
|
|
#value: number of regions with copy number key2 |
179
|
|
|
|
|
|
|
|
180
|
0
|
|
|
|
|
0
|
my $chromosome_cnv_breakpoints_hash_ref; #hash |
181
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
182
|
|
|
|
|
|
|
#value: an array storing the start and end points of all cnv regions on key |
183
|
|
|
|
|
|
|
|
184
|
0
|
|
|
|
|
0
|
my $chromosome_translocation_count_hash_ref; #hash |
185
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
186
|
|
|
|
|
|
|
#key2: chromosome eg. 1,2,X,Y |
187
|
|
|
|
|
|
|
#value: number of translocations between key1 and key2 |
188
|
|
|
|
|
|
|
|
189
|
0
|
|
|
|
|
0
|
my $chromosome_insertion_count_hash_ref; #hash |
190
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
191
|
|
|
|
|
|
|
#value: number of insertions on key |
192
|
|
|
|
|
|
|
|
193
|
0
|
|
|
|
|
0
|
my $chromosome_loh_breakpoints_hash_ref; #hash |
194
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
195
|
|
|
|
|
|
|
#value: an array storing the start and end points of all loh regions on key |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
|
198
|
0
|
|
|
|
|
0
|
my $genome_trans_breakpoints_hash_ref; #hash |
199
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
200
|
|
|
|
|
|
|
#value: an array storing all the translocation breakpoints on key |
201
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
0
|
my $genome_trans_insertion_breakpoints_hash_ref; #hash |
203
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
204
|
|
|
|
|
|
|
#value: an array storing only the translocation breakpoints on key that have a insertion with 10 base pairs |
205
|
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
0
|
my $genome_mutation_density_hash_ref; #hash |
207
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
208
|
|
|
|
|
|
|
#value: the total number of mutation on key divided by the sequence length of key |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
211
|
0
|
|
|
|
|
0
|
my $genome_cnv_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]{key2} |
212
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
213
|
|
|
|
|
|
|
#value: an array storing references to hashes which contain information about the |
214
|
|
|
|
|
|
|
# CNVs in each region of key1. The index of the array indicated the region |
215
|
|
|
|
|
|
|
#key2: 'BPcount' -> gives the number of CNV breakpoints in the region. |
216
|
|
|
|
|
|
|
# a number, eg. '1' -> gives the number of subregions within the region that |
217
|
|
|
|
|
|
|
# have a copy number of 1 |
218
|
|
|
|
|
|
|
|
219
|
0
|
|
|
|
|
0
|
my $genome_trans_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]{key2}{key3} |
220
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
221
|
|
|
|
|
|
|
#value1:an array storing references to hashes which contain information about the |
222
|
|
|
|
|
|
|
# translocations in each region of key1. The index of the array indicated the region. |
223
|
|
|
|
|
|
|
#key2: 'BPcount' -> gives the number of translocation breakpoints in the region |
224
|
|
|
|
|
|
|
# 'in' -> gives a reference to a hash that contains information about translocations |
225
|
|
|
|
|
|
|
# into the region |
226
|
|
|
|
|
|
|
# 'out' -> gives a reference to a hash that contains information about translocations |
227
|
|
|
|
|
|
|
# out of the region |
228
|
|
|
|
|
|
|
#key3: chromosome eg. 1,2,X,Y |
229
|
|
|
|
|
|
|
#value2:the number of subregions in the region that were translocated to key1 from key3 if key2 = 'in' |
230
|
|
|
|
|
|
|
# or |
231
|
|
|
|
|
|
|
# the number of subregions in the region that were translocated from key1 to key3 if key2 = 'out' |
232
|
|
|
|
|
|
|
|
233
|
0
|
|
|
|
|
0
|
my $genome_insertion_data_hash_ref = initialize_genome_hash(); #hash {key1}[index] |
234
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
235
|
|
|
|
|
|
|
#value: an array storing the number of insertions in each region of key1 |
236
|
|
|
|
|
|
|
# The index of the array indicated the region |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
239
|
0
|
|
|
|
|
0
|
my $genome_cnv_data_windows_hash_ref; #hash {key1}[index] |
240
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
241
|
|
|
|
|
|
|
#value: an array storing the number of CNV breakpoints in each window of the genome. |
242
|
|
|
|
|
|
|
# Each window begins at the region indicated by the array index |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
my $genome_trans_data_windows_hash_ref; #hash {key1}[index] |
245
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
246
|
|
|
|
|
|
|
#value: an array storing the number of translocation breakpoints in each window of the genome. |
247
|
|
|
|
|
|
|
# Each window begins at the region indicated by the array index |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
|
250
|
0
|
|
|
|
|
0
|
my $genome_mutation_data_windows_hash_ref; #hash {key1}[index] |
251
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
252
|
|
|
|
|
|
|
#value: an array storing the total number of mutation breakpoints in each window of the genome. |
253
|
|
|
|
|
|
|
# Each window begins at the region indicated by the array index |
254
|
|
|
|
|
|
|
|
255
|
0
|
|
|
|
|
0
|
my $suspect_regions_array_ref; #reference to array that stores regions where chromothripsis most likely occured. Format: chr start end |
256
|
0
|
|
|
|
|
0
|
my $likely_regions_array_ref; #reference to array that stores regions where chromothripsis may have occured. Format: chr start end |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
#Validate input arguements and parse them to the correct variables |
260
|
0
|
|
|
|
|
0
|
validate_input(\@argv, \$cnv_directory, \$trans_directory, \$insertion_directory, \$loh_directory, \$tp53_mutated, \$output_directory, \$config_file_path); |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
#Load the values from the config file |
263
|
0
|
0
|
|
|
|
0
|
if(load_config_file($config_file_path)!=1){ |
264
|
0
|
|
|
|
|
0
|
die("ERROR - could not load config file\n"); |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
0
|
|
|
|
|
0
|
print "CNV dir:\t$cnv_directory\n"; |
268
|
0
|
|
|
|
|
0
|
print "Trans dir:\t$trans_directory\n"; |
269
|
|
|
|
|
|
|
|
270
|
0
|
0
|
|
|
|
0
|
if(defined($insertion_directory)){ |
271
|
0
|
|
|
|
|
0
|
print "insertion dir:\t$insertion_directory\n"; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
|
274
|
0
|
0
|
|
|
|
0
|
if(defined($loh_directory)){ |
275
|
0
|
|
|
|
|
0
|
print "LOH dir:\t$loh_directory\n"; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
0
|
|
|
|
|
0
|
print "Output dir:\t$output_directory\n"; |
279
|
|
|
|
|
|
|
|
280
|
0
|
|
|
|
|
0
|
print "Force TP53 Mutation:\t$tp53_mutated\n\n"; |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
#Get a list of files for each of the provided input directories |
283
|
0
|
|
|
|
|
0
|
@cnv_files = glob ("$cnv_directory"."*.spc"); |
284
|
0
|
|
|
|
|
0
|
@trans_files = glob ("$trans_directory"."*.spt"); |
285
|
|
|
|
|
|
|
|
286
|
0
|
0
|
0
|
|
|
0
|
if(scalar(@cnv_files)==0 || scalar(@trans_files)==0){ |
287
|
0
|
|
|
|
|
0
|
die "ERROR: no CNV or translocation input files found\n"; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
0
|
0
|
|
|
|
0
|
if(defined($insertion_directory)){ |
291
|
0
|
|
|
|
|
0
|
@insertion_files = glob ("$insertion_directory"."*.vcf"); |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
0
|
0
|
|
|
|
0
|
if(defined($loh_directory)){ |
295
|
0
|
|
|
|
|
0
|
@loh_files = glob ("$loh_directory"."*.spl"); |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
#Echo a list of all the input files |
299
|
0
|
|
|
|
|
0
|
$" = "\n\t\t"; |
300
|
0
|
|
|
|
|
0
|
print "CNV files:\t@cnv_files\n\n"; |
301
|
0
|
|
|
|
|
0
|
print "Trans files:\t@trans_files\n\n"; |
302
|
0
|
|
|
|
|
0
|
$" = "\n\t\t"; |
303
|
0
|
0
|
|
|
|
0
|
if(scalar(@insertion_files)==0){ |
304
|
0
|
|
|
|
|
0
|
print "Indel files:\t-none\n\n"; |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
else{ |
307
|
0
|
|
|
|
|
0
|
print "Indel files:\t@insertion_files\n\n"; |
308
|
|
|
|
|
|
|
} |
309
|
0
|
|
|
|
|
0
|
$" = "\n\t"; |
310
|
0
|
0
|
|
|
|
0
|
if(scalar(@loh_files)==0){ |
311
|
0
|
|
|
|
|
0
|
print "LOH files:\t-none\n\n"; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
else{ |
314
|
0
|
|
|
|
|
0
|
print "LOH files:\t@loh_files\n\n"; |
315
|
|
|
|
|
|
|
} |
316
|
0
|
|
|
|
|
0
|
$" = " "; |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
#Create the output directory if it does not exist |
319
|
0
|
0
|
|
|
|
0
|
mkdir ("$output_directory",0770) unless (-d "$output_directory"); |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
#Check that the output directory exists |
322
|
0
|
0
|
|
|
|
0
|
if(!(-e $output_directory)){ |
323
|
0
|
|
|
|
|
0
|
die "ERROR: could not create directory: $output_directory\n"; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
0
|
|
|
|
|
0
|
print "\n--analyzing CNV data\n"; |
327
|
0
|
|
|
|
|
0
|
($genome_cnv_data_hash_ref, $chromosome_copy_number_count_hash_ref, $chromosome_cnv_breakpoints_hash_ref) = analyze_cnv_data($output_directory, \@cnv_files, $bin_size, \$tp53_mutation_found); |
328
|
0
|
|
|
|
|
0
|
print "---done analyzing CNV data\n\n"; |
329
|
|
|
|
|
|
|
|
330
|
0
|
|
|
|
|
0
|
print "--analyzing translocation data\n"; |
331
|
0
|
|
|
|
|
0
|
($genome_trans_data_hash_ref, $chromosome_translocation_count_hash_ref, $genome_trans_breakpoints_hash_ref) = analyze_trans_data($output_directory, \@trans_files, $bin_size, \$tp53_mutation_found); |
332
|
0
|
|
|
|
|
0
|
print "---done analyzing translocation data\n\n"; |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
#If insertion data was provided then analyze it |
335
|
0
|
0
|
|
|
|
0
|
if(defined($insertion_directory)){ |
336
|
0
|
|
|
|
|
0
|
print "--analyzing insertion data\n"; |
337
|
0
|
|
|
|
|
0
|
($genome_insertion_data_hash_ref, $chromosome_insertion_count_hash_ref, $genome_trans_insertion_breakpoints_hash_ref) = analyze_insertion_data($output_directory, \@insertion_files, $bin_size, $genome_trans_breakpoints_hash_ref, \$tp53_mutation_found); |
338
|
0
|
|
|
|
|
0
|
print "---done analyzing insertion data\n\n"; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
#Delete intermediate storage |
342
|
0
|
|
|
|
|
0
|
%$genome_trans_breakpoints_hash_ref = (); |
343
|
0
|
|
|
|
|
0
|
undef $genome_trans_breakpoints_hash_ref; |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
#If LOH data was provided then analyze it |
346
|
0
|
0
|
|
|
|
0
|
if(defined($loh_directory)){ |
347
|
0
|
|
|
|
|
0
|
print "--analyzing LOH data\n"; |
348
|
0
|
|
|
|
|
0
|
($chromosome_loh_breakpoints_hash_ref) = analyze_loh_data($output_directory, \@loh_files, \$tp53_mutation_found); |
349
|
0
|
|
|
|
|
0
|
print "---done analyzing LOH data\n\n"; |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
#Check that the correct format of the LOH hash has been preserved |
352
|
0
|
|
|
|
|
0
|
my %loh_hash = %{$chromosome_loh_breakpoints_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
353
|
0
|
|
|
|
|
0
|
for my $key1 (keys %loh_hash){ |
354
|
0
|
|
|
|
|
0
|
my @a = @{$loh_hash{$key1}}; |
|
0
|
|
|
|
|
0
|
|
355
|
0
|
|
|
|
|
0
|
my $size = @a; |
356
|
|
|
|
|
|
|
|
357
|
0
|
0
|
|
|
|
0
|
if($size % 2 != 0){ |
358
|
0
|
|
|
|
|
0
|
die "ERROR: odd number of loh breakpoints recorded for chromosome $key1\n"; |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
0
|
|
|
|
|
0
|
print "--calculating chromosome mutation densities\n"; |
365
|
0
|
|
|
|
|
0
|
$genome_mutation_density_hash_ref = calculate_genome_localization($output_directory, $chromosome_copy_number_count_hash_ref, $chromosome_translocation_count_hash_ref); |
366
|
0
|
|
|
|
|
0
|
print "---done calculating chromosome mutation densities\n\n"; |
367
|
|
|
|
|
|
|
|
368
|
0
|
|
|
|
|
0
|
print "--calculating chromosome region mutation densities\n"; |
369
|
0
|
|
|
|
|
0
|
($suspect_regions_array_ref, $likely_regions_array_ref, $genome_cnv_data_windows_hash_ref, $genome_trans_data_windows_hash_ref, $genome_mutation_data_windows_hash_ref) = calculate_chromosome_localization($output_directory, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size); |
370
|
0
|
|
|
|
|
0
|
print "---done calculating chromosome region mutation densities\n\n"; |
371
|
|
|
|
|
|
|
|
372
|
0
|
|
|
|
|
0
|
print "--analyzing suspect regions\n"; |
373
|
0
|
|
|
|
|
0
|
analyze_suspect_regions($output_directory, $suspect_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size, $localization_window_size, $tp53_mutated, $tp53_mutation_found, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref); |
374
|
0
|
|
|
|
|
0
|
print "---done analyzing suspect regions\n\n"; |
375
|
|
|
|
|
|
|
|
376
|
0
|
|
|
|
|
0
|
print "--analyzing likely regions\n"; |
377
|
0
|
|
|
|
|
0
|
analyze_likely_regions($output_directory, $likely_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size); |
378
|
0
|
|
|
|
|
0
|
print "---done analyzing likely regions\n\n"; |
379
|
|
|
|
|
|
|
|
380
|
0
|
|
|
|
|
0
|
print "--calculating copy number count\n"; |
381
|
0
|
|
|
|
|
0
|
check_copy_number_count($output_directory, $chromosome_copy_number_count_hash_ref); |
382
|
0
|
|
|
|
|
0
|
print "---done calculating copy number count\n\n"; |
383
|
|
|
|
|
|
|
|
384
|
0
|
|
|
|
|
0
|
print "--calculating switch count\n"; |
385
|
0
|
|
|
|
|
0
|
check_copy_number_switches($output_directory, $chromosome_copy_number_count_hash_ref); |
386
|
0
|
|
|
|
|
0
|
print "---done calculating switch count\n\n"; |
387
|
|
|
|
|
|
|
|
388
|
0
|
|
|
|
|
0
|
print "--calculating interchromosomal translocation rate\n"; |
389
|
0
|
|
|
|
|
0
|
calculate_interchromosomal_translocation_rate($output_directory, $chromosome_translocation_count_hash_ref); |
390
|
0
|
|
|
|
|
0
|
print "---done calculating interchromosomal translocation rate\n"; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
}#sub run |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
#=head2 Sub-Method: validate_input |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
### validate_input ################################################################################ |
397
|
|
|
|
|
|
|
# Description: |
398
|
|
|
|
|
|
|
# Validates command line arguments. Prints error messages if some input if invalid. |
399
|
|
|
|
|
|
|
# |
400
|
|
|
|
|
|
|
# Input variables: |
401
|
|
|
|
|
|
|
# $argv_ref: reference to @ARGV |
402
|
|
|
|
|
|
|
# $cnv_directory_ref: reference to variable storing the CNV input directory |
403
|
|
|
|
|
|
|
# $trans_directory_ref: reference to variable storing the translocation input directory |
404
|
|
|
|
|
|
|
# $insertion_directory_ref: reference to variable storing the insertion input directory |
405
|
|
|
|
|
|
|
# $loh_directory_ref: reference to variable storing the LOH input directory |
406
|
|
|
|
|
|
|
# $tp53_mutated_ref: reference to variable storing the tp53 mutated flag |
407
|
|
|
|
|
|
|
# $output_directory_ref: reference to variable storing the output directory |
408
|
|
|
|
|
|
|
# $config_file_path_ref: reference to variable storing the path to the config file |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
#=cut |
411
|
|
|
|
|
|
|
sub validate_input { |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
#Parse parameters |
414
|
0
|
|
|
0
|
0
|
0
|
my $argv_ref = shift; |
415
|
0
|
|
|
|
|
0
|
my @argv = @{$argv_ref}; |
|
0
|
|
|
|
|
0
|
|
416
|
|
|
|
|
|
|
|
417
|
0
|
|
|
|
|
0
|
my $cnv_directory_ref = shift; |
418
|
0
|
|
|
|
|
0
|
my $trans_directory_ref = shift; |
419
|
0
|
|
|
|
|
0
|
my $insertion_directory_ref = shift; |
420
|
0
|
|
|
|
|
0
|
my $loh_directory_ref = shift; |
421
|
0
|
|
|
|
|
0
|
my $tp53_mutated_ref = shift; |
422
|
0
|
|
|
|
|
0
|
my $output_directory_ref = shift; |
423
|
0
|
|
|
|
|
0
|
my $config_file_path_ref = shift; |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
#Determine number of command line arguements |
426
|
0
|
|
|
|
|
0
|
$ARGC = @argv; |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
#Parse the command line arguements |
429
|
0
|
|
|
|
|
0
|
given ($ARGC) { |
430
|
0
|
|
|
|
|
0
|
when (/^0$/) { usage(0); } #Print error message if no arguements were entered |
|
0
|
|
|
|
|
0
|
|
431
|
|
|
|
|
|
|
|
432
|
0
|
|
|
|
|
0
|
when (/^1$/) { #Check for help option |
433
|
0
|
0
|
|
|
|
0
|
if($argv[0] eq "--help"){ |
434
|
0
|
|
|
|
|
0
|
man_text(); |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
else{ |
437
|
0
|
|
|
|
|
0
|
usage(1); |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
}#case 1 |
440
|
|
|
|
|
|
|
|
441
|
0
|
|
|
|
|
0
|
default { |
442
|
|
|
|
|
|
|
|
443
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--cnv"){ #Check for the cnv input directory option, this field is mandatory |
444
|
0
|
|
|
|
|
0
|
next_arg(2); |
445
|
0
|
|
|
|
|
0
|
$$cnv_directory_ref = $argv[$pos]; |
446
|
0
|
0
|
|
|
|
0
|
if(!(substr($$cnv_directory_ref,-1,1) eq '/')){ |
447
|
0
|
|
|
|
|
0
|
$$cnv_directory_ref = $$cnv_directory_ref.'/'; |
448
|
|
|
|
|
|
|
} |
449
|
0
|
|
|
|
|
0
|
next_arg(3); |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
else { |
452
|
0
|
|
|
|
|
0
|
usage(4); |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
|
456
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--trans"){ #Check for the translocation input directory option, this field is mandatory |
457
|
0
|
|
|
|
|
0
|
next_arg(5); |
458
|
0
|
|
|
|
|
0
|
$$trans_directory_ref = $argv[$pos]; |
459
|
0
|
0
|
|
|
|
0
|
if(!(substr($$trans_directory_ref,-1,1) eq '/')){ |
460
|
0
|
|
|
|
|
0
|
$$trans_directory_ref = $$trans_directory_ref.'/'; |
461
|
|
|
|
|
|
|
} |
462
|
0
|
|
|
|
|
0
|
next_arg(6); |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
else { |
465
|
0
|
|
|
|
|
0
|
usage(7); |
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
|
468
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--insrt"){ #Check for the insertion input directory option |
469
|
0
|
|
|
|
|
0
|
next_arg(8); |
470
|
0
|
|
|
|
|
0
|
$$insertion_directory_ref = $argv[$pos]; |
471
|
0
|
0
|
|
|
|
0
|
if(!(substr($$insertion_directory_ref,-1,1) eq '/')){ |
472
|
0
|
|
|
|
|
0
|
$$insertion_directory_ref = $$insertion_directory_ref.'/'; |
473
|
|
|
|
|
|
|
} |
474
|
0
|
|
|
|
|
0
|
$insertion_data_present = 1; |
475
|
0
|
|
|
|
|
0
|
next_arg(9); |
476
|
|
|
|
|
|
|
} |
477
|
|
|
|
|
|
|
|
478
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--loh"){ #Check for the LOH input directory option |
479
|
0
|
|
|
|
|
0
|
next_arg(10); |
480
|
0
|
|
|
|
|
0
|
$$loh_directory_ref = $argv[$pos]; |
481
|
0
|
0
|
|
|
|
0
|
if(!(substr($$loh_directory_ref,-1,1) eq '/')){ |
482
|
0
|
|
|
|
|
0
|
$$loh_directory_ref = $$loh_directory_ref.'/'; |
483
|
|
|
|
|
|
|
} |
484
|
0
|
|
|
|
|
0
|
$LOH_data_present = 1; |
485
|
0
|
|
|
|
|
0
|
next_arg(11); |
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
|
488
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--tp53"){ #Check for the TP53 gene mutation check option |
489
|
0
|
|
|
|
|
0
|
$$tp53_mutated_ref = 1; |
490
|
0
|
|
|
|
|
0
|
next_arg(12); |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
|
493
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--config"){ #Check for the config file option, this field is mandatory |
494
|
0
|
|
|
|
|
0
|
next_arg(13); |
495
|
0
|
|
|
|
|
0
|
$$config_file_path_ref = $argv[$pos]; |
496
|
0
|
|
|
|
|
0
|
next_arg(14); |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
else{ |
499
|
0
|
|
|
|
|
0
|
usage(15); |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
0
|
0
|
|
|
|
0
|
if($argv[$pos] eq "--output"){ #Check for the output directory option, this field is mandatory |
503
|
0
|
|
|
|
|
0
|
next_arg(16); |
504
|
0
|
|
|
|
|
0
|
$$output_directory_ref = $argv[$pos]; |
505
|
0
|
0
|
|
|
|
0
|
if(!(substr($$output_directory_ref,-1,1) eq '/')){ |
506
|
0
|
|
|
|
|
0
|
$$output_directory_ref = $$output_directory_ref.'/'; |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
else { |
510
|
0
|
|
|
|
|
0
|
usage(17); |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
#Check that there are no other command line arguments |
514
|
0
|
0
|
|
|
|
0
|
if($pos != $ARGC-1){ |
515
|
0
|
|
|
|
|
0
|
usage(18); |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
}#default case |
518
|
|
|
|
|
|
|
}#given ($ARGC) |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
}#sub validate_input |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
#=head2 Sub-Method: analyze_cnv_data |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
### analyze_cnv_data ############################################################################## |
525
|
|
|
|
|
|
|
# Description: |
526
|
|
|
|
|
|
|
# Reads data from files located in the CNV input directory and populates: |
527
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref |
528
|
|
|
|
|
|
|
# $chromosome_copy_number_count_hash_ref |
529
|
|
|
|
|
|
|
# $chromosome_cnv_breakpoints_hash_ref |
530
|
|
|
|
|
|
|
# |
531
|
|
|
|
|
|
|
# Input variables: |
532
|
|
|
|
|
|
|
# $output_directory: stores the path to the output directory |
533
|
|
|
|
|
|
|
# $cnv_files_array_ref: reference to array containing all the CNV input files |
534
|
|
|
|
|
|
|
# $bin_size: stores the size of the bins which the chromosome will be divided into |
535
|
|
|
|
|
|
|
# $tp53_mutation_found_ref: reference to the tp53 mutation found flag |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
#=cut |
538
|
|
|
|
|
|
|
sub analyze_cnv_data { |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
#Parse the parameters |
541
|
11
|
|
|
11
|
0
|
16050574
|
my $output_directory = shift; |
542
|
|
|
|
|
|
|
|
543
|
11
|
|
|
|
|
22
|
my $cnv_files_array_ref = shift; |
544
|
11
|
|
|
|
|
26
|
my @cnv_files = @$cnv_files_array_ref; |
545
|
|
|
|
|
|
|
|
546
|
11
|
|
|
|
|
21
|
my $bin_size = shift; |
547
|
11
|
|
|
|
|
18
|
my $tp53_mutation_found_ref = shift; |
548
|
|
|
|
|
|
|
|
549
|
11
|
|
|
|
|
19
|
my %genome_cnv_data = (); #hash |
550
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
551
|
|
|
|
|
|
|
#value: a reference to an array where each element corresponds to a bin along the |
552
|
|
|
|
|
|
|
# chromosome |
553
|
|
|
|
|
|
|
|
554
|
11
|
|
|
|
|
19
|
my @file_data; #an array storing all the entries from every input file |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
my $CURRENT_FILE; #file handle to the current file that is open |
557
|
0
|
|
|
|
|
0
|
my $TP53_FILE; #file handle to the TP53 CNV mutation output file |
558
|
|
|
|
|
|
|
|
559
|
0
|
|
|
|
|
0
|
my $line; #stores raw line read in from file |
560
|
0
|
|
|
|
|
0
|
my @line_data; #stores tokenized line read in from file |
561
|
|
|
|
|
|
|
|
562
|
11
|
|
|
|
|
22
|
my %chromosome_copy_number_count = (); #hash {chr}{copy number}{count} |
563
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
564
|
|
|
|
|
|
|
#key2: a copy-number state eg 0,1,3,15 |
565
|
|
|
|
|
|
|
#value: the number of region on key1 that have a copy number of key2 |
566
|
|
|
|
|
|
|
|
567
|
11
|
|
|
|
|
205
|
my %chromsome_cnv_breakpoints = ( #hash {chr}[start and end pairs] |
568
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
569
|
|
|
|
|
|
|
Y => [], #value: an array that stored an ordered list of CNV breakpoints on key |
570
|
|
|
|
|
|
|
1 => [], |
571
|
|
|
|
|
|
|
2 => [], |
572
|
|
|
|
|
|
|
3 => [], |
573
|
|
|
|
|
|
|
4 => [], |
574
|
|
|
|
|
|
|
5 => [], |
575
|
|
|
|
|
|
|
6 => [], |
576
|
|
|
|
|
|
|
7 => [], |
577
|
|
|
|
|
|
|
8 => [], |
578
|
|
|
|
|
|
|
9 => [], |
579
|
|
|
|
|
|
|
10 => [], |
580
|
|
|
|
|
|
|
11 => [], |
581
|
|
|
|
|
|
|
12 => [], |
582
|
|
|
|
|
|
|
13 => [], |
583
|
|
|
|
|
|
|
14 => [], |
584
|
|
|
|
|
|
|
15 => [], |
585
|
|
|
|
|
|
|
16 => [], |
586
|
|
|
|
|
|
|
17 => [], |
587
|
|
|
|
|
|
|
18 => [], |
588
|
|
|
|
|
|
|
19 => [], |
589
|
|
|
|
|
|
|
20 => [], |
590
|
|
|
|
|
|
|
21 => [], |
591
|
|
|
|
|
|
|
22 => [] |
592
|
|
|
|
|
|
|
); |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
#Read the contents of the cnv files into memory |
595
|
11
|
100
|
|
|
|
50
|
if($#cnv_files==-1){ |
596
|
1
|
|
|
|
|
7
|
die "ERROR: no cnv files found in analyze_cnv_data\n"; |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
|
599
|
10
|
|
|
|
|
28
|
foreach my $file (@cnv_files){ |
600
|
|
|
|
|
|
|
#Open the file |
601
|
16
|
100
|
|
|
|
559
|
open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n"; |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
#Check that the file is not empty |
604
|
15
|
50
|
|
|
|
6695
|
if(eof($CURRENT_FILE)){ |
605
|
0
|
|
|
|
|
0
|
close ($CURRENT_FILE); |
606
|
0
|
|
|
|
|
0
|
die "ERROR: $file is empty\n"; |
607
|
|
|
|
|
|
|
} |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
#Read header line and validate |
610
|
15
|
|
|
|
|
58
|
$line = <$CURRENT_FILE>; |
611
|
15
|
|
|
|
|
29
|
chomp($line); |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
#Check that the format of the header line is correct |
614
|
15
|
100
|
|
|
|
92
|
if(!($line =~ m/^#chr\tstart\tend\tnumber\tquality$/)){ |
615
|
1
|
|
|
|
|
14
|
close ($CURRENT_FILE); |
616
|
1
|
|
|
|
|
38
|
die "ERROR: header of cnv file $file is invalid\n"; |
617
|
|
|
|
|
|
|
} |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
#Read all the data lines in the file |
620
|
14
|
|
|
|
|
40
|
while( !(eof($CURRENT_FILE)) ){ |
621
|
|
|
|
|
|
|
#read data line |
622
|
836
|
|
|
|
|
1273
|
$line = <$CURRENT_FILE>; |
623
|
836
|
|
|
|
|
643
|
chomp($line); |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
#Validate the data line |
626
|
836
|
100
|
|
|
|
2702
|
if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){ |
627
|
1
|
|
|
|
|
32
|
die "ERROR: invalid line found ($line) in file $file\n"; |
628
|
|
|
|
|
|
|
} |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
#Split the data line and add it to the file_data array |
631
|
835
|
|
|
|
|
2515
|
@line_data = (split (/\t/,$line)); |
632
|
|
|
|
|
|
|
|
633
|
835
|
|
|
|
|
2396
|
push(@file_data,[@line_data]); |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
|
636
|
13
|
|
|
|
|
155
|
close ($CURRENT_FILE); |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
}#foreach my $file (@cnv_files) |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
#Check that there are no overlapping CNV regions with different copy-numbers |
641
|
7
|
|
|
|
|
40
|
@file_data = check_for_overlaps("cnv", \@file_data); |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
#Create TP53 directory and output folder |
645
|
7
|
100
|
|
|
|
963
|
mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53"); |
646
|
7
|
50
|
|
|
|
115
|
if(!(-e "$output_directory"."TP53")){ |
647
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory"."TP53\n"; |
648
|
|
|
|
|
|
|
} |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
#Create the TP53 CNV mutation file |
651
|
7
|
50
|
|
|
|
674
|
open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spc") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spc"; |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
#Print the header of the file (same format as a .spc file) |
654
|
7
|
|
|
|
|
134
|
print $TP53_FILE "#chr\tstart\tend\tnumber\tquality"; |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
#For every data line that was read in from an input file, |
657
|
|
|
|
|
|
|
#record the CNV mutation in the genome_cnv_data_hash, |
658
|
|
|
|
|
|
|
#record the exact breakpoints of the CNV, |
659
|
|
|
|
|
|
|
#update the chromosome_copy_number_count hash and |
660
|
|
|
|
|
|
|
#check if the CNV is in the TP53 region |
661
|
7
|
|
|
|
|
42
|
for (my $n = 0; $n < scalar(@file_data); $n++){ |
662
|
739
|
|
|
|
|
829
|
my $hash = {}; |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
#Ensure that the chromosome value is valid |
665
|
739
|
50
|
|
|
|
3693
|
if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){ |
666
|
0
|
|
|
|
|
0
|
die "ERROR: invalid line found in CNV input file: @{$file_data[$n]}\n"; |
|
0
|
|
|
|
|
0
|
|
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
#Parse the chromosome |
670
|
739
|
|
|
|
|
1259
|
my $chr = $2; |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
#Increment the copy-number count hash based on the line data |
673
|
739
|
|
|
|
|
1616
|
$chromosome_copy_number_count {$chr}{$file_data[$n][3]}++; |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
#Record the exact breakpoints of the CNV |
676
|
739
|
|
|
|
|
637
|
push (@{$chromsome_cnv_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2])); |
|
739
|
|
|
|
|
2005
|
|
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
#Calculate the bin for the start and end breakpoint |
679
|
739
|
|
|
|
|
1289
|
my $start_index = int($file_data[$n][1]/$bin_size); |
680
|
739
|
|
|
|
|
946
|
my $end_index = int($file_data[$n][2]/$bin_size); |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
my $update_bin = sub { |
683
|
0
|
|
|
0
|
|
0
|
my $source_chr = shift; |
684
|
0
|
|
|
|
|
0
|
my $index = shift; |
685
|
0
|
|
|
|
|
0
|
my $copy_num = shift; |
686
|
|
|
|
|
|
|
|
687
|
0
|
|
|
|
|
0
|
my $genome_hash_ref = shift; |
688
|
0
|
|
|
|
|
0
|
my %genome_hash = %{$genome_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
689
|
|
|
|
|
|
|
|
690
|
0
|
|
|
|
|
0
|
my $hash = (); |
691
|
|
|
|
|
|
|
|
692
|
0
|
0
|
|
|
|
0
|
if(!defined(@{$genome_hash{$source_chr}}[$index])){ |
|
0
|
|
|
|
|
0
|
|
693
|
0
|
|
|
|
|
0
|
$hash->{'BPcount'} = 1; |
694
|
0
|
|
|
|
|
0
|
$hash->{$copy_num} = 0.5; |
695
|
0
|
|
|
|
|
0
|
@{$genome_hash{$source_chr}}[$index] = $hash; |
|
0
|
|
|
|
|
0
|
|
696
|
|
|
|
|
|
|
} |
697
|
|
|
|
|
|
|
else{ #if a bin does exist then increment the counts |
698
|
0
|
|
|
|
|
0
|
${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
699
|
0
|
|
|
|
|
0
|
${@{$genome_hash{$source_chr}}[$index]}{$copy_num}+=0.5; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
700
|
|
|
|
|
|
|
} |
701
|
739
|
|
|
|
|
3330
|
}; |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
#Check if a bin exists at the start index |
704
|
|
|
|
|
|
|
#if not, create one |
705
|
739
|
50
|
|
|
|
690
|
if(!defined(@{$genome_cnv_data{$chr}}[$start_index])){ |
|
739
|
|
|
|
|
1428
|
|
706
|
739
|
|
|
|
|
1053
|
$hash->{'BPcount'} = 1; |
707
|
739
|
|
|
|
|
986
|
$hash->{$file_data[$n][3]} = 0.5; |
708
|
739
|
|
|
|
|
621
|
@{$genome_cnv_data{$chr}}[$start_index] = $hash; |
|
739
|
|
|
|
|
84957
|
|
709
|
|
|
|
|
|
|
} |
710
|
|
|
|
|
|
|
else{ #If one does exist increment the counts |
711
|
0
|
|
|
|
|
0
|
${@{$genome_cnv_data{$chr}}[$start_index]}{'BPcount'}++; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
712
|
0
|
|
|
|
|
0
|
${@{$genome_cnv_data{$chr}}[$start_index]}{$file_data[$n][3]}+=0.5; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
713
|
|
|
|
|
|
|
} |
714
|
|
|
|
|
|
|
|
715
|
739
|
|
|
|
|
1268
|
$hash = {}; |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
#Check if a bin exists at the end index |
718
|
|
|
|
|
|
|
#if not, create one |
719
|
739
|
100
|
|
|
|
693
|
if(!defined(@{$genome_cnv_data{$chr}}[$end_index])){ |
|
739
|
|
|
|
|
1432
|
|
720
|
738
|
|
|
|
|
1158
|
$hash->{'BPcount'} = 1; |
721
|
738
|
|
|
|
|
1049
|
$hash->{$file_data[$n][3]} = 0.5; |
722
|
738
|
|
|
|
|
646
|
@{$genome_cnv_data{$chr}}[$end_index] = $hash; |
|
738
|
|
|
|
|
4641
|
|
723
|
|
|
|
|
|
|
} |
724
|
|
|
|
|
|
|
else{ #If one does exist increment the counts |
725
|
1
|
|
|
|
|
2
|
${@{$genome_cnv_data{$chr}}[$end_index]}{'BPcount'}++; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
3
|
|
726
|
1
|
|
|
|
|
2
|
${@{$genome_cnv_data{$chr}}[$end_index]}{$file_data[$n][3]}+=0.5; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
6
|
|
727
|
|
|
|
|
|
|
} |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
#Check if the variation was in the TP53 gene |
731
|
739
|
100
|
100
|
|
|
8074
|
if( |
|
|
|
100
|
|
|
|
|
732
|
|
|
|
|
|
|
( $chr ne 'X' && $chr ne 'Y' ) && |
733
|
|
|
|
|
|
|
( $chr==17 ) |
734
|
|
|
|
|
|
|
){ |
735
|
37
|
100
|
66
|
|
|
622
|
if( |
|
|
|
33
|
|
|
|
|
736
|
|
|
|
|
|
|
( $file_data[$n][3] != 2 ) && |
737
|
|
|
|
|
|
|
( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) ) |
738
|
|
|
|
|
|
|
){ |
739
|
|
|
|
|
|
|
#If a CNV was found in the TP53 region |
740
|
1
|
|
|
|
|
2
|
$$tp53_mutation_found_ref = 1; |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
#Record the mutation in the TP53 CNV file |
743
|
1
|
|
|
|
|
3
|
print $TP53_FILE "\n"; |
744
|
1
|
|
|
|
|
2
|
for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){ |
|
6
|
|
|
|
|
22
|
|
745
|
5
|
|
|
|
|
36
|
print $TP53_FILE "$file_data[$n][$i]"; |
746
|
5
|
100
|
|
|
|
3
|
if($i != scalar(@{$file_data[$n]})-1){ |
|
5
|
|
|
|
|
13
|
|
747
|
4
|
|
|
|
|
5
|
print $TP53_FILE "\t"; |
748
|
|
|
|
|
|
|
}#if |
749
|
|
|
|
|
|
|
}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++) |
750
|
|
|
|
|
|
|
}#if |
751
|
|
|
|
|
|
|
}#if |
752
|
|
|
|
|
|
|
}#for (my $n = 0; $n < scalar(@file_data); $n++) |
753
|
|
|
|
|
|
|
|
754
|
7
|
|
|
|
|
1952
|
close($TP53_FILE); |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
#return hash |
757
|
7
|
|
|
|
|
562
|
return(\%genome_cnv_data, \%chromosome_copy_number_count, \%chromsome_cnv_breakpoints); |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
}#sub analyze_cnv_data |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
#=head2 Sub-Method: check_for_overlaps |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
### check_for_overlaps ############################################################################ |
764
|
|
|
|
|
|
|
# Description: |
765
|
|
|
|
|
|
|
# Checks if there were overlapping CNV regions with different copy-numbers in the input files. |
766
|
|
|
|
|
|
|
# Also checks if there are any overlapping translocation destinations or overlapping LOH |
767
|
|
|
|
|
|
|
# regions. |
768
|
|
|
|
|
|
|
# |
769
|
|
|
|
|
|
|
# Input variables: |
770
|
|
|
|
|
|
|
# $type: flag variable indicating which type of overlap to check for "cnv", "trans", |
771
|
|
|
|
|
|
|
# or "loh" |
772
|
|
|
|
|
|
|
# $file_data_ref: reference to an array storing all the data lines read in from the specific |
773
|
|
|
|
|
|
|
# type of input file |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
#=cut |
776
|
|
|
|
|
|
|
sub check_for_overlaps { |
777
|
|
|
|
|
|
|
|
778
|
7
|
|
|
7
|
0
|
59
|
my $type = shift; |
779
|
7
|
|
|
|
|
13
|
my $file_data_ref = shift; |
780
|
7
|
|
|
|
|
70
|
my @file_data = @$file_data_ref; |
781
|
|
|
|
|
|
|
|
782
|
7
|
|
|
|
|
39
|
my $start_overlap = 0; #Flag variable indicating if the start position of one region is overlapping with |
783
|
|
|
|
|
|
|
#another region |
784
|
|
|
|
|
|
|
|
785
|
7
|
|
|
|
|
22
|
my $end_overlap = 0; #Flag variable indicating if the end position of one region is overlapping with |
786
|
|
|
|
|
|
|
#another region |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
#Check for overlapping regions |
789
|
|
|
|
|
|
|
#Compares each entry in the array to every following entry |
790
|
7
|
|
|
|
|
45
|
for (my $n = 0; $n < scalar(@file_data); $n++){ |
791
|
739
|
|
|
|
|
1232
|
for (my $k = $n+1; $k < scalar(@file_data); $k++){ |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
#Check if the 2 regions in question are on the same chromosome |
794
|
55656
|
100
|
|
|
|
124506
|
if($file_data[$n][0] eq $file_data[$k][0]) { |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
#Check if the end point of region 1 is within region 2 |
797
|
2862
|
100
|
100
|
|
|
7298
|
if($file_data[$n][2]>=$file_data[$k][1] && $file_data[$n][2]<=$file_data[$k][2]){ |
798
|
96
|
|
|
|
|
103
|
$end_overlap = 1; |
799
|
|
|
|
|
|
|
} |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
#Check if the start point of region 1 is within region 2 |
802
|
2862
|
100
|
100
|
|
|
6389
|
if($file_data[$n][1]>=$file_data[$k][1] && $file_data[$n][1]<=$file_data[$k][2]){ |
803
|
78
|
|
|
|
|
95
|
$start_overlap = 1; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
#If an overlap was detected |
807
|
2862
|
100
|
100
|
|
|
16682
|
if($start_overlap==1 || $end_overlap==1) { |
|
|
50
|
66
|
|
|
|
|
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
#if it was a translocation overlap then throw an error |
810
|
96
|
50
|
|
|
|
200
|
if($type eq "trans"){ |
811
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
812
|
|
|
|
|
|
|
} |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
#if it was a LOH overlap then throw an error |
815
|
96
|
50
|
|
|
|
171
|
if($type eq "loh"){ |
816
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
#if it was a CNV overlap check if the copy numbers are the same |
820
|
|
|
|
|
|
|
#If they are different throw an error |
821
|
96
|
50
|
33
|
|
|
597
|
if( |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
822
|
|
|
|
|
|
|
( $type eq "cnv") && |
823
|
|
|
|
|
|
|
( $file_data[$n][3] != $file_data[$k][3] ) |
824
|
|
|
|
|
|
|
){ |
825
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
#if they are the same |
829
|
|
|
|
|
|
|
#and the user does not wish to collapse overlapping regions with the same copy number then throw an error |
830
|
|
|
|
|
|
|
elsif ($collapse_regions==0) { |
831
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
832
|
|
|
|
|
|
|
} |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
#if the user wishes to collapses overlapping regions with the same copy number then do so |
835
|
|
|
|
|
|
|
elsif ($collapse_regions==1) { |
836
|
|
|
|
|
|
|
#Region 2 completely encompasses region 1 |
837
|
|
|
|
|
|
|
#So replace region 1 with region 2 |
838
|
96
|
100
|
66
|
|
|
340
|
if($start_overlap==1 && $end_overlap==1){ |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
839
|
78
|
|
|
|
|
158
|
$file_data[$n][1] = $file_data[$k][1]; |
840
|
78
|
|
|
|
|
102
|
$file_data[$n][2] = $file_data[$k][2]; |
841
|
|
|
|
|
|
|
} |
842
|
|
|
|
|
|
|
#The start point of region 1 is within region 2 |
843
|
|
|
|
|
|
|
#So replace the start point of region 1 with the start point of region 2 |
844
|
|
|
|
|
|
|
elsif($start_overlap==1){ |
845
|
0
|
|
|
|
|
0
|
$file_data[$n][1] = $file_data[$k][1]; |
846
|
|
|
|
|
|
|
} |
847
|
|
|
|
|
|
|
#The end point of region 1 is within region 2 |
848
|
|
|
|
|
|
|
#So replace the end point of region 1 with the end point of region 2 |
849
|
|
|
|
|
|
|
elsif($end_overlap==1){ |
850
|
18
|
|
|
|
|
41
|
$file_data[$n][2] = $file_data[$k][2]; |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
#If region 1 was modified then remove region 2 and re-check for overlaps |
853
|
96
|
50
|
66
|
|
|
377
|
if($start_overlap==1 || $end_overlap==1){ |
854
|
96
|
|
|
|
|
2482
|
@file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]); |
855
|
96
|
|
|
|
|
539
|
$start_overlap = 0; |
856
|
96
|
|
|
|
|
88
|
$end_overlap = 0; |
857
|
96
|
|
|
|
|
92
|
$k = $n+1; |
858
|
96
|
|
|
|
|
129
|
redo; |
859
|
|
|
|
|
|
|
} |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
}#elsif ($collapse_regions==1) |
862
|
|
|
|
|
|
|
}#if($start_overlap==1 || $end_overlap==1) |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
#Check if region 1 completely encompasses region 2 |
865
|
|
|
|
|
|
|
elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2]){ |
866
|
|
|
|
|
|
|
|
867
|
0
|
0
|
|
|
|
0
|
if($type eq "trans"){ |
868
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
869
|
|
|
|
|
|
|
} |
870
|
|
|
|
|
|
|
|
871
|
0
|
0
|
|
|
|
0
|
if($type eq "loh"){ |
872
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
873
|
|
|
|
|
|
|
} |
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
#If the copy numbers are different throw an error |
876
|
0
|
0
|
0
|
|
|
0
|
if( |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
877
|
|
|
|
|
|
|
( $type eq "cnv") && |
878
|
|
|
|
|
|
|
( $file_data[$n][3] != $file_data[$k][3] ) |
879
|
|
|
|
|
|
|
){ |
880
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
881
|
|
|
|
|
|
|
} |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
#If the copy numbers are the same but the user does not want to collapse then throw an error |
884
|
|
|
|
|
|
|
elsif ($collapse_regions==0) { |
885
|
0
|
|
|
|
|
0
|
die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
886
|
|
|
|
|
|
|
} |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
#If the user does wish to collapse then remove the second region |
889
|
|
|
|
|
|
|
elsif ($collapse_regions==1) { |
890
|
0
|
|
|
|
|
0
|
@file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]); |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
}#elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2]) |
893
|
|
|
|
|
|
|
}#if($file_data[$n][0] eq $file_data[$k][0]) |
894
|
|
|
|
|
|
|
}#for (my $n = 0; $n < scalar(@file_data); $n++) |
895
|
|
|
|
|
|
|
}#for (my $n = 0; $n < scalar(@file_data); $n++) |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
#Return the updated entries |
898
|
7
|
|
|
|
|
278
|
return (@file_data); |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
}#sub check_for_overlaps |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
#=head2 Sub-Method: analyze_trans_data |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
### analyze_trans_data ############################################################################ |
905
|
|
|
|
|
|
|
# Description: |
906
|
|
|
|
|
|
|
# Reads data from files located in the trans input directory and popultates: |
907
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref |
908
|
|
|
|
|
|
|
# $chromosome_translocation_count_hash_ref |
909
|
|
|
|
|
|
|
# $genome_trans_breakpoints_hash_ref |
910
|
|
|
|
|
|
|
# |
911
|
|
|
|
|
|
|
# Input variables: |
912
|
|
|
|
|
|
|
# $output_directory: stores the path to the output directory |
913
|
|
|
|
|
|
|
# $trans_files_array_ref: reference to array containing all the translocation |
914
|
|
|
|
|
|
|
# input files |
915
|
|
|
|
|
|
|
# $bin_size: stores the size of the bins which the chromosome will be divided into |
916
|
|
|
|
|
|
|
# $tp53_mutation_found_ref: reference to the tp53 mutation found flag |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
#=cut |
919
|
|
|
|
|
|
|
sub analyze_trans_data { |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
#Parse the parameters |
922
|
10
|
|
|
10
|
0
|
8172214
|
my $output_directory = shift; |
923
|
10
|
|
|
|
|
57
|
my $trans_files_array_ref = shift; |
924
|
10
|
|
|
|
|
26
|
my @trans_files = @$trans_files_array_ref; |
925
|
|
|
|
|
|
|
|
926
|
10
|
|
|
|
|
13
|
my $bin_size = shift; |
927
|
|
|
|
|
|
|
|
928
|
10
|
|
|
|
|
14
|
my $tp53_mutation_found_ref = shift; |
929
|
|
|
|
|
|
|
|
930
|
10
|
|
|
|
|
19
|
my %genome_trans_data = (); #hash |
931
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
932
|
|
|
|
|
|
|
#value: a reference to an array where each element corresponds to a bin along the |
933
|
|
|
|
|
|
|
# chromosome |
934
|
|
|
|
|
|
|
|
935
|
10
|
|
|
|
|
14
|
my @file_data; #an array storing all the entries from every input file |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
my $CURRENT_FILE; #file handle to the current file that is open |
938
|
0
|
|
|
|
|
0
|
my $TP53_FILE; #file handle to the TP53 translocation mutation output file |
939
|
|
|
|
|
|
|
|
940
|
0
|
|
|
|
|
0
|
my $line; #stores raw line read in from file |
941
|
0
|
|
|
|
|
0
|
my @line_data; #stores tokenized line read in from file |
942
|
|
|
|
|
|
|
|
943
|
0
|
|
|
|
|
0
|
my $chr1; |
944
|
0
|
|
|
|
|
0
|
my $chr2; |
945
|
|
|
|
|
|
|
|
946
|
10
|
|
|
|
|
16
|
my %chromosome_trans_count = (); #hash {chr}{chr}{count} |
947
|
|
|
|
|
|
|
#key1: chromosome eg. 1,2,X,Y |
948
|
|
|
|
|
|
|
#key2: chromosome eg. 1,2,X,Y |
949
|
|
|
|
|
|
|
#value: the number of translocations between key1 and key2 |
950
|
|
|
|
|
|
|
|
951
|
10
|
|
|
|
|
175
|
my %genome_trans_breakpoints = ( #hash {chr}[array] |
952
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
953
|
|
|
|
|
|
|
Y => [], #value: an array storing all the translocation breakpoints |
954
|
|
|
|
|
|
|
1 => [], # on key |
955
|
|
|
|
|
|
|
2 => [], |
956
|
|
|
|
|
|
|
3 => [], |
957
|
|
|
|
|
|
|
4 => [], |
958
|
|
|
|
|
|
|
5 => [], |
959
|
|
|
|
|
|
|
6 => [], |
960
|
|
|
|
|
|
|
7 => [], |
961
|
|
|
|
|
|
|
8 => [], |
962
|
|
|
|
|
|
|
9 => [], |
963
|
|
|
|
|
|
|
10 => [], |
964
|
|
|
|
|
|
|
11 => [], |
965
|
|
|
|
|
|
|
12 => [], |
966
|
|
|
|
|
|
|
13 => [], |
967
|
|
|
|
|
|
|
14 => [], |
968
|
|
|
|
|
|
|
15 => [], |
969
|
|
|
|
|
|
|
16 => [], |
970
|
|
|
|
|
|
|
17 => [], |
971
|
|
|
|
|
|
|
18 => [], |
972
|
|
|
|
|
|
|
19 => [], |
973
|
|
|
|
|
|
|
20 => [], |
974
|
|
|
|
|
|
|
21 => [], |
975
|
|
|
|
|
|
|
22 => [] |
976
|
|
|
|
|
|
|
); |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
#Read the contents of the cnv files into memory |
980
|
10
|
100
|
|
|
|
43
|
if($#trans_files==-1){ |
981
|
1
|
|
|
|
|
8
|
die "ERROR: no trans files found in analyze_trans_data\n"; |
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
|
984
|
9
|
|
|
|
|
23
|
foreach my $file (@trans_files){ |
985
|
|
|
|
|
|
|
#open the file |
986
|
14
|
50
|
|
|
|
582
|
open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n"; |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
#check that the file is not empty |
989
|
14
|
100
|
|
|
|
2187
|
if(eof($CURRENT_FILE)){ |
990
|
1
|
|
|
|
|
6
|
close ($CURRENT_FILE); |
991
|
1
|
|
|
|
|
24
|
die "ERROR: $file is empty\n"; |
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
#read header line and validate |
995
|
13
|
|
|
|
|
53
|
$line = <$CURRENT_FILE>; |
996
|
13
|
|
|
|
|
25
|
chomp($line); |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
#validate the header line |
999
|
13
|
100
|
|
|
|
81
|
if(!($line =~ m/^#chr1\tstart\tend\tchr2\tstart\tend\tquality$/)){ |
1000
|
1
|
|
|
|
|
11
|
close ($CURRENT_FILE); |
1001
|
1
|
|
|
|
|
24
|
die "ERROR: header of translocation file $file is invalid\n"; |
1002
|
|
|
|
|
|
|
} |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
#read in every data line |
1005
|
12
|
|
|
|
|
37
|
while( !(eof($CURRENT_FILE)) ){ |
1006
|
|
|
|
|
|
|
#read data line |
1007
|
742
|
|
|
|
|
1245
|
$line = <$CURRENT_FILE>; |
1008
|
742
|
|
|
|
|
677
|
chomp($line); |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
#validate the format of the data line |
1011
|
742
|
100
|
|
|
|
3183
|
if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){ |
1012
|
1
|
|
|
|
|
33
|
die "ERROR: invalid line found in translocation input file: ($line) in file $file\n"; |
1013
|
0
|
|
|
|
|
0
|
next; |
1014
|
|
|
|
|
|
|
} |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
#Split the data line and add it to the array |
1017
|
741
|
|
|
|
|
2793
|
@line_data = (split (/\t/,$line)); |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
#if the start position is greater than the end position for either the source or destination throw an error |
1020
|
741
|
100
|
66
|
|
|
2797
|
if( |
1021
|
|
|
|
|
|
|
( $line_data[1] >= $line_data[2] ) || |
1022
|
|
|
|
|
|
|
( $line_data[4] >= $line_data[5] ) |
1023
|
|
|
|
|
|
|
){ |
1024
|
|
|
|
|
|
|
#warn "ERROR: invalid line found ($line) in file $file. Start or end values invalid\n"; |
1025
|
|
|
|
|
|
|
#next; |
1026
|
|
|
|
|
|
|
} |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
#Add the data line to the file_data array |
1029
|
741
|
|
|
|
|
2566
|
push(@file_data,[@line_data]); |
1030
|
|
|
|
|
|
|
} |
1031
|
|
|
|
|
|
|
|
1032
|
11
|
|
|
|
|
104
|
close ($CURRENT_FILE); |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
}#foreach my $file (@trans_files) |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
#ignoring overlapping translocations for now |
1037
|
|
|
|
|
|
|
#@file_data = check_for_overlaps("trans", \@file_data); |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
#Create TP53 directory and output folder |
1040
|
6
|
100
|
|
|
|
265
|
mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53"); |
1041
|
6
|
50
|
|
|
|
69
|
if(!(-e "$output_directory"."TP53")){ |
1042
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory"."TP53\n"; |
1043
|
|
|
|
|
|
|
} |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
#create TP53 translocation mutation file |
1046
|
6
|
50
|
|
|
|
510
|
open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spt") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spt"; |
1047
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
#Print the header of the file (same format as a .spt file) |
1049
|
6
|
|
|
|
|
36
|
print $TP53_FILE "#chr1\tstart\tend\tchr2\tstart\tend\tquality"; |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
#For every data line that was read in from an input file, |
1052
|
|
|
|
|
|
|
#record the translocation mutation in the genome_trans_data_hash, |
1053
|
|
|
|
|
|
|
#record the exact breakpoints of the translocation, |
1054
|
|
|
|
|
|
|
#update the chromosome_trans_count hash and |
1055
|
|
|
|
|
|
|
#check if the translocation is in the TP53 region |
1056
|
6
|
|
|
|
|
116
|
for (my $n = 0; $n < scalar(@file_data); $n++){ |
1057
|
|
|
|
|
|
|
|
1058
|
741
|
|
|
|
|
831
|
my $hash = {}; |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
#verify that the chromosome 1 is valid |
1061
|
741
|
50
|
|
|
|
3569
|
if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){ |
1062
|
0
|
|
|
|
|
0
|
die "ERROR: invalid chromosome field detected in translocation file\n"; |
1063
|
|
|
|
|
|
|
} |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
#parse out chromosome 1 |
1066
|
741
|
|
|
|
|
1400
|
my $chr1 = $2; |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
#verify that the chromosome 2 is valid |
1069
|
741
|
50
|
|
|
|
1892
|
if(!($file_data[$n][3] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){ |
1070
|
0
|
|
|
|
|
0
|
die "ERROR: invalid chromosome field detected in translocation file\n"; |
1071
|
|
|
|
|
|
|
} |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
#parse out chromosome 2 |
1074
|
741
|
|
|
|
|
825
|
my $chr2 = $2; |
1075
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
#calculate the bin where each breakpoint will be placed |
1077
|
741
|
|
|
|
|
1397
|
my $start_index1 = int($file_data[$n][1]/$bin_size); |
1078
|
741
|
|
|
|
|
801
|
my $end_index1 = int($file_data[$n][2]/$bin_size); |
1079
|
|
|
|
|
|
|
|
1080
|
741
|
|
|
|
|
950
|
my $start_index2 = int($file_data[$n][4]/$bin_size); |
1081
|
741
|
|
|
|
|
848
|
my $end_index2 = int($file_data[$n][5]/$bin_size); |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
my $update_bin = sub { |
1084
|
0
|
|
|
0
|
|
0
|
my $source_chr = shift; |
1085
|
0
|
|
|
|
|
0
|
my $dest_chr = shift; |
1086
|
0
|
|
|
|
|
0
|
my $index = shift; |
1087
|
0
|
|
|
|
|
0
|
my $data = shift; |
1088
|
0
|
|
|
|
|
0
|
my $type = shift; |
1089
|
|
|
|
|
|
|
|
1090
|
0
|
|
|
|
|
0
|
my $genome_hash_ref = shift; |
1091
|
0
|
|
|
|
|
0
|
my %genome_hash = %{$genome_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
1092
|
|
|
|
|
|
|
|
1093
|
0
|
|
|
|
|
0
|
my $hash = (); |
1094
|
|
|
|
|
|
|
|
1095
|
0
|
0
|
|
|
|
0
|
if(!defined(@{$genome_hash{$source_chr}}[$index])){ |
|
0
|
|
|
|
|
0
|
|
1096
|
0
|
|
|
|
|
0
|
$hash->{'BPcount'} = 1; |
1097
|
0
|
|
|
|
|
0
|
push (@{$hash->{$type}{$dest_chr}}, $data); |
|
0
|
|
|
|
|
0
|
|
1098
|
0
|
|
|
|
|
0
|
@{$genome_hash{$source_chr}}[$index] = $hash; |
|
0
|
|
|
|
|
0
|
|
1099
|
|
|
|
|
|
|
} |
1100
|
|
|
|
|
|
|
else{ #if a bin does exist then increment the counts |
1101
|
0
|
|
|
|
|
0
|
${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1102
|
0
|
|
|
|
|
0
|
push (@{${@{$genome_hash{$source_chr}}[$index]}{$type}{$dest_chr}}, $data); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1103
|
|
|
|
|
|
|
} |
1104
|
|
|
|
|
|
|
|
1105
|
741
|
|
|
|
|
2560
|
}; |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
#check if a bin exists at $start_index1 |
1108
|
|
|
|
|
|
|
#if not, create one |
1109
|
741
|
100
|
|
|
|
713
|
if(!defined(@{$genome_trans_data{$chr1}}[$start_index1])){ |
|
741
|
|
|
|
|
1665
|
|
1110
|
516
|
|
|
|
|
741
|
$hash->{'BPcount'} = 1; |
1111
|
516
|
|
|
|
|
446
|
push (@{$hash->{'out'}{$chr2}}, $file_data[$n][4]); |
|
516
|
|
|
|
|
1283
|
|
1112
|
516
|
|
|
|
|
495
|
@{$genome_trans_data{$chr1}}[$start_index1] = $hash; |
|
516
|
|
|
|
|
39736
|
|
1113
|
|
|
|
|
|
|
} |
1114
|
|
|
|
|
|
|
else{ #if a bin does exist then increment the counts |
1115
|
225
|
|
|
|
|
176
|
${@{$genome_trans_data{$chr1}}[$start_index1]}{'BPcount'}++; |
|
225
|
|
|
|
|
164
|
|
|
225
|
|
|
|
|
341
|
|
1116
|
225
|
|
|
|
|
189
|
push (@{${@{$genome_trans_data{$chr1}}[$start_index1]}{'out'}{$chr2}}, $file_data[$n][4]); |
|
225
|
|
|
|
|
163
|
|
|
225
|
|
|
|
|
168
|
|
|
225
|
|
|
|
|
531
|
|
1117
|
|
|
|
|
|
|
} |
1118
|
|
|
|
|
|
|
|
1119
|
741
|
|
|
|
|
1103
|
$hash = {}; |
1120
|
741
|
100
|
|
|
|
618
|
if(!defined(@{$genome_trans_data{$chr1}}[$end_index1])){ |
|
741
|
|
|
|
|
1328
|
|
1121
|
275
|
|
|
|
|
405
|
$hash->{'BPcount'} = 1; |
1122
|
275
|
|
|
|
|
214
|
push (@{$hash->{'out'}{$chr2}}, $file_data[$n][5]); |
|
275
|
|
|
|
|
617
|
|
1123
|
275
|
|
|
|
|
263
|
@{$genome_trans_data{$chr1}}[$end_index1] = $hash; |
|
275
|
|
|
|
|
1026
|
|
1124
|
|
|
|
|
|
|
} |
1125
|
|
|
|
|
|
|
else{ |
1126
|
466
|
|
|
|
|
320
|
${@{$genome_trans_data{$chr1}}[$end_index1]}{'BPcount'}++; |
|
466
|
|
|
|
|
350
|
|
|
466
|
|
|
|
|
614
|
|
1127
|
466
|
|
|
|
|
389
|
push (@{${@{$genome_trans_data{$chr1}}[$end_index1]}{'out'}{$chr2}}, $file_data[$n][5]); |
|
466
|
|
|
|
|
328
|
|
|
466
|
|
|
|
|
349
|
|
|
466
|
|
|
|
|
945
|
|
1128
|
|
|
|
|
|
|
} |
1129
|
|
|
|
|
|
|
|
1130
|
741
|
|
|
|
|
817
|
$hash = {}; |
1131
|
741
|
100
|
|
|
|
676
|
if(!defined(@{$genome_trans_data{$chr2}}[$start_index2])){ |
|
741
|
|
|
|
|
1168
|
|
1132
|
361
|
|
|
|
|
548
|
$hash->{'BPcount'} = 1; |
1133
|
361
|
|
|
|
|
271
|
push (@{$hash->{'in'}{$chr1}}, $file_data[$n][1]); |
|
361
|
|
|
|
|
785
|
|
1134
|
361
|
|
|
|
|
332
|
@{$genome_trans_data{$chr2}}[$start_index2] = $hash; |
|
361
|
|
|
|
|
3532
|
|
1135
|
|
|
|
|
|
|
} |
1136
|
|
|
|
|
|
|
else{ |
1137
|
380
|
|
|
|
|
276
|
${@{$genome_trans_data{$chr2}}[$start_index2]}{'BPcount'}++; |
|
380
|
|
|
|
|
399
|
|
|
380
|
|
|
|
|
549
|
|
1138
|
380
|
|
|
|
|
346
|
push (@{${@{$genome_trans_data{$chr2}}[$start_index2]}{'in'}{$chr1}}, $file_data[$n][1]); |
|
380
|
|
|
|
|
255
|
|
|
380
|
|
|
|
|
285
|
|
|
380
|
|
|
|
|
980
|
|
1139
|
|
|
|
|
|
|
} |
1140
|
|
|
|
|
|
|
|
1141
|
741
|
|
|
|
|
788
|
$hash = {}; |
1142
|
741
|
100
|
|
|
|
675
|
if(!defined(@{$genome_trans_data{$chr2}}[$end_index2])){ |
|
741
|
|
|
|
|
1134
|
|
1143
|
160
|
|
|
|
|
207
|
$hash->{'BPcount'} = 1; |
1144
|
160
|
|
|
|
|
131
|
push (@{$hash->{'in'}{$chr1}}, $file_data[$n][2]); |
|
160
|
|
|
|
|
360
|
|
1145
|
160
|
|
|
|
|
144
|
@{$genome_trans_data{$chr2}}[$end_index2] = $hash; |
|
160
|
|
|
|
|
573
|
|
1146
|
|
|
|
|
|
|
} |
1147
|
|
|
|
|
|
|
else{ |
1148
|
581
|
|
|
|
|
448
|
${@{$genome_trans_data{$chr2}}[$end_index2]}{'BPcount'}++; |
|
581
|
|
|
|
|
426
|
|
|
581
|
|
|
|
|
739
|
|
1149
|
581
|
|
|
|
|
521
|
push (@{${@{$genome_trans_data{$chr2}}[$end_index2]}{'in'}{$chr1}}, $file_data[$n][2]); |
|
581
|
|
|
|
|
411
|
|
|
581
|
|
|
|
|
420
|
|
|
581
|
|
|
|
|
1222
|
|
1150
|
|
|
|
|
|
|
} |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
#Increment hash translocation counts |
1153
|
741
|
|
|
|
|
992
|
$chromosome_trans_count{$chr1}{$chr2}++; |
1154
|
|
|
|
|
|
|
#if the translocation is intra-chromosomal then don't count it twice |
1155
|
741
|
100
|
|
|
|
1294
|
if($chr1 ne $chr2){ |
1156
|
316
|
|
|
|
|
390
|
$chromosome_trans_count{$chr2}{$chr1}++; |
1157
|
|
|
|
|
|
|
} |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
#store the breakpoints in their bins |
1160
|
741
|
|
|
|
|
563
|
push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][1]); |
|
741
|
|
|
|
|
1240
|
|
1161
|
741
|
|
|
|
|
665
|
push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][2]); |
|
741
|
|
|
|
|
989
|
|
1162
|
|
|
|
|
|
|
|
1163
|
741
|
|
|
|
|
567
|
push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][4]); |
|
741
|
|
|
|
|
1028
|
|
1164
|
741
|
|
|
|
|
570
|
push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][5]); |
|
741
|
|
|
|
|
995
|
|
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
#Check if the translocation origin was in the TP53 gene |
1167
|
741
|
100
|
100
|
|
|
3770
|
if( |
|
|
|
100
|
|
|
|
|
1168
|
|
|
|
|
|
|
( $chr1 ne 'X' && $chr1 ne 'Y' ) && |
1169
|
|
|
|
|
|
|
( $chr1==17 ) |
1170
|
|
|
|
|
|
|
){ |
1171
|
1
|
50
|
33
|
|
|
12
|
if( |
|
|
|
0
|
|
|
|
|
|
|
|
33
|
|
|
|
|
1172
|
|
|
|
|
|
|
( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) ) |
1173
|
|
|
|
|
|
|
){ |
1174
|
|
|
|
|
|
|
#if a mutation was found, set the TP53 mutated flag |
1175
|
1
|
|
|
|
|
2
|
$$tp53_mutation_found_ref = 1; |
1176
|
1
|
|
|
|
|
3
|
print $TP53_FILE "\n"; |
1177
|
|
|
|
|
|
|
#print the translocation data line to the TP53 translocation mutation output file |
1178
|
1
|
|
|
|
|
2
|
for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){ |
|
8
|
|
|
|
|
15
|
|
1179
|
7
|
|
|
|
|
7
|
print $TP53_FILE "$file_data[$n][$i]"; |
1180
|
7
|
100
|
|
|
|
6
|
if($i != scalar(@{$file_data[$n]})-1){ |
|
7
|
|
|
|
|
13
|
|
1181
|
6
|
|
|
|
|
8
|
print $TP53_FILE "\t"; |
1182
|
|
|
|
|
|
|
} |
1183
|
|
|
|
|
|
|
}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++) |
1184
|
|
|
|
|
|
|
}#if |
1185
|
|
|
|
|
|
|
}#if |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
#Check if the translocation destination was in the TP53 gene |
1188
|
741
|
50
|
66
|
|
|
7231
|
if( |
|
|
|
66
|
|
|
|
|
1189
|
|
|
|
|
|
|
( $chr2 ne 'X' && $chr2 ne 'Y' ) && |
1190
|
|
|
|
|
|
|
( $chr2==17 ) |
1191
|
|
|
|
|
|
|
){ |
1192
|
0
|
0
|
0
|
|
|
0
|
if( |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
1193
|
|
|
|
|
|
|
( ( $file_data[$n][4] >= $TP53_start && $file_data[$n][4] <= $TP53_end ) || ( $file_data[$n][5] >= $TP53_start && $file_data[$n][5] <= $TP53_end ) ) |
1194
|
|
|
|
|
|
|
){ |
1195
|
0
|
|
|
|
|
0
|
$$tp53_mutation_found_ref = 1; |
1196
|
0
|
|
|
|
|
0
|
print $TP53_FILE "\n"; |
1197
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){ |
|
0
|
|
|
|
|
0
|
|
1198
|
0
|
|
|
|
|
0
|
print $TP53_FILE "$file_data[$n][$i]"; |
1199
|
0
|
0
|
|
|
|
0
|
if($i != scalar(@{$file_data[$n]})-1){ |
|
0
|
|
|
|
|
0
|
|
1200
|
0
|
|
|
|
|
0
|
print $TP53_FILE "\t"; |
1201
|
|
|
|
|
|
|
} |
1202
|
|
|
|
|
|
|
}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++) |
1203
|
|
|
|
|
|
|
}#if |
1204
|
|
|
|
|
|
|
}#if |
1205
|
|
|
|
|
|
|
}#for (my $n = 0; $n < scalar(@file_data); $n++) |
1206
|
|
|
|
|
|
|
|
1207
|
6
|
|
|
|
|
533
|
close($TP53_FILE); |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
#return hash |
1210
|
6
|
|
|
|
|
680
|
return(\%genome_trans_data, \%chromosome_trans_count, \%genome_trans_breakpoints); |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
}#sub analyze_trans_data |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
#=head2 Sub-Method: analyze_insertion_data |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
### analyze_insertion_data ############################################################################## |
1218
|
|
|
|
|
|
|
# Description: |
1219
|
|
|
|
|
|
|
# Reads data from files located in the insertion input directory and populates: |
1220
|
|
|
|
|
|
|
# $genome_insertion_data_hash_ref |
1221
|
|
|
|
|
|
|
# $chromosome_insertion_count_hash_ref |
1222
|
|
|
|
|
|
|
# $genome_trans_insertion_breakpoints_hash_ref |
1223
|
|
|
|
|
|
|
# |
1224
|
|
|
|
|
|
|
# Input variables: |
1225
|
|
|
|
|
|
|
# $output_directory: stores the path to the output directory |
1226
|
|
|
|
|
|
|
# $insertion_files_array_ref: reference to array containing all the insertion input files |
1227
|
|
|
|
|
|
|
# $bin_size: stores the size of the bins which the chromosome will be divided into |
1228
|
|
|
|
|
|
|
# $genome_trans_breakpoints_hash_ref: store reference to hash that contains the translocation breakpoints on |
1229
|
|
|
|
|
|
|
# each chromosome |
1230
|
|
|
|
|
|
|
# $tp53_mutation_found_ref: reference to the tp53 mutation found flag |
1231
|
|
|
|
|
|
|
# |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
#=cut |
1234
|
|
|
|
|
|
|
sub analyze_insertion_data { |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
#Parse Parameters |
1237
|
0
|
|
|
0
|
0
|
0
|
my $output_directory = shift; |
1238
|
0
|
|
|
|
|
0
|
my $insertion_files_array_ref = shift; |
1239
|
0
|
|
|
|
|
0
|
my @insertion_files = @$insertion_files_array_ref; |
1240
|
|
|
|
|
|
|
|
1241
|
0
|
|
|
|
|
0
|
my $bin_size = shift; |
1242
|
|
|
|
|
|
|
|
1243
|
0
|
|
|
|
|
0
|
my $genome_trans_breakpoints_hash_ref = shift; |
1244
|
0
|
|
|
|
|
0
|
my %genome_trans_breakpoints = %$genome_trans_breakpoints_hash_ref; |
1245
|
|
|
|
|
|
|
|
1246
|
0
|
|
|
|
|
0
|
my $tp53_mutation_found_ref = shift; |
1247
|
|
|
|
|
|
|
|
1248
|
0
|
|
|
|
|
0
|
my %genome_insertion_data = (); #hash |
1249
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
1250
|
|
|
|
|
|
|
#value: a reference to an array where each element corresponds to a bin along the |
1251
|
|
|
|
|
|
|
# chromosome |
1252
|
|
|
|
|
|
|
|
1253
|
0
|
|
|
|
|
0
|
my $CURRENT_FILE; #file handle to the current file that is open |
1254
|
|
|
|
|
|
|
my $TP53_FILE; #file handle to the TP53 insertion mutation output file |
1255
|
|
|
|
|
|
|
|
1256
|
0
|
|
|
|
|
0
|
my $line; #stores raw line read in from file |
1257
|
0
|
|
|
|
|
0
|
my @line_data; #stores tokenized line read in from file |
1258
|
|
|
|
|
|
|
|
1259
|
0
|
|
|
|
|
0
|
my $chr; |
1260
|
|
|
|
|
|
|
|
1261
|
0
|
|
|
|
|
0
|
my $file_name; |
1262
|
0
|
|
|
|
|
0
|
my $path; |
1263
|
0
|
|
|
|
|
0
|
my $suffix; |
1264
|
|
|
|
|
|
|
|
1265
|
0
|
|
|
|
|
0
|
my $rm_insertion_file_result; |
1266
|
|
|
|
|
|
|
|
1267
|
0
|
|
|
|
|
0
|
my $insertion_found = 0; |
1268
|
|
|
|
|
|
|
|
1269
|
0
|
|
|
|
|
0
|
my %chromosome_insertion_count = (); #hash {chr}{count} |
1270
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
1271
|
|
|
|
|
|
|
#value: the number of insertions found on key |
1272
|
|
|
|
|
|
|
|
1273
|
0
|
|
|
|
|
0
|
my %genome_trans_insertion_breakpoints = ( #hash |
1274
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
1275
|
|
|
|
|
|
|
Y => [], #value: an array storing all the insertion start positions on key |
1276
|
|
|
|
|
|
|
1 => [], |
1277
|
|
|
|
|
|
|
2 => [], |
1278
|
|
|
|
|
|
|
3 => [], |
1279
|
|
|
|
|
|
|
4 => [], |
1280
|
|
|
|
|
|
|
5 => [], |
1281
|
|
|
|
|
|
|
6 => [], |
1282
|
|
|
|
|
|
|
7 => [], |
1283
|
|
|
|
|
|
|
8 => [], |
1284
|
|
|
|
|
|
|
9 => [], |
1285
|
|
|
|
|
|
|
10 => [], |
1286
|
|
|
|
|
|
|
11 => [], |
1287
|
|
|
|
|
|
|
12 => [], |
1288
|
|
|
|
|
|
|
13 => [], |
1289
|
|
|
|
|
|
|
14 => [], |
1290
|
|
|
|
|
|
|
15 => [], |
1291
|
|
|
|
|
|
|
16 => [], |
1292
|
|
|
|
|
|
|
17 => [], |
1293
|
|
|
|
|
|
|
18 => [], |
1294
|
|
|
|
|
|
|
19 => [], |
1295
|
|
|
|
|
|
|
20 => [], |
1296
|
|
|
|
|
|
|
21 => [], |
1297
|
|
|
|
|
|
|
22 => [] |
1298
|
|
|
|
|
|
|
); |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
#Create TP53 directory and output folder |
1301
|
0
|
0
|
|
|
|
0
|
mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53"); |
1302
|
0
|
0
|
|
|
|
0
|
if(!(-e "$output_directory"."TP53")){ |
1303
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory"."TP53\n"; |
1304
|
|
|
|
|
|
|
} |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
#for each file in the insertion input file array |
1307
|
0
|
|
|
|
|
0
|
foreach my $file (@insertion_files){ |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
#Parse the file name, path and file type |
1310
|
0
|
|
|
|
|
0
|
( $file_name, $path, $suffix ) = File::Basename::fileparse( $file, "\.[^.]*"); |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
#open the file |
1313
|
0
|
0
|
|
|
|
0
|
open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n"; |
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
#ensure that the file is not empty |
1316
|
0
|
0
|
|
|
|
0
|
if(eof($CURRENT_FILE)){ |
1317
|
0
|
|
|
|
|
0
|
die "ERROR: $file is empty\n"; |
1318
|
|
|
|
|
|
|
} |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
#create the TP53 insertion mutation output file |
1321
|
0
|
0
|
|
|
|
0
|
open ($TP53_FILE, ">", "$output_directory"."TP53/$file_name"."$suffix") or die "ERROR: could not create file: $output_directory"."TP53/$file_name"."$suffix"; |
1322
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
#read header lines |
1324
|
0
|
|
|
|
|
0
|
$line = <$CURRENT_FILE>; |
1325
|
0
|
|
|
|
|
0
|
chomp($line); |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
#print the VCF header lines to the TP53 insertion mutation output file |
1328
|
0
|
|
|
|
|
0
|
while ($line =~ m/^#(.*?)/){ |
1329
|
0
|
|
|
|
|
0
|
print $TP53_FILE "$line\n"; |
1330
|
0
|
|
|
|
|
0
|
$line = <$CURRENT_FILE>; |
1331
|
0
|
|
|
|
|
0
|
chomp($line); |
1332
|
|
|
|
|
|
|
} |
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
#read all the data lines in the file |
1335
|
0
|
|
|
|
|
0
|
while(1){ |
1336
|
0
|
|
|
|
|
0
|
@line_data = (split (/\t/,$line)); |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
#verify that the chromosome is valid and that the mutation is an insertion type |
1339
|
0
|
0
|
0
|
|
|
0
|
if( |
|
|
|
0
|
|
|
|
|
1340
|
|
|
|
|
|
|
( !($line_data[1] =~ m/^[0-9]+$/) ) || |
1341
|
|
|
|
|
|
|
( !($line_data[0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|x|y|[1-9])/) ) || |
1342
|
|
|
|
|
|
|
( length($line_data[4]) <= length($line_data[3]) ) |
1343
|
|
|
|
|
|
|
){ |
1344
|
0
|
|
|
|
|
0
|
warn "ERROR: invalid chromosome or non-insertion VCF data line found and skipped:\t$line\n"; |
1345
|
0
|
|
|
|
|
0
|
$line = <$CURRENT_FILE>; |
1346
|
0
|
0
|
|
|
|
0
|
unless($line){last;} |
|
0
|
|
|
|
|
0
|
|
1347
|
0
|
|
|
|
|
0
|
chomp($line); |
1348
|
0
|
|
|
|
|
0
|
next; |
1349
|
|
|
|
|
|
|
} |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
#parse the chromosome |
1352
|
0
|
|
|
|
|
0
|
$chr = $2; |
1353
|
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
|
#change to uppercase if 'x' or 'y' is found |
1355
|
0
|
0
|
|
|
|
0
|
if($chr eq 'x'){ |
1356
|
0
|
|
|
|
|
0
|
$chr = 'X'; |
1357
|
|
|
|
|
|
|
} |
1358
|
0
|
0
|
|
|
|
0
|
if($chr eq 'y'){ |
1359
|
0
|
|
|
|
|
0
|
$chr = 'Y'; |
1360
|
|
|
|
|
|
|
} |
1361
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
#increment the insertion count of the chromosome |
1363
|
0
|
|
|
|
|
0
|
$chromosome_insertion_count{$chr}++; |
1364
|
|
|
|
|
|
|
|
1365
|
|
|
|
|
|
|
#check if a bin exists at the insertion start position |
1366
|
|
|
|
|
|
|
#if one does not, then create one |
1367
|
0
|
0
|
|
|
|
0
|
if(!defined(@{$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)])){ |
|
0
|
|
|
|
|
0
|
|
1368
|
0
|
|
|
|
|
0
|
${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)] = 1; |
|
0
|
|
|
|
|
0
|
|
1369
|
|
|
|
|
|
|
} |
1370
|
|
|
|
|
|
|
else{ #if one does, then increment the count |
1371
|
0
|
|
|
|
|
0
|
${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)]++; |
|
0
|
|
|
|
|
0
|
|
1372
|
|
|
|
|
|
|
} |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
#Search through the list of translocation breakpoints on the same chromosome |
1375
|
0
|
|
|
|
|
0
|
foreach my $bp (@{$genome_trans_breakpoints{$chr}}){ |
|
0
|
|
|
|
|
0
|
|
1376
|
|
|
|
|
|
|
#if the insertion is within 10bps of the breakpoints and the insertion to the list stored in |
1377
|
|
|
|
|
|
|
#the genome_trans_insertion_breakpoints hash |
1378
|
0
|
0
|
0
|
|
|
0
|
if( $line_data[1] < $bp+10 && $line_data[1] > $bp-10){ |
1379
|
0
|
|
|
|
|
0
|
push (@{$genome_trans_insertion_breakpoints{$chr}}, $bp); |
|
0
|
|
|
|
|
0
|
|
1380
|
|
|
|
|
|
|
} |
1381
|
|
|
|
|
|
|
} |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
#Check if the insertion was in the TP53 gene |
1384
|
0
|
0
|
0
|
|
|
0
|
if( |
|
|
|
0
|
|
|
|
|
1385
|
|
|
|
|
|
|
( $chr ne 'X' && $chr ne 'Y' ) && |
1386
|
|
|
|
|
|
|
( $chr==17 ) |
1387
|
|
|
|
|
|
|
){ |
1388
|
0
|
0
|
0
|
|
|
0
|
if($line_data[1] >= $TP53_start && $line_data[1] <= $TP53_end){ |
1389
|
0
|
|
|
|
|
0
|
$$tp53_mutation_found_ref = 1; #if a mutation was found in the region set the TP53 mutated flag |
1390
|
0
|
|
|
|
|
0
|
$insertion_found = 1; |
1391
|
0
|
|
|
|
|
0
|
print $TP53_FILE "$line\n"; #print the culprit data line to the TP53 insertion |
1392
|
|
|
|
|
|
|
#mutation output file |
1393
|
|
|
|
|
|
|
} |
1394
|
|
|
|
|
|
|
} |
1395
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
#read the next line |
1397
|
0
|
|
|
|
|
0
|
$line = <$CURRENT_FILE>; |
1398
|
|
|
|
|
|
|
#check that the end of file has not been reached |
1399
|
0
|
0
|
|
|
|
0
|
unless($line){last;} |
|
0
|
|
|
|
|
0
|
|
1400
|
0
|
|
|
|
|
0
|
chomp($line); |
1401
|
|
|
|
|
|
|
}#while(1) |
1402
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
#close file |
1404
|
0
|
|
|
|
|
0
|
close($CURRENT_FILE); |
1405
|
0
|
|
|
|
|
0
|
close ($TP53_FILE); |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
#if an insertion was not found in the current file then delete the created TP53 insertion mutation output file |
1408
|
0
|
0
|
|
|
|
0
|
if($insertion_found!=1){ |
1409
|
0
|
|
|
|
|
0
|
my $dir = "$output_directory"."TP53/$file_name"."$suffix"; |
1410
|
0
|
|
|
|
|
0
|
$rm_insertion_file_result = `rm $dir`; |
1411
|
|
|
|
|
|
|
} |
1412
|
|
|
|
|
|
|
|
1413
|
0
|
|
|
|
|
0
|
$insertion_found = 0; |
1414
|
|
|
|
|
|
|
}#foreach my $file (@insertion_files) |
1415
|
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
|
#return hash |
1417
|
0
|
|
|
|
|
0
|
return(\%genome_insertion_data, \%chromosome_insertion_count, \%genome_trans_insertion_breakpoints); |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
}#sub analyze_insertion_data |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
#=head2 Sub-Method: analyze_loh_data |
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
### analyze_loh_data ############################################################################## |
1425
|
|
|
|
|
|
|
# Description: |
1426
|
|
|
|
|
|
|
# Reads data from files located in the LOH input directory and populates: |
1427
|
|
|
|
|
|
|
# $chromosome_loh_breakpoints_hash_ref |
1428
|
|
|
|
|
|
|
# |
1429
|
|
|
|
|
|
|
# Input variables: |
1430
|
|
|
|
|
|
|
# $output_directory: stores the path to the output directory |
1431
|
|
|
|
|
|
|
# $loh_files_array_ref: reference to array containing all the LOH input files |
1432
|
|
|
|
|
|
|
# $tp53_mutation_found_ref: reference to tp53 mutation found flag |
1433
|
|
|
|
|
|
|
# |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
#=cut |
1436
|
|
|
|
|
|
|
|
1437
|
|
|
|
|
|
|
sub analyze_loh_data { |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
#parse the parameters |
1440
|
0
|
|
|
0
|
0
|
0
|
my $output_directory = shift; |
1441
|
0
|
|
|
|
|
0
|
my $loh_files_array_ref = shift; |
1442
|
0
|
|
|
|
|
0
|
my @loh_files = @$loh_files_array_ref; |
1443
|
|
|
|
|
|
|
|
1444
|
0
|
|
|
|
|
0
|
my $tp53_mutation_found_ref = shift; |
1445
|
|
|
|
|
|
|
|
1446
|
0
|
|
|
|
|
0
|
my @file_data; #an array storing all the entries from every input file |
1447
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
my $CURRENT_FILE; #file handle to the current file that is open |
1449
|
0
|
|
|
|
|
0
|
my $TP53_FILE; #file handle to the TP53 translocation mutation output file |
1450
|
|
|
|
|
|
|
|
1451
|
0
|
|
|
|
|
0
|
my $line; #stores raw line read in from file |
1452
|
0
|
|
|
|
|
0
|
my @line_data; #stores tokenized line read in from files |
1453
|
|
|
|
|
|
|
|
1454
|
0
|
|
|
|
|
0
|
my %chromsome_loh_breakpoints = ( #hash {chr}[start and end pairs] |
1455
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
1456
|
|
|
|
|
|
|
Y => [], #value: an array that stores all the LOH breakpoints on key |
1457
|
|
|
|
|
|
|
1 => [], |
1458
|
|
|
|
|
|
|
2 => [], |
1459
|
|
|
|
|
|
|
3 => [], |
1460
|
|
|
|
|
|
|
4 => [], |
1461
|
|
|
|
|
|
|
5 => [], |
1462
|
|
|
|
|
|
|
6 => [], |
1463
|
|
|
|
|
|
|
7 => [], |
1464
|
|
|
|
|
|
|
8 => [], |
1465
|
|
|
|
|
|
|
9 => [], |
1466
|
|
|
|
|
|
|
10 => [], |
1467
|
|
|
|
|
|
|
11 => [], |
1468
|
|
|
|
|
|
|
12 => [], |
1469
|
|
|
|
|
|
|
13 => [], |
1470
|
|
|
|
|
|
|
14 => [], |
1471
|
|
|
|
|
|
|
15 => [], |
1472
|
|
|
|
|
|
|
16 => [], |
1473
|
|
|
|
|
|
|
17 => [], |
1474
|
|
|
|
|
|
|
18 => [], |
1475
|
|
|
|
|
|
|
19 => [], |
1476
|
|
|
|
|
|
|
20 => [], |
1477
|
|
|
|
|
|
|
21 => [], |
1478
|
|
|
|
|
|
|
22 => [] |
1479
|
|
|
|
|
|
|
); |
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
#Read the contents of the cnv files into memory |
1482
|
0
|
|
|
|
|
0
|
foreach my $file (@loh_files){ |
1483
|
|
|
|
|
|
|
#open the file |
1484
|
0
|
0
|
|
|
|
0
|
open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n"; |
1485
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
#Ensure that the file is not empty |
1487
|
0
|
0
|
|
|
|
0
|
if(eof($CURRENT_FILE)){ |
1488
|
0
|
|
|
|
|
0
|
close ($CURRENT_FILE); |
1489
|
0
|
|
|
|
|
0
|
die "ERROR: $file is empty\n"; |
1490
|
|
|
|
|
|
|
} |
1491
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
#read header line and validate |
1493
|
0
|
|
|
|
|
0
|
$line = <$CURRENT_FILE>; |
1494
|
0
|
|
|
|
|
0
|
chomp($line); |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
#Validate the header line |
1497
|
0
|
0
|
|
|
|
0
|
if(!($line =~ m/^#chr\tstart\tend\tquality$/)){ |
1498
|
0
|
|
|
|
|
0
|
close ($CURRENT_FILE); |
1499
|
0
|
|
|
|
|
0
|
die "ERROR: header of loh file $file is invalid\n"; |
1500
|
|
|
|
|
|
|
} |
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
#Read all the data lines |
1503
|
0
|
|
|
|
|
0
|
while( !(eof($CURRENT_FILE)) ){ |
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
#read data line |
1506
|
0
|
|
|
|
|
0
|
$line = <$CURRENT_FILE>; |
1507
|
0
|
|
|
|
|
0
|
chomp($line); |
1508
|
|
|
|
|
|
|
|
1509
|
|
|
|
|
|
|
#validate the data line |
1510
|
0
|
0
|
|
|
|
0
|
if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){ |
1511
|
0
|
|
|
|
|
0
|
die "ERROR: invalid line found ($line) in file $file\n"; |
1512
|
|
|
|
|
|
|
} |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
#Split the data line and add it to the array |
1515
|
0
|
|
|
|
|
0
|
@line_data = (split (/\t/,$line)); |
1516
|
0
|
|
|
|
|
0
|
push(@file_data,[@line_data]); |
1517
|
|
|
|
|
|
|
} |
1518
|
|
|
|
|
|
|
|
1519
|
0
|
|
|
|
|
0
|
close ($CURRENT_FILE); |
1520
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
}#foreach my $file (@cnv_files) |
1522
|
|
|
|
|
|
|
|
1523
|
|
|
|
|
|
|
#Ensure that there are no overlapping LOH regions, or join them if the user indicated |
1524
|
0
|
|
|
|
|
0
|
@file_data = check_for_overlaps("loh", \@file_data); |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
#Create TP53 directory and output folder |
1527
|
0
|
0
|
|
|
|
0
|
mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53"); |
1528
|
0
|
0
|
|
|
|
0
|
if(!(-e "$output_directory"."TP53")){ |
1529
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory"."TP53\n"; |
1530
|
|
|
|
|
|
|
} |
1531
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
#Create the TP53 LOH mutation output data file |
1533
|
0
|
0
|
|
|
|
0
|
open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spl") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spl"; |
1534
|
|
|
|
|
|
|
#Print the header for the output file (same format as a .spl file) |
1535
|
0
|
|
|
|
|
0
|
print $TP53_FILE "#chr\tstart\tend\tquality"; |
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
#For every data line that was read in |
1538
|
0
|
|
|
|
|
0
|
for (my $n = 0; $n < scalar(@file_data); $n++){ |
1539
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
#Validate the chromosome field |
1541
|
0
|
0
|
|
|
|
0
|
if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){ |
1542
|
0
|
|
|
|
|
0
|
die "ERROR: invalid chromosome field detected\n"; |
1543
|
|
|
|
|
|
|
} |
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
#Parse the chromosome |
1546
|
0
|
|
|
|
|
0
|
my $chr = $2; |
1547
|
|
|
|
|
|
|
|
1548
|
|
|
|
|
|
|
#Add the breakpoints to the array for the chromosome |
1549
|
0
|
|
|
|
|
0
|
push (@{$chromsome_loh_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2])); |
|
0
|
|
|
|
|
0
|
|
1550
|
|
|
|
|
|
|
|
1551
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
#Check if the variation was in the TP53 gene |
1553
|
0
|
0
|
0
|
|
|
0
|
if( |
|
|
|
0
|
|
|
|
|
1554
|
|
|
|
|
|
|
( $chr ne 'X' && $chr ne 'Y' ) && |
1555
|
|
|
|
|
|
|
( $chr==17 ) |
1556
|
|
|
|
|
|
|
){ |
1557
|
0
|
0
|
0
|
|
|
0
|
if( |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
1558
|
|
|
|
|
|
|
( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) ) |
1559
|
|
|
|
|
|
|
){ |
1560
|
|
|
|
|
|
|
#If a mutation was found in the TP53 region then set the TP53 mutated flag |
1561
|
0
|
|
|
|
|
0
|
$$tp53_mutation_found_ref = 1; |
1562
|
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
#Print the LOH data line to the TP53 LOH mutation output file |
1564
|
0
|
|
|
|
|
0
|
print $TP53_FILE "\n"; |
1565
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){ |
|
0
|
|
|
|
|
0
|
|
1566
|
0
|
|
|
|
|
0
|
print $TP53_FILE "$file_data[$n][$i]"; |
1567
|
0
|
0
|
|
|
|
0
|
if($i != scalar(@{$file_data[$n]})-1){ |
|
0
|
|
|
|
|
0
|
|
1568
|
0
|
|
|
|
|
0
|
print $TP53_FILE "\t"; |
1569
|
|
|
|
|
|
|
}#if |
1570
|
|
|
|
|
|
|
}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++) |
1571
|
|
|
|
|
|
|
}#if |
1572
|
|
|
|
|
|
|
}#if |
1573
|
|
|
|
|
|
|
|
1574
|
|
|
|
|
|
|
}#for (my $n = 0; $n < scalar(@file_data); $n++) |
1575
|
|
|
|
|
|
|
|
1576
|
0
|
|
|
|
|
0
|
close($TP53_FILE); |
1577
|
|
|
|
|
|
|
|
1578
|
|
|
|
|
|
|
#return hash |
1579
|
0
|
|
|
|
|
0
|
return(\%chromsome_loh_breakpoints); |
1580
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
}#sub analyze_loh_data |
1582
|
|
|
|
|
|
|
|
1583
|
|
|
|
|
|
|
|
1584
|
|
|
|
|
|
|
|
1585
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_genome_localization |
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
### calculate_genome_localization ################################################################# |
1588
|
|
|
|
|
|
|
# Description: |
1589
|
|
|
|
|
|
|
# Caculates the mutation density for each chromosome |
1590
|
|
|
|
|
|
|
# |
1591
|
|
|
|
|
|
|
# Input variables: |
1592
|
|
|
|
|
|
|
# $output_directory: stores the path to the output directory |
1593
|
|
|
|
|
|
|
# $chromosome_copy_number_count_hash_ref: stores a reference to the hash storing the number |
1594
|
|
|
|
|
|
|
# of CNV events on each chromosome |
1595
|
|
|
|
|
|
|
# #chromosome_translocation_count_hash_ref: stores a reference to the hash storing the number |
1596
|
|
|
|
|
|
|
# of translocation events on each chromosome |
1597
|
|
|
|
|
|
|
# |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
#=cut |
1600
|
|
|
|
|
|
|
|
1601
|
|
|
|
|
|
|
sub calculate_genome_localization { |
1602
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
#parse the parameters |
1604
|
2
|
|
|
2
|
0
|
22
|
my $output_directory = shift; |
1605
|
2
|
|
|
|
|
5
|
my $chromosome_copy_number_count_hash_ref = shift; |
1606
|
2
|
|
|
|
|
4
|
my $chromosome_translocation_count_hash_ref = shift; |
1607
|
|
|
|
|
|
|
|
1608
|
2
|
|
|
|
|
4
|
my %chromosome_mutation_count; #hash |
1609
|
|
|
|
|
|
|
#key: chromosome eg. 1,2,X,Y |
1610
|
|
|
|
|
|
|
#value: the density of translocation and CNV events on key |
1611
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
my $density; #store the mutation density for a chromosome |
1613
|
|
|
|
|
|
|
|
1614
|
0
|
|
|
|
|
0
|
my $OUTPUT_FILE; #file handle to the output file |
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
#initialize all the counts to 0 |
1617
|
2
|
|
|
|
|
13
|
for (my $i=1; $i<23; $i++){ |
1618
|
44
|
|
|
|
|
134
|
$chromosome_mutation_count{$i} = 0; |
1619
|
|
|
|
|
|
|
} |
1620
|
2
|
|
|
|
|
6
|
$chromosome_mutation_count{'X'} = 0; |
1621
|
2
|
|
|
|
|
4
|
$chromosome_mutation_count{'Y'} = 0; |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
#add the number of CNV events on each chromosome |
1624
|
2
|
|
|
|
|
19
|
for my $cnv_key1 ( keys %$chromosome_copy_number_count_hash_ref){ |
1625
|
40
|
|
|
|
|
29
|
for my $cnv_key2 (keys %{$chromosome_copy_number_count_hash_ref->{$cnv_key1}}){ |
|
40
|
|
|
|
|
99
|
|
1626
|
160
|
|
|
|
|
217
|
$chromosome_mutation_count{$cnv_key1} += $chromosome_copy_number_count_hash_ref->{$cnv_key1}->{$cnv_key2}; |
1627
|
|
|
|
|
|
|
} |
1628
|
|
|
|
|
|
|
} |
1629
|
|
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
#add the number of translocation events on each chromosome |
1631
|
2
|
|
|
|
|
10
|
for my $trans_key1 ( keys %$chromosome_translocation_count_hash_ref){ |
1632
|
26
|
|
|
|
|
19
|
for my $trans_key2 (keys %{$chromosome_translocation_count_hash_ref->{$trans_key1}}){ |
|
26
|
|
|
|
|
46
|
|
1633
|
52
|
|
|
|
|
62
|
$chromosome_mutation_count{$trans_key1} += $chromosome_translocation_count_hash_ref->{$trans_key1}->{$trans_key2}; |
1634
|
|
|
|
|
|
|
} |
1635
|
|
|
|
|
|
|
} |
1636
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
#Create the output file |
1638
|
2
|
50
|
|
|
|
238
|
open ($OUTPUT_FILE, ">", "$output_directory/genome_localization.log") or die "ERROR: could not create file $output_directory/genome_localization.log\n"; |
1639
|
|
|
|
|
|
|
#Print the header |
1640
|
2
|
|
|
|
|
15
|
print $OUTPUT_FILE "#chr\tcount\tdensity\n"; |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
#for each chromosome print the count and overall density |
1643
|
9
|
|
|
9
|
|
6922
|
{use sort 'stable'; |
|
9
|
|
|
|
|
5236
|
|
|
9
|
|
|
|
|
50
|
|
|
2
|
|
|
|
|
3
|
|
1644
|
2
|
|
|
|
|
24
|
for my $chr ( sort keys %chromosome_mutation_count){ |
1645
|
48
|
|
|
|
|
61
|
$density = $chromosome_mutation_count{$chr}/$chromosome_length{$chr}; |
1646
|
|
|
|
|
|
|
|
1647
|
48
|
|
|
|
|
37
|
print $OUTPUT_FILE "$chr"; |
1648
|
48
|
|
|
|
|
42
|
print $OUTPUT_FILE "\t"; |
1649
|
48
|
|
|
|
|
48
|
print $OUTPUT_FILE $chromosome_mutation_count{$chr}; |
1650
|
48
|
|
|
|
|
34
|
print $OUTPUT_FILE "\t"; |
1651
|
48
|
|
|
|
|
114
|
print $OUTPUT_FILE "$density"; |
1652
|
48
|
|
|
|
|
49
|
print $OUTPUT_FILE "\n"; |
1653
|
|
|
|
|
|
|
|
1654
|
|
|
|
|
|
|
#Replace the count with the density |
1655
|
48
|
|
|
|
|
47
|
$chromosome_mutation_count{$chr} = $density; |
1656
|
|
|
|
|
|
|
} |
1657
|
|
|
|
|
|
|
}#use sort 'stable' |
1658
|
|
|
|
|
|
|
|
1659
|
2
|
|
|
|
|
72
|
close ($OUTPUT_FILE); |
1660
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
#return the hash containing the densities |
1662
|
2
|
|
|
|
|
15
|
return(\%chromosome_mutation_count); |
1663
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
}#sub calculate_genome_localization |
1665
|
|
|
|
|
|
|
|
1666
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_chromosome_localization |
1668
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
### calculate_chromosome_localization ############################################################# |
1670
|
|
|
|
|
|
|
# Description: |
1671
|
|
|
|
|
|
|
# Performs a sliding window analysis on the CNV and translocation data. Identifies regions |
1672
|
|
|
|
|
|
|
# that have a density of mutation much greater than the average rate of mutation of the |
1673
|
|
|
|
|
|
|
# genome. |
1674
|
|
|
|
|
|
|
# |
1675
|
|
|
|
|
|
|
# Input variables: |
1676
|
|
|
|
|
|
|
# $output_directory: stores the directory where output files are created |
1677
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref: reference to hash that stores position of all CNV breakpoints in |
1678
|
|
|
|
|
|
|
# the genome |
1679
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: reference to hash that stores position of all the |
1680
|
|
|
|
|
|
|
# translocation breakpoints in the genome |
1681
|
|
|
|
|
|
|
# $bin_size: size of the bins that divide up the genome |
1682
|
|
|
|
|
|
|
# $window_size: number of bins to evaluate in each window |
1683
|
|
|
|
|
|
|
# |
1684
|
|
|
|
|
|
|
|
1685
|
|
|
|
|
|
|
#=cut |
1686
|
|
|
|
|
|
|
sub calculate_chromosome_localization { |
1687
|
|
|
|
|
|
|
|
1688
|
|
|
|
|
|
|
#parse parameters |
1689
|
2
|
|
|
2
|
0
|
15
|
my $output_directory = shift; |
1690
|
2
|
|
|
|
|
3
|
my $genome_cnv_data_hash_ref = shift; |
1691
|
2
|
|
|
|
|
4
|
my $genome_trans_data_hash_ref = shift; |
1692
|
2
|
|
|
|
|
2
|
my $bin_size = shift; |
1693
|
2
|
|
|
|
|
4
|
my $window_size = shift; |
1694
|
|
|
|
|
|
|
|
1695
|
2
|
|
|
|
|
3
|
my @suspect_regions; #array storing the start position, end position and chromosome |
1696
|
|
|
|
|
|
|
#of very highly mutated regions |
1697
|
|
|
|
|
|
|
my @likely_regions; #array storing the start position, end position and chromosome |
1698
|
|
|
|
|
|
|
#of somewhat highly mutated regions |
1699
|
|
|
|
|
|
|
|
1700
|
2
|
|
|
|
|
3
|
my $in_suspect_region = 0; #flag variables used in identifying highly mutated regions |
1701
|
2
|
|
|
|
|
3
|
my $in_likely_region = 0; |
1702
|
|
|
|
|
|
|
|
1703
|
2
|
|
|
|
|
2
|
my $suspect_chr = -1; |
1704
|
2
|
|
|
|
|
4
|
my $suspect_start = -1; |
1705
|
2
|
|
|
|
|
2
|
my $suspect_end = -1; |
1706
|
|
|
|
|
|
|
|
1707
|
2
|
|
|
|
|
4
|
my $likely_chr = -1; |
1708
|
2
|
|
|
|
|
2
|
my $likely_start = -1; |
1709
|
2
|
|
|
|
|
3
|
my $likely_end = -1; |
1710
|
|
|
|
|
|
|
|
1711
|
2
|
|
|
|
|
45
|
my %genome_cnv_data_windows = ( #hash |
1712
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
1713
|
|
|
|
|
|
|
Y => [], #value: an array storing the count of CNVs |
1714
|
|
|
|
|
|
|
1 => [], # in each window along the chromosome |
1715
|
|
|
|
|
|
|
2 => [], |
1716
|
|
|
|
|
|
|
3 => [], |
1717
|
|
|
|
|
|
|
4 => [], |
1718
|
|
|
|
|
|
|
5 => [], |
1719
|
|
|
|
|
|
|
6 => [], |
1720
|
|
|
|
|
|
|
7 => [], |
1721
|
|
|
|
|
|
|
8 => [], |
1722
|
|
|
|
|
|
|
9 => [], |
1723
|
|
|
|
|
|
|
10 => [], |
1724
|
|
|
|
|
|
|
11 => [], |
1725
|
|
|
|
|
|
|
12 => [], |
1726
|
|
|
|
|
|
|
13 => [], |
1727
|
|
|
|
|
|
|
14 => [], |
1728
|
|
|
|
|
|
|
15 => [], |
1729
|
|
|
|
|
|
|
16 => [], |
1730
|
|
|
|
|
|
|
17 => [], |
1731
|
|
|
|
|
|
|
18 => [], |
1732
|
|
|
|
|
|
|
19 => [], |
1733
|
|
|
|
|
|
|
20 => [], |
1734
|
|
|
|
|
|
|
21 => [], |
1735
|
|
|
|
|
|
|
22 => [] |
1736
|
|
|
|
|
|
|
); |
1737
|
|
|
|
|
|
|
|
1738
|
2
|
|
|
|
|
23
|
my %genome_trans_data_windows = ( #hash |
1739
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
1740
|
|
|
|
|
|
|
Y => [], #value: an array storing a count of translocation |
1741
|
|
|
|
|
|
|
1 => [], # in each window along the chromosome |
1742
|
|
|
|
|
|
|
2 => [], |
1743
|
|
|
|
|
|
|
3 => [], |
1744
|
|
|
|
|
|
|
4 => [], |
1745
|
|
|
|
|
|
|
5 => [], |
1746
|
|
|
|
|
|
|
6 => [], |
1747
|
|
|
|
|
|
|
7 => [], |
1748
|
|
|
|
|
|
|
8 => [], |
1749
|
|
|
|
|
|
|
9 => [], |
1750
|
|
|
|
|
|
|
10 => [], |
1751
|
|
|
|
|
|
|
11 => [], |
1752
|
|
|
|
|
|
|
12 => [], |
1753
|
|
|
|
|
|
|
13 => [], |
1754
|
|
|
|
|
|
|
14 => [], |
1755
|
|
|
|
|
|
|
15 => [], |
1756
|
|
|
|
|
|
|
16 => [], |
1757
|
|
|
|
|
|
|
17 => [], |
1758
|
|
|
|
|
|
|
18 => [], |
1759
|
|
|
|
|
|
|
19 => [], |
1760
|
|
|
|
|
|
|
20 => [], |
1761
|
|
|
|
|
|
|
21 => [], |
1762
|
|
|
|
|
|
|
22 => [] |
1763
|
|
|
|
|
|
|
); |
1764
|
|
|
|
|
|
|
|
1765
|
2
|
|
|
|
|
23
|
my %genome_mutation_data_windows = ( #hash |
1766
|
|
|
|
|
|
|
X => [], #key: chromosome eg. 1,2,X,Y |
1767
|
|
|
|
|
|
|
Y => [], #value: an array storing a count of all mutations |
1768
|
|
|
|
|
|
|
1 => [], # in each window along the chromosome |
1769
|
|
|
|
|
|
|
2 => [], |
1770
|
|
|
|
|
|
|
3 => [], |
1771
|
|
|
|
|
|
|
4 => [], |
1772
|
|
|
|
|
|
|
5 => [], |
1773
|
|
|
|
|
|
|
6 => [], |
1774
|
|
|
|
|
|
|
7 => [], |
1775
|
|
|
|
|
|
|
8 => [], |
1776
|
|
|
|
|
|
|
9 => [], |
1777
|
|
|
|
|
|
|
10 => [], |
1778
|
|
|
|
|
|
|
11 => [], |
1779
|
|
|
|
|
|
|
12 => [], |
1780
|
|
|
|
|
|
|
13 => [], |
1781
|
|
|
|
|
|
|
14 => [], |
1782
|
|
|
|
|
|
|
15 => [], |
1783
|
|
|
|
|
|
|
16 => [], |
1784
|
|
|
|
|
|
|
17 => [], |
1785
|
|
|
|
|
|
|
18 => [], |
1786
|
|
|
|
|
|
|
19 => [], |
1787
|
|
|
|
|
|
|
20 => [], |
1788
|
|
|
|
|
|
|
21 => [], |
1789
|
|
|
|
|
|
|
22 => [] |
1790
|
|
|
|
|
|
|
); |
1791
|
|
|
|
|
|
|
|
1792
|
|
|
|
|
|
|
|
1793
|
2
|
|
|
|
|
4
|
my $current_chr; #current chromosome being analyzed |
1794
|
2
|
|
|
|
|
5
|
my @current_chr_data = (); #array storing the bins for the current chromosome |
1795
|
|
|
|
|
|
|
|
1796
|
2
|
|
|
|
|
3
|
my $genome_mean_mutation_density = 0; #average density of all the windows across the genome |
1797
|
2
|
|
|
|
|
3
|
my $total_genome_windows = 0; #total number of windows across the genome |
1798
|
2
|
|
|
|
|
1
|
my $genome_mutation_density_standard_deviation = 0; #standard deviation of the mutation densities for |
1799
|
|
|
|
|
|
|
#all the windows |
1800
|
|
|
|
|
|
|
|
1801
|
2
|
|
|
|
|
3
|
my $OUTPUT_FILE; #file handle to output file |
1802
|
|
|
|
|
|
|
|
1803
|
2
|
|
|
|
|
4
|
$output_directory = $output_directory."mutation_clustering"; |
1804
|
|
|
|
|
|
|
|
1805
|
|
|
|
|
|
|
#create output directories |
1806
|
2
|
50
|
|
|
|
162
|
mkdir ("$output_directory",0770) unless (-d "$output_directory"); |
1807
|
2
|
50
|
|
|
|
25
|
if(!(-e "$output_directory")){ |
1808
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory\n"; |
1809
|
|
|
|
|
|
|
} |
1810
|
|
|
|
|
|
|
|
1811
|
2
|
50
|
|
|
|
92
|
mkdir ("$output_directory/cnv",0770) unless (-d "$output_directory/cnv"); |
1812
|
2
|
50
|
|
|
|
21
|
if(!(-e "$output_directory")){ |
1813
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory/cnv\n"; |
1814
|
|
|
|
|
|
|
} |
1815
|
|
|
|
|
|
|
|
1816
|
2
|
50
|
|
|
|
85
|
mkdir ("$output_directory/translocations",0770) unless (-d "$output_directory/translocations"); |
1817
|
2
|
50
|
|
|
|
20
|
if(!(-e "$output_directory")){ |
1818
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory/translocations\n"; |
1819
|
|
|
|
|
|
|
} |
1820
|
|
|
|
|
|
|
|
1821
|
2
|
50
|
|
|
|
88
|
mkdir ("$output_directory/all_types",0770) unless (-d "$output_directory/all_types"); |
1822
|
2
|
50
|
|
|
|
30
|
if(!(-e "$output_directory")){ |
1823
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory/all_types\n"; |
1824
|
|
|
|
|
|
|
} |
1825
|
|
|
|
|
|
|
|
1826
|
|
|
|
|
|
|
#compute the density of CNV mutations in each window |
1827
|
2
|
|
|
|
|
19
|
for my $cnv_key ( keys %$genome_cnv_data_hash_ref){ |
1828
|
|
|
|
|
|
|
#get the array storing the CNV bins for the current chromosome |
1829
|
40
|
|
|
|
|
122
|
@current_chr_data = @{$genome_cnv_data_hash_ref->{$cnv_key}}; |
|
40
|
|
|
|
|
161833
|
|
1830
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
#check that the array is not empty |
1832
|
40
|
50
|
|
|
|
321
|
if(scalar(@current_chr_data) > 0){ |
1833
|
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
|
#create an output file for this chromosome |
1835
|
40
|
50
|
|
|
|
5776
|
open ($OUTPUT_FILE, ">", "$output_directory/cnv/chr$cnv_key"."_cnv_localization.log") or die "ERROR: could not create file $output_directory/cnv/chr$cnv_key"."_cnv_localization.log"; |
1836
|
|
|
|
|
|
|
#print the header for the output file |
1837
|
40
|
|
|
|
|
369
|
print $OUTPUT_FILE "#chr\tstart\tend\tdensity"; |
1838
|
|
|
|
|
|
|
|
1839
|
40
|
|
|
|
|
82
|
@{$genome_cnv_data_windows{$cnv_key}}[0] = 0; #initialize the count in the first window |
|
40
|
|
|
|
|
350
|
|
1840
|
|
|
|
|
|
|
|
1841
|
|
|
|
|
|
|
#Calculate the mutation count in the first window |
1842
|
40
|
|
|
|
|
167
|
for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){ |
1843
|
400000
|
|
|
|
|
252297
|
my %region_hash; |
1844
|
400000
|
100
|
|
|
|
497514
|
if(!defined($current_chr_data[$chr_pos])){ |
1845
|
399924
|
|
|
|
|
554999
|
next; |
1846
|
|
|
|
|
|
|
} |
1847
|
76
|
|
|
|
|
85
|
%region_hash = %{$current_chr_data[$chr_pos]}; |
|
76
|
|
|
|
|
523
|
|
1848
|
|
|
|
|
|
|
|
1849
|
76
|
|
|
|
|
124
|
@{$genome_cnv_data_windows{$cnv_key}}[0] += $region_hash{'BPcount'}; |
|
76
|
|
|
|
|
273
|
|
1850
|
|
|
|
|
|
|
} |
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
#print the values from the first window to the output file |
1853
|
40
|
|
|
|
|
293
|
print $OUTPUT_FILE "\n"; |
1854
|
40
|
|
|
|
|
141
|
print $OUTPUT_FILE "$cnv_key"; |
1855
|
40
|
|
|
|
|
82
|
print $OUTPUT_FILE "\t"; |
1856
|
40
|
|
|
|
|
90
|
print $OUTPUT_FILE "0"; |
1857
|
40
|
|
|
|
|
72
|
print $OUTPUT_FILE "\t"; |
1858
|
40
|
|
|
|
|
444
|
print $OUTPUT_FILE ($window_size)*$bin_size; |
1859
|
40
|
|
|
|
|
104
|
print $OUTPUT_FILE "\t"; |
1860
|
|
|
|
|
|
|
|
1861
|
40
|
50
|
|
|
|
51
|
if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[0])){ |
|
40
|
|
|
|
|
359
|
|
1862
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE "0"; |
1863
|
|
|
|
|
|
|
} |
1864
|
|
|
|
|
|
|
else{ |
1865
|
40
|
|
|
|
|
74
|
my $rounded = POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[0])/2); |
|
40
|
|
|
|
|
350
|
|
1866
|
40
|
|
|
|
|
377
|
print $OUTPUT_FILE ($rounded)/($window_size*$bin_size); |
1867
|
|
|
|
|
|
|
#add the cnv count to the total mutation count for the region |
1868
|
40
|
|
|
|
|
75
|
@{$genome_mutation_data_windows{$cnv_key}}[0] += $rounded; |
|
40
|
|
|
|
|
177
|
|
1869
|
|
|
|
|
|
|
} |
1870
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
#perform the sliding window analysis for the rest of the chromosome |
1872
|
40
|
|
|
|
|
176
|
for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){ |
1873
|
|
|
|
|
|
|
|
1874
|
|
|
|
|
|
|
#check that the window will not overshoot the length of the chromosome |
1875
|
4391064
|
100
|
|
|
|
7110032
|
if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$cnv_key} ){ |
1876
|
24
|
|
|
|
|
63
|
last; |
1877
|
|
|
|
|
|
|
} |
1878
|
|
|
|
|
|
|
|
1879
|
4391040
|
|
|
|
|
3328028
|
@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] = 0; #initialize the count for the current window |
|
4391040
|
|
|
|
|
5322717
|
|
1880
|
|
|
|
|
|
|
|
1881
|
4391040
|
|
|
|
|
3337689
|
my %past_region_hash; |
1882
|
|
|
|
|
|
|
my %next_region_hash; |
1883
|
|
|
|
|
|
|
|
1884
|
4391040
|
|
|
|
|
3266511
|
my $prev_value = 0; |
1885
|
4391040
|
|
|
|
|
3032145
|
my $next_value = 0; |
1886
|
|
|
|
|
|
|
|
1887
|
|
|
|
|
|
|
#get the count of the from the first bin from the previous window |
1888
|
4391040
|
100
|
|
|
|
6563697
|
if(defined($current_chr_data[$chr_pos-1])){ |
1889
|
392
|
|
|
|
|
444
|
%past_region_hash = %{$current_chr_data[$chr_pos-1]}; |
|
392
|
|
|
|
|
1890
|
|
1890
|
392
|
|
|
|
|
662
|
$prev_value = $past_region_hash{'BPcount'}; |
1891
|
|
|
|
|
|
|
} |
1892
|
|
|
|
|
|
|
|
1893
|
|
|
|
|
|
|
#get the count from the bin following the last bin in the previous window |
1894
|
4391040
|
100
|
|
|
|
6265114
|
if(defined($current_chr_data[$chr_pos+($window_size-1)])){ |
1895
|
392
|
|
|
|
|
389
|
%next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]}; |
|
392
|
|
|
|
|
2295
|
|
1896
|
392
|
|
|
|
|
767
|
$next_value = $next_region_hash{'BPcount'}; |
1897
|
|
|
|
|
|
|
} |
1898
|
|
|
|
|
|
|
|
1899
|
|
|
|
|
|
|
#the count for the current window = the count from the previous window - the first bin of the previous window + the next bin along the chromosome |
1900
|
4391040
|
|
|
|
|
3148944
|
@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] += (@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos-1]) - ($prev_value) + ($next_value); |
|
4391040
|
|
|
|
|
4679775
|
|
|
4391040
|
|
|
|
|
4701707
|
|
1901
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
#print the values for this window |
1903
|
4391040
|
|
|
|
|
4426893
|
print $OUTPUT_FILE "\n"; |
1904
|
4391040
|
|
|
|
|
3828238
|
print $OUTPUT_FILE "$cnv_key"; |
1905
|
4391040
|
|
|
|
|
3367321
|
print $OUTPUT_FILE "\t"; |
1906
|
4391040
|
|
|
|
|
9612242
|
print $OUTPUT_FILE $chr_pos*$bin_size; |
1907
|
4391040
|
|
|
|
|
3683462
|
print $OUTPUT_FILE "\t"; |
1908
|
4391040
|
|
|
|
|
8420479
|
print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size; |
1909
|
4391040
|
|
|
|
|
4298892
|
print $OUTPUT_FILE "\t"; |
1910
|
|
|
|
|
|
|
|
1911
|
4391040
|
50
|
|
|
|
3274786
|
if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])){ |
|
4391040
|
|
|
|
|
6625449
|
|
1912
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE "0"; |
1913
|
|
|
|
|
|
|
} |
1914
|
|
|
|
|
|
|
else{ |
1915
|
4391040
|
|
|
|
|
3171268
|
my $rounded = POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])/2); |
|
4391040
|
|
|
|
|
7883313
|
|
1916
|
4391040
|
|
|
|
|
7366705
|
print $OUTPUT_FILE ($rounded)/($window_size*$bin_size); |
1917
|
|
|
|
|
|
|
#add the cnv count to the total mutation count for the region |
1918
|
4391040
|
|
|
|
|
3276286
|
@{$genome_mutation_data_windows{$cnv_key}}[$chr_pos] += $rounded; |
|
4391040
|
|
|
|
|
11546829
|
|
1919
|
|
|
|
|
|
|
} |
1920
|
|
|
|
|
|
|
}#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++) |
1921
|
|
|
|
|
|
|
|
1922
|
40
|
|
|
|
|
5548
|
close ($OUTPUT_FILE); |
1923
|
|
|
|
|
|
|
}#if(scalar(@current_chr_data) > 0) |
1924
|
|
|
|
|
|
|
|
1925
|
40
|
|
|
|
|
73689
|
@current_chr_data = (); |
1926
|
|
|
|
|
|
|
}#for my $cnv_key ( keys %$genome_cnv_data_hash_ref) |
1927
|
|
|
|
|
|
|
|
1928
|
|
|
|
|
|
|
#perform the sliding window analysis on the translocation mutation data |
1929
|
2
|
|
|
|
|
31
|
for my $trans_key ( keys %$genome_trans_data_hash_ref){ |
1930
|
|
|
|
|
|
|
|
1931
|
|
|
|
|
|
|
#get the array storing the translocation bins for the current chromosome |
1932
|
26
|
|
|
|
|
72
|
@current_chr_data = @{$genome_trans_data_hash_ref->{$trans_key}}; |
|
26
|
|
|
|
|
81590
|
|
1933
|
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
|
#check that the array is not empty |
1935
|
26
|
50
|
|
|
|
189
|
if(scalar(@current_chr_data) > 0){ |
1936
|
|
|
|
|
|
|
|
1937
|
|
|
|
|
|
|
#create the output file |
1938
|
26
|
50
|
|
|
|
3939
|
open ($OUTPUT_FILE, ">", "$output_directory/translocations/chr$trans_key"."_translocation_localization.log") or die "ERROR: could not create file $output_directory/translocations/chr$trans_key"."_translocation_localization.log"; |
1939
|
|
|
|
|
|
|
#print the header for the output file |
1940
|
26
|
|
|
|
|
217
|
print $OUTPUT_FILE "#chr\tstart\tend\tdensity"; |
1941
|
|
|
|
|
|
|
|
1942
|
|
|
|
|
|
|
#initialize the translocation count for the first window |
1943
|
26
|
|
|
|
|
59
|
@{$genome_trans_data_windows{$trans_key}}[0] = 0; |
|
26
|
|
|
|
|
164
|
|
1944
|
|
|
|
|
|
|
|
1945
|
|
|
|
|
|
|
#calculate the translocation mutation count in the first window |
1946
|
26
|
|
|
|
|
125
|
for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){ |
1947
|
260000
|
|
|
|
|
157701
|
my %region_hash; |
1948
|
260000
|
100
|
|
|
|
318639
|
if(!defined($current_chr_data[$chr_pos])){ |
1949
|
259970
|
|
|
|
|
359428
|
next; |
1950
|
|
|
|
|
|
|
} |
1951
|
30
|
|
|
|
|
34
|
%region_hash = %{$current_chr_data[$chr_pos]}; |
|
30
|
|
|
|
|
152
|
|
1952
|
|
|
|
|
|
|
|
1953
|
30
|
|
|
|
|
41
|
my %trans_hash_in; |
1954
|
|
|
|
|
|
|
my %trans_hash_out; |
1955
|
30
|
|
|
|
|
27
|
my $size = 0; |
1956
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
#calculate the number of inbound translocation breakpoints |
1958
|
30
|
100
|
|
|
|
62
|
if(defined($region_hash{'in'})){ |
1959
|
20
|
|
|
|
|
23
|
%trans_hash_in = %{$region_hash{'in'}}; |
|
20
|
|
|
|
|
46
|
|
1960
|
|
|
|
|
|
|
|
1961
|
20
|
|
|
|
|
33
|
for my $key (keys %trans_hash_in){ |
1962
|
20
|
|
|
|
|
18
|
$size = @{$trans_hash_in{$key}}; |
|
20
|
|
|
|
|
20
|
|
1963
|
20
|
|
|
|
|
33
|
$size = $size/2; |
1964
|
20
|
|
|
|
|
19
|
@{$genome_trans_data_windows{$trans_key}}[0] += $size; |
|
20
|
|
|
|
|
57
|
|
1965
|
|
|
|
|
|
|
} |
1966
|
|
|
|
|
|
|
} |
1967
|
|
|
|
|
|
|
|
1968
|
|
|
|
|
|
|
#calculate the number of outbound translocation breakpoints |
1969
|
30
|
100
|
|
|
|
77
|
if(defined($region_hash{'out'})){ |
1970
|
18
|
|
|
|
|
17
|
%trans_hash_out = %{$region_hash{'out'}}; |
|
18
|
|
|
|
|
47
|
|
1971
|
|
|
|
|
|
|
|
1972
|
18
|
|
|
|
|
37
|
for my $key (keys %trans_hash_out){ |
1973
|
18
|
50
|
|
|
|
34
|
if($key eq $trans_key){ |
1974
|
18
|
|
|
|
|
67
|
next; |
1975
|
|
|
|
|
|
|
} |
1976
|
0
|
|
|
|
|
0
|
$size = @{$trans_hash_out{$key}}; |
|
0
|
|
|
|
|
0
|
|
1977
|
0
|
|
|
|
|
0
|
$size = $size/2; |
1978
|
0
|
|
|
|
|
0
|
@{$genome_trans_data_windows{$trans_key}}[0] += $size; |
|
0
|
|
|
|
|
0
|
|
1979
|
|
|
|
|
|
|
} |
1980
|
|
|
|
|
|
|
} |
1981
|
|
|
|
|
|
|
|
1982
|
|
|
|
|
|
|
}#for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++) |
1983
|
|
|
|
|
|
|
|
1984
|
|
|
|
|
|
|
#print the values from the first window to the output file |
1985
|
26
|
|
|
|
|
204
|
print $OUTPUT_FILE "\n"; |
1986
|
26
|
|
|
|
|
85
|
print $OUTPUT_FILE "$trans_key"; |
1987
|
26
|
|
|
|
|
48
|
print $OUTPUT_FILE "\t"; |
1988
|
26
|
|
|
|
|
47
|
print $OUTPUT_FILE "0"; |
1989
|
26
|
|
|
|
|
158
|
print $OUTPUT_FILE "\t"; |
1990
|
26
|
|
|
|
|
333
|
print $OUTPUT_FILE ($window_size)*$bin_size; |
1991
|
26
|
|
|
|
|
52
|
print $OUTPUT_FILE "\t"; |
1992
|
|
|
|
|
|
|
|
1993
|
26
|
50
|
|
|
|
63
|
if(!defined(@{$genome_trans_data_windows{$trans_key}}[0])){ |
|
26
|
|
|
|
|
193
|
|
1994
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE "0"; |
1995
|
|
|
|
|
|
|
} |
1996
|
|
|
|
|
|
|
else{ |
1997
|
26
|
|
|
|
|
40
|
my $rounded = POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[0]); |
|
26
|
|
|
|
|
218
|
|
1998
|
26
|
|
|
|
|
158
|
print $OUTPUT_FILE ($rounded)/($window_size*$bin_size); |
1999
|
|
|
|
|
|
|
#add the translocation mutation count to the total mutation count for the region |
2000
|
26
|
|
|
|
|
56
|
@{$genome_mutation_data_windows{$trans_key}}[0] += $rounded; |
|
26
|
|
|
|
|
113
|
|
2001
|
|
|
|
|
|
|
} |
2002
|
|
|
|
|
|
|
|
2003
|
|
|
|
|
|
|
#perform the sliding window analysis for the rest of the chromosome |
2004
|
26
|
|
|
|
|
120
|
for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){ |
2005
|
|
|
|
|
|
|
|
2006
|
2520292
|
100
|
|
|
|
3952139
|
if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$trans_key} ){ |
2007
|
10
|
|
|
|
|
28
|
last; |
2008
|
|
|
|
|
|
|
} |
2009
|
|
|
|
|
|
|
|
2010
|
2520282
|
|
|
|
|
1805582
|
@{$genome_trans_data_windows{$trans_key}}[$chr_pos] = 0; |
|
2520282
|
|
|
|
|
2989656
|
|
2011
|
2520282
|
|
|
|
|
1860985
|
my %prev_region_hash; |
2012
|
|
|
|
|
|
|
my %next_region_hash; |
2013
|
|
|
|
|
|
|
|
2014
|
2520282
|
|
|
|
|
1823580
|
my $prev_value = 0; |
2015
|
2520282
|
|
|
|
|
1624361
|
my $next_value = 0; |
2016
|
|
|
|
|
|
|
|
2017
|
|
|
|
|
|
|
#Caculate the number of mutations in the first bin of the previous window |
2018
|
2520282
|
100
|
|
|
|
3578042
|
if(defined($current_chr_data[$chr_pos-1])){ |
2019
|
428
|
|
|
|
|
408
|
%prev_region_hash = %{$current_chr_data[$chr_pos-1]}; |
|
428
|
|
|
|
|
1520
|
|
2020
|
|
|
|
|
|
|
|
2021
|
428
|
|
|
|
|
479
|
my $size = 0; |
2022
|
428
|
|
|
|
|
387
|
my %prev_trans_hash_in; |
2023
|
|
|
|
|
|
|
my %prev_trans_hash_out; |
2024
|
|
|
|
|
|
|
|
2025
|
428
|
100
|
|
|
|
888
|
if(defined($prev_region_hash{'in'})){ |
2026
|
234
|
|
|
|
|
240
|
%prev_trans_hash_in = %{$prev_region_hash{'in'}}; |
|
234
|
|
|
|
|
495
|
|
2027
|
|
|
|
|
|
|
|
2028
|
234
|
|
|
|
|
385
|
for my $key (keys %prev_trans_hash_in){ |
2029
|
262
|
|
|
|
|
217
|
$size = @{$prev_trans_hash_in{$key}}; |
|
262
|
|
|
|
|
289
|
|
2030
|
262
|
|
|
|
|
289
|
$size = $size/2; |
2031
|
262
|
|
|
|
|
468
|
$prev_value += $size; |
2032
|
|
|
|
|
|
|
} |
2033
|
|
|
|
|
|
|
} |
2034
|
|
|
|
|
|
|
|
2035
|
428
|
100
|
|
|
|
862
|
if(defined($prev_region_hash{'out'})){ |
2036
|
278
|
|
|
|
|
281
|
%prev_trans_hash_out = %{$prev_region_hash{'out'}}; |
|
278
|
|
|
|
|
618
|
|
2037
|
|
|
|
|
|
|
|
2038
|
278
|
|
|
|
|
518
|
for my $key (keys %prev_trans_hash_out){ |
2039
|
284
|
100
|
|
|
|
625
|
if($key eq $trans_key){ |
2040
|
184
|
|
|
|
|
546
|
next; |
2041
|
|
|
|
|
|
|
} |
2042
|
100
|
|
|
|
|
81
|
$size = @{$prev_trans_hash_out{$key}}; |
|
100
|
|
|
|
|
112
|
|
2043
|
100
|
|
|
|
|
107
|
$size = $size/2; |
2044
|
100
|
|
|
|
|
241
|
$prev_value += $size; |
2045
|
|
|
|
|
|
|
} |
2046
|
|
|
|
|
|
|
} |
2047
|
|
|
|
|
|
|
|
2048
|
|
|
|
|
|
|
} |
2049
|
|
|
|
|
|
|
|
2050
|
|
|
|
|
|
|
#Caculate the number of mutations in the last bin of the current window |
2051
|
2520282
|
100
|
|
|
|
3544084
|
if(defined($current_chr_data[$chr_pos+($window_size-1)])){ |
2052
|
456
|
|
|
|
|
393
|
%next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]}; |
|
456
|
|
|
|
|
2045
|
|
2053
|
|
|
|
|
|
|
|
2054
|
456
|
|
|
|
|
516
|
my $size = 0; |
2055
|
456
|
|
|
|
|
437
|
my %next_trans_hash_in; |
2056
|
|
|
|
|
|
|
my %next_trans_hash_out; |
2057
|
|
|
|
|
|
|
|
2058
|
456
|
100
|
|
|
|
849
|
if(defined($next_region_hash{'in'})){ |
2059
|
238
|
|
|
|
|
253
|
%next_trans_hash_in = %{$next_region_hash{'in'}}; |
|
238
|
|
|
|
|
573
|
|
2060
|
|
|
|
|
|
|
|
2061
|
238
|
|
|
|
|
421
|
for my $key (keys %next_trans_hash_in){ |
2062
|
266
|
|
|
|
|
197
|
$size = @{$next_trans_hash_in{$key}}; |
|
266
|
|
|
|
|
363
|
|
2063
|
266
|
|
|
|
|
288
|
$size = $size/2; |
2064
|
266
|
|
|
|
|
485
|
$next_value += $size; |
2065
|
|
|
|
|
|
|
} |
2066
|
|
|
|
|
|
|
} |
2067
|
|
|
|
|
|
|
|
2068
|
456
|
100
|
|
|
|
912
|
if(defined($next_region_hash{'out'})){ |
2069
|
306
|
|
|
|
|
250
|
%next_trans_hash_out = %{$next_region_hash{'out'}}; |
|
306
|
|
|
|
|
804
|
|
2070
|
|
|
|
|
|
|
|
2071
|
306
|
|
|
|
|
507
|
for my $key (keys %next_trans_hash_out){ |
2072
|
312
|
100
|
|
|
|
678
|
if($key eq $trans_key){ |
2073
|
192
|
|
|
|
|
540
|
next; |
2074
|
|
|
|
|
|
|
} |
2075
|
120
|
|
|
|
|
107
|
$size = @{$next_trans_hash_out{$key}}; |
|
120
|
|
|
|
|
147
|
|
2076
|
120
|
|
|
|
|
128
|
$size = $size/2; |
2077
|
120
|
|
|
|
|
286
|
$next_value += $size; |
2078
|
|
|
|
|
|
|
} |
2079
|
|
|
|
|
|
|
} |
2080
|
|
|
|
|
|
|
|
2081
|
|
|
|
|
|
|
|
2082
|
|
|
|
|
|
|
} |
2083
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
#total number of translocation mutations in the current window = number of mutations in previous window - the first bin of the previous window + next bin along the chromosome |
2085
|
2520282
|
|
|
|
|
1706221
|
@{$genome_trans_data_windows{$trans_key}}[$chr_pos] += (@{$genome_trans_data_windows{$trans_key}}[$chr_pos-1]) - ($prev_value) + ($next_value); |
|
2520282
|
|
|
|
|
2618698
|
|
|
2520282
|
|
|
|
|
2522535
|
|
2086
|
|
|
|
|
|
|
|
2087
|
|
|
|
|
|
|
#print values from this window |
2088
|
2520282
|
|
|
|
|
2412434
|
print $OUTPUT_FILE "\n"; |
2089
|
2520282
|
|
|
|
|
4826823
|
print $OUTPUT_FILE "$trans_key"; |
2090
|
2520282
|
|
|
|
|
1946175
|
print $OUTPUT_FILE "\t"; |
2091
|
2520282
|
|
|
|
|
7425819
|
print $OUTPUT_FILE $chr_pos*$bin_size; |
2092
|
2520282
|
|
|
|
|
1903350
|
print $OUTPUT_FILE "\t"; |
2093
|
2520282
|
|
|
|
|
4044352
|
print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size; |
2094
|
2520282
|
|
|
|
|
2127756
|
print $OUTPUT_FILE "\t"; |
2095
|
|
|
|
|
|
|
|
2096
|
2520282
|
50
|
|
|
|
1771966
|
if(!defined(@{$genome_trans_data_windows{$trans_key}}[$chr_pos])){ |
|
2520282
|
|
|
|
|
3549301
|
|
2097
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE "0"; |
2098
|
|
|
|
|
|
|
} |
2099
|
|
|
|
|
|
|
else{ |
2100
|
2520282
|
|
|
|
|
1853634
|
my $rounded = POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[$chr_pos]); |
|
2520282
|
|
|
|
|
3837198
|
|
2101
|
2520282
|
|
|
|
|
3853291
|
print $OUTPUT_FILE ($rounded)/($window_size*$bin_size); |
2102
|
2520282
|
|
|
|
|
1811317
|
@{$genome_mutation_data_windows{$trans_key}}[$chr_pos] += $rounded; |
|
2520282
|
|
|
|
|
6147554
|
|
2103
|
|
|
|
|
|
|
} |
2104
|
|
|
|
|
|
|
}#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++) |
2105
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
|
2107
|
26
|
|
|
|
|
2682
|
close ($OUTPUT_FILE); |
2108
|
|
|
|
|
|
|
} |
2109
|
26
|
|
|
|
|
40244
|
@current_chr_data = (); |
2110
|
|
|
|
|
|
|
} |
2111
|
|
|
|
|
|
|
|
2112
|
|
|
|
|
|
|
#calculate the density of both types of mutations in each window in the genome |
2113
|
2
|
|
|
|
|
27
|
for my $mutation_key ( keys %genome_mutation_data_windows){ |
2114
|
|
|
|
|
|
|
|
2115
|
|
|
|
|
|
|
#check that some data exisits for the current chromosome |
2116
|
48
|
100
|
|
|
|
100
|
if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){ |
|
48
|
|
|
|
|
280
|
|
2117
|
|
|
|
|
|
|
|
2118
|
|
|
|
|
|
|
#create the output file |
2119
|
42
|
50
|
|
|
|
277793
|
open ($OUTPUT_FILE, ">", "$output_directory/all_types/chr$mutation_key"."_mutation_localization.log") or die "ERROR: could not create file $output_directory/all_types/chr$current_chr"."_mutation_localization.log"; |
2120
|
|
|
|
|
|
|
#print the header for the output file |
2121
|
42
|
|
|
|
|
176
|
print $OUTPUT_FILE "#chr\tstart\tend\tdensity"; |
2122
|
|
|
|
|
|
|
|
2123
|
42
|
|
|
|
|
61
|
my $density; |
2124
|
|
|
|
|
|
|
|
2125
|
|
|
|
|
|
|
#for every bin along the chromosome |
2126
|
42
|
|
|
|
|
109
|
for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){ |
|
4776020
|
|
|
|
|
7680664
|
|
2127
|
4775978
|
|
|
|
|
3117194
|
$total_genome_windows++; #increment the total number of windows in the genome |
2128
|
|
|
|
|
|
|
|
2129
|
|
|
|
|
|
|
#calculate the density of mutations in the window |
2130
|
4775978
|
50
|
|
|
|
3148584
|
if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){ |
|
4775978
|
|
|
|
|
6429662
|
|
2131
|
0
|
|
|
|
|
0
|
$density = 0; |
2132
|
|
|
|
|
|
|
} |
2133
|
|
|
|
|
|
|
else{ |
2134
|
4775978
|
|
|
|
|
3119126
|
$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size); |
|
4775978
|
|
|
|
|
5690124
|
|
2135
|
|
|
|
|
|
|
} |
2136
|
|
|
|
|
|
|
|
2137
|
|
|
|
|
|
|
#sum the density values to calculate the mean |
2138
|
4775978
|
|
|
|
|
3950367
|
$genome_mean_mutation_density += $density; |
2139
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
#print the values for the window |
2141
|
4775978
|
|
|
|
|
3822716
|
print $OUTPUT_FILE "\n"; |
2142
|
4775978
|
|
|
|
|
3806028
|
print $OUTPUT_FILE "$mutation_key"; |
2143
|
4775978
|
|
|
|
|
3519477
|
print $OUTPUT_FILE "\t"; |
2144
|
4775978
|
|
|
|
|
9288168
|
print $OUTPUT_FILE $chr_pos*$bin_size; |
2145
|
4775978
|
|
|
|
|
3564978
|
print $OUTPUT_FILE "\t"; |
2146
|
4775978
|
|
|
|
|
6235293
|
print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size; |
2147
|
4775978
|
|
|
|
|
4516896
|
print $OUTPUT_FILE "\t"; |
2148
|
4775978
|
|
|
|
|
6815745
|
print $OUTPUT_FILE $density; |
2149
|
|
|
|
|
|
|
|
2150
|
|
|
|
|
|
|
}#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++) |
2151
|
|
|
|
|
|
|
|
2152
|
42
|
|
|
|
|
3911
|
close ($OUTPUT_FILE); |
2153
|
|
|
|
|
|
|
} |
2154
|
|
|
|
|
|
|
}#for my $mutation_key ( keys %genome_mutation_data_windows) |
2155
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
#calculate the mean mutation density for the windows in the genome |
2157
|
2
|
|
|
|
|
23
|
$genome_mean_mutation_density = $genome_mean_mutation_density/$total_genome_windows; |
2158
|
|
|
|
|
|
|
|
2159
|
|
|
|
|
|
|
#find the sum of squared difference between the density of mutation of each window and the mean density of mutation |
2160
|
2
|
|
|
|
|
20
|
for my $mutation_key ( keys %genome_mutation_data_windows){ |
2161
|
48
|
100
|
|
|
|
79
|
if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){ |
|
48
|
|
|
|
|
221
|
|
2162
|
42
|
|
|
|
|
42
|
my $density; |
2163
|
|
|
|
|
|
|
|
2164
|
42
|
|
|
|
|
93
|
for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){ |
|
4776020
|
|
|
|
|
7097656
|
|
2165
|
4775978
|
50
|
|
|
|
3122829
|
if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){ |
|
4775978
|
|
|
|
|
6280127
|
|
2166
|
0
|
|
|
|
|
0
|
$density = 0; |
2167
|
|
|
|
|
|
|
} |
2168
|
|
|
|
|
|
|
else{ |
2169
|
4775978
|
|
|
|
|
3149281
|
$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size); |
|
4775978
|
|
|
|
|
4898174
|
|
2170
|
|
|
|
|
|
|
} |
2171
|
|
|
|
|
|
|
|
2172
|
|
|
|
|
|
|
#sum the squared differences |
2173
|
4775978
|
|
|
|
|
5092304
|
$genome_mutation_density_standard_deviation += ($density-$genome_mean_mutation_density)**2; |
2174
|
|
|
|
|
|
|
} |
2175
|
|
|
|
|
|
|
} |
2176
|
|
|
|
|
|
|
}#for my $mutation_key ( keys %genome_mutation_data_windows) |
2177
|
|
|
|
|
|
|
|
2178
|
|
|
|
|
|
|
#divided the sum of the squared differnces by the total number of windows and take the square root |
2179
|
2
|
|
|
|
|
39
|
$genome_mutation_density_standard_deviation = ($genome_mutation_density_standard_deviation/$total_genome_windows)**0.5; |
2180
|
|
|
|
|
|
|
|
2181
|
|
|
|
|
|
|
#calculate z scores for each window and check if the window is greater than 2 SDs away from genome mean |
2182
|
|
|
|
|
|
|
#use this value to identify highly mutated regions |
2183
|
2
|
|
|
|
|
14
|
for my $mutation_key ( keys %genome_mutation_data_windows){ |
2184
|
48
|
100
|
|
|
|
64
|
if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){ |
|
48
|
|
|
|
|
198
|
|
2185
|
42
|
|
|
|
|
44
|
my $density; |
2186
|
42
|
|
|
|
|
72
|
my $region_z_score = 0; |
2187
|
|
|
|
|
|
|
|
2188
|
42
|
|
|
|
|
58
|
for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){ |
|
4776020
|
|
|
|
|
7101599
|
|
2189
|
|
|
|
|
|
|
|
2190
|
4775978
|
50
|
|
|
|
3201243
|
if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){ |
|
4775978
|
|
|
|
|
6496156
|
|
2191
|
0
|
|
|
|
|
0
|
$density = 0; |
2192
|
|
|
|
|
|
|
} |
2193
|
|
|
|
|
|
|
else{ |
2194
|
4775978
|
|
|
|
|
3127809
|
$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size); |
|
4775978
|
|
|
|
|
5265502
|
|
2195
|
|
|
|
|
|
|
} |
2196
|
|
|
|
|
|
|
|
2197
|
|
|
|
|
|
|
#calculate z score for the window |
2198
|
4775978
|
|
|
|
|
3629181
|
$region_z_score = ($density-$genome_mean_mutation_density)/$genome_mutation_density_standard_deviation; |
2199
|
|
|
|
|
|
|
|
2200
|
|
|
|
|
|
|
#check if the z score is above the threshold |
2201
|
4775978
|
100
|
|
|
|
8215524
|
if( $region_z_score >= $outlier_deviation ) { |
|
|
100
|
|
|
|
|
|
2202
|
88404
|
100
|
|
|
|
112082
|
if($in_suspect_region!=1){ |
2203
|
8
|
|
|
|
|
12
|
$suspect_start = $chr_pos*$bin_size; |
2204
|
8
|
|
|
|
|
12
|
$in_suspect_region = 1; |
2205
|
|
|
|
|
|
|
} |
2206
|
88404
|
|
|
|
|
64480
|
$suspect_chr = $mutation_key; |
2207
|
88404
|
|
|
|
|
68734
|
$suspect_end = ($chr_pos+$window_size)*$bin_size; |
2208
|
|
|
|
|
|
|
} |
2209
|
|
|
|
|
|
|
elsif ($in_suspect_region==1){ |
2210
|
|
|
|
|
|
|
#once a region has been called push the chromosome, start and end positions into the suspect region array |
2211
|
8
|
|
|
|
|
53
|
push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end)); |
2212
|
8
|
|
|
|
|
12
|
$suspect_chr = -1; |
2213
|
8
|
|
|
|
|
10
|
$suspect_start = -1; |
2214
|
8
|
|
|
|
|
10
|
$suspect_end = -1; |
2215
|
8
|
|
|
|
|
8
|
$in_suspect_region = 0; |
2216
|
|
|
|
|
|
|
} |
2217
|
|
|
|
|
|
|
|
2218
|
|
|
|
|
|
|
#check if the z score is below the threshold but still suspicously high |
2219
|
4775978
|
100
|
100
|
|
|
15196012
|
if( |
|
|
100
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
( $region_z_score < $outlier_deviation ) && |
2221
|
|
|
|
|
|
|
( $region_z_score >= ($outlier_deviation-1) ) |
2222
|
|
|
|
|
|
|
){ |
2223
|
66812
|
100
|
|
|
|
84473
|
if($in_likely_region!=1){ |
2224
|
26
|
|
|
|
|
33
|
$likely_start = $chr_pos*$bin_size; |
2225
|
26
|
|
|
|
|
42
|
$in_likely_region = 1; |
2226
|
|
|
|
|
|
|
} |
2227
|
66812
|
|
|
|
|
53962
|
$likely_chr = $mutation_key; |
2228
|
66812
|
|
|
|
|
65530
|
$likely_end = ($chr_pos+$window_size)*$bin_size; |
2229
|
|
|
|
|
|
|
} |
2230
|
|
|
|
|
|
|
elsif ($in_likely_region==1){ |
2231
|
24
|
|
|
|
|
105
|
push (@likely_regions, ($likely_chr,$likely_start,$likely_end)); |
2232
|
24
|
|
|
|
|
29
|
$likely_chr = -1; |
2233
|
24
|
|
|
|
|
29
|
$likely_start = -1; |
2234
|
24
|
|
|
|
|
23
|
$likely_end = -1; |
2235
|
24
|
|
|
|
|
35
|
$in_likely_region = 0; |
2236
|
|
|
|
|
|
|
} |
2237
|
|
|
|
|
|
|
|
2238
|
|
|
|
|
|
|
}#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++) |
2239
|
|
|
|
|
|
|
|
2240
|
|
|
|
|
|
|
#check if the last region analyzed was suspicious and push its data into the appropriate array |
2241
|
42
|
50
|
|
|
|
156
|
if ($in_suspect_region==1){ |
2242
|
0
|
|
|
|
|
0
|
push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end)); |
2243
|
0
|
|
|
|
|
0
|
$suspect_chr = -1; |
2244
|
0
|
|
|
|
|
0
|
$suspect_start = -1; |
2245
|
0
|
|
|
|
|
0
|
$suspect_end = -1; |
2246
|
0
|
|
|
|
|
0
|
$in_suspect_region = 0; |
2247
|
|
|
|
|
|
|
} |
2248
|
|
|
|
|
|
|
|
2249
|
42
|
100
|
|
|
|
220
|
if ($in_likely_region==1){ |
2250
|
2
|
|
|
|
|
18
|
push (@likely_regions, ($likely_chr,$likely_start,$likely_end)); |
2251
|
2
|
|
|
|
|
4
|
$likely_chr = -1; |
2252
|
2
|
|
|
|
|
6
|
$likely_start = -1; |
2253
|
2
|
|
|
|
|
4
|
$likely_end = -1; |
2254
|
2
|
|
|
|
|
8
|
$in_likely_region = 0; |
2255
|
|
|
|
|
|
|
} |
2256
|
|
|
|
|
|
|
} |
2257
|
|
|
|
|
|
|
}#for my $mutation_key ( keys %genome_mutation_data_windows) |
2258
|
|
|
|
|
|
|
|
2259
|
|
|
|
|
|
|
|
2260
|
2
|
|
|
|
|
50
|
return (\@suspect_regions, \@likely_regions, \%genome_cnv_data_windows, \%genome_trans_data_windows, \%genome_mutation_data_windows); |
2261
|
|
|
|
|
|
|
|
2262
|
|
|
|
|
|
|
}#sub calculate_chromosome_localization |
2263
|
|
|
|
|
|
|
|
2264
|
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
|
#=head2 Sub-Method: check_copy_number_count |
2266
|
|
|
|
|
|
|
|
2267
|
|
|
|
|
|
|
### check_copy_number_count ####################################################################### |
2268
|
|
|
|
|
|
|
# Description: |
2269
|
|
|
|
|
|
|
# Produces an output file that records the number of regions of copy-number variation that |
2270
|
|
|
|
|
|
|
# are present in each chromosome. |
2271
|
|
|
|
|
|
|
# |
2272
|
|
|
|
|
|
|
# Input variables: |
2273
|
|
|
|
|
|
|
# $output_directory: stores the path to the output directory |
2274
|
|
|
|
|
|
|
# $chromosome_copy_number_count_hash_ref: reference to hash that stores the count of regions |
2275
|
|
|
|
|
|
|
# of copy-number variation on each chromosome |
2276
|
|
|
|
|
|
|
# |
2277
|
|
|
|
|
|
|
|
2278
|
|
|
|
|
|
|
#=cut |
2279
|
|
|
|
|
|
|
|
2280
|
|
|
|
|
|
|
sub check_copy_number_count { |
2281
|
|
|
|
|
|
|
|
2282
|
|
|
|
|
|
|
#parse parameters |
2283
|
1
|
|
|
1
|
0
|
9
|
my $output_directory = shift; |
2284
|
1
|
|
|
|
|
39
|
my $chromosome_copy_number_count_hash_ref = shift; |
2285
|
|
|
|
|
|
|
|
2286
|
1
|
|
|
|
|
1
|
my $OUTPUT_FILE; #file handle to output file |
2287
|
|
|
|
|
|
|
|
2288
|
|
|
|
|
|
|
#open the output file |
2289
|
1
|
50
|
|
|
|
145
|
open ($OUTPUT_FILE, ">", "$output_directory/copy_number_count.log") or die "ERROR: could not create file $output_directory/copy_number_count.log\n"; |
2290
|
|
|
|
|
|
|
|
2291
|
|
|
|
|
|
|
#print the header |
2292
|
1
|
|
|
|
|
5
|
print $OUTPUT_FILE "#chr\tcopy_number\tnumber_of_regions"; |
2293
|
|
|
|
|
|
|
|
2294
|
|
|
|
|
|
|
#for each chromosome |
2295
|
|
|
|
|
|
|
#print out the number of regions with the given copy-number |
2296
|
9
|
|
|
9
|
|
35802
|
{use sort 'stable'; |
|
9
|
|
|
|
|
23
|
|
|
9
|
|
|
|
|
69
|
|
|
1
|
|
|
|
|
1
|
|
2297
|
1
|
|
|
|
|
44
|
for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){ |
2298
|
20
|
|
|
|
|
12
|
my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}}; |
|
20
|
|
|
|
|
63
|
|
2299
|
20
|
|
|
|
|
35
|
for my $CN (sort {$a <=> $b} keys %intermediate_hash){ |
|
103
|
|
|
|
|
106
|
|
2300
|
80
|
|
|
|
|
108
|
print $OUTPUT_FILE "\n"; |
2301
|
80
|
|
|
|
|
57
|
print $OUTPUT_FILE "$chr"; #chromosome |
2302
|
80
|
|
|
|
|
53
|
print $OUTPUT_FILE "\t"; |
2303
|
80
|
|
|
|
|
57
|
print $OUTPUT_FILE "$CN"; #copy-number |
2304
|
80
|
|
|
|
|
56
|
print $OUTPUT_FILE "\t"; |
2305
|
80
|
|
|
|
|
131
|
print $OUTPUT_FILE $chromosome_copy_number_count_hash_ref->{$chr}->{$CN}; #number of regions with copy-number $CN |
2306
|
|
|
|
|
|
|
} |
2307
|
|
|
|
|
|
|
} |
2308
|
|
|
|
|
|
|
}#use sort 'stable' |
2309
|
1
|
|
|
|
|
96
|
close ($OUTPUT_FILE); |
2310
|
|
|
|
|
|
|
|
2311
|
|
|
|
|
|
|
}#sub check_copy_number_count |
2312
|
|
|
|
|
|
|
|
2313
|
|
|
|
|
|
|
#=head2 Sub-Method: check_copy_number_switches |
2314
|
|
|
|
|
|
|
|
2315
|
|
|
|
|
|
|
### check_copy_number_switches #################################################################### |
2316
|
|
|
|
|
|
|
# Description: |
2317
|
|
|
|
|
|
|
# Creates an output file that records the number of breakpoints between CNV regions on each |
2318
|
|
|
|
|
|
|
# chromosome |
2319
|
|
|
|
|
|
|
# |
2320
|
|
|
|
|
|
|
# Input variables: |
2321
|
|
|
|
|
|
|
# $output_directory: stores path to output directory |
2322
|
|
|
|
|
|
|
# $chromosome_copy_number_count_hash_ref: reference to hash that stores the count of regions |
2323
|
|
|
|
|
|
|
# of copy-number variation on each chromosome |
2324
|
|
|
|
|
|
|
# |
2325
|
|
|
|
|
|
|
|
2326
|
|
|
|
|
|
|
#=cut |
2327
|
|
|
|
|
|
|
|
2328
|
|
|
|
|
|
|
sub check_copy_number_switches { |
2329
|
|
|
|
|
|
|
|
2330
|
|
|
|
|
|
|
#parse parameters |
2331
|
1
|
|
|
1
|
0
|
45
|
my $output_directory = shift; |
2332
|
1
|
|
|
|
|
2
|
my $chromosome_copy_number_count_hash_ref = shift; |
2333
|
|
|
|
|
|
|
|
2334
|
1
|
|
|
|
|
1
|
my $switch_count = 0; |
2335
|
|
|
|
|
|
|
|
2336
|
1
|
|
|
|
|
2
|
my $OUTPUT_FILE; #file handle to output file |
2337
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
#open output file |
2339
|
1
|
50
|
|
|
|
102
|
open ($OUTPUT_FILE, ">", "$output_directory/copy_number_switches.log") or die "ERROR: could not create file $output_directory/copy_number_switches.log\n"; |
2340
|
|
|
|
|
|
|
|
2341
|
|
|
|
|
|
|
#print header |
2342
|
1
|
|
|
|
|
4
|
print $OUTPUT_FILE "#chr\tswitch_count"; |
2343
|
|
|
|
|
|
|
|
2344
|
|
|
|
|
|
|
#for each chromosome |
2345
|
|
|
|
|
|
|
#sum the total number of CNV events |
2346
|
|
|
|
|
|
|
#multiply the sum by 2 to get the number of CNV breakpoints |
2347
|
9
|
|
|
9
|
|
2627
|
{use sort 'stable'; |
|
9
|
|
|
|
|
16
|
|
|
9
|
|
|
|
|
37
|
|
|
1
|
|
|
|
|
2
|
|
2348
|
1
|
|
|
|
|
15
|
for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){ |
2349
|
20
|
|
|
|
|
17
|
my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}}; |
|
20
|
|
|
|
|
53
|
|
2350
|
20
|
|
|
|
|
37
|
for my $CN (sort {$intermediate_hash{$b} <=> $intermediate_hash{$a} } keys %intermediate_hash){ |
|
106
|
|
|
|
|
94
|
|
2351
|
|
|
|
|
|
|
#only count regions that have abberant copy numbers |
2352
|
80
|
50
|
|
|
|
103
|
if($CN != 2){ |
2353
|
80
|
|
|
|
|
91
|
$switch_count += ($chromosome_copy_number_count_hash_ref->{$chr}->{$CN} * 2); |
2354
|
|
|
|
|
|
|
} |
2355
|
|
|
|
|
|
|
} |
2356
|
|
|
|
|
|
|
|
2357
|
|
|
|
|
|
|
#print values to output file |
2358
|
20
|
|
|
|
|
24
|
print $OUTPUT_FILE "\n"; |
2359
|
20
|
|
|
|
|
16
|
print $OUTPUT_FILE "$chr"; |
2360
|
20
|
|
|
|
|
14
|
print $OUTPUT_FILE "\t"; |
2361
|
20
|
|
|
|
|
30
|
print $OUTPUT_FILE $switch_count; |
2362
|
|
|
|
|
|
|
|
2363
|
20
|
|
|
|
|
23
|
$switch_count = 0; |
2364
|
|
|
|
|
|
|
} |
2365
|
|
|
|
|
|
|
}#use sort 'stable' |
2366
|
|
|
|
|
|
|
|
2367
|
1
|
|
|
|
|
33
|
close ($OUTPUT_FILE); |
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
}#sub check_copy_number_switches |
2370
|
|
|
|
|
|
|
|
2371
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_interchromosomal_translocation_rate |
2372
|
|
|
|
|
|
|
|
2373
|
|
|
|
|
|
|
### calculate_interchromosomal_translocation_rate ################################################# |
2374
|
|
|
|
|
|
|
# Description: |
2375
|
|
|
|
|
|
|
# Create an output file that records the number of translocations between each and every |
2376
|
|
|
|
|
|
|
# chromosome |
2377
|
|
|
|
|
|
|
# |
2378
|
|
|
|
|
|
|
# Input variables: |
2379
|
|
|
|
|
|
|
# $output_directory: stores path to output directory |
2380
|
|
|
|
|
|
|
# $chromosome_translocation_count_hash_ref: reference to hash that stores the count of |
2381
|
|
|
|
|
|
|
# translocations between each chromosome |
2382
|
|
|
|
|
|
|
# |
2383
|
|
|
|
|
|
|
|
2384
|
|
|
|
|
|
|
#=cut |
2385
|
|
|
|
|
|
|
|
2386
|
|
|
|
|
|
|
sub calculate_interchromosomal_translocation_rate { |
2387
|
|
|
|
|
|
|
|
2388
|
|
|
|
|
|
|
#parse parameters |
2389
|
1
|
|
|
1
|
0
|
9
|
my $output_directory = shift; |
2390
|
1
|
|
|
|
|
3
|
my $chromosome_translocation_count_hash_ref = shift; |
2391
|
|
|
|
|
|
|
|
2392
|
1
|
|
|
|
|
2
|
my $OUTPUT_FILE; #file handle to output file |
2393
|
|
|
|
|
|
|
|
2394
|
|
|
|
|
|
|
#open output file |
2395
|
1
|
50
|
|
|
|
153
|
open ($OUTPUT_FILE, ">", "$output_directory/interchromosomal_translocation_rate.log") or die "ERROR: could not create file $output_directory/interchromosomal_translocation_rate.log\n"; |
2396
|
|
|
|
|
|
|
|
2397
|
|
|
|
|
|
|
#print header |
2398
|
1
|
|
|
|
|
19
|
print $OUTPUT_FILE "#chr1\tchr2\tcount"; |
2399
|
|
|
|
|
|
|
|
2400
|
|
|
|
|
|
|
#for each chromosome |
2401
|
|
|
|
|
|
|
#print the number of translocations between every other chromosome |
2402
|
9
|
|
|
9
|
|
2679
|
{use sort 'stable'; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
45
|
|
|
1
|
|
|
|
|
2
|
|
2403
|
1
|
|
|
|
|
14
|
for my $chr1 (sort keys %$chromosome_translocation_count_hash_ref){ |
2404
|
13
|
|
|
|
|
12
|
my %intermediate_hash = %{$chromosome_translocation_count_hash_ref->{$chr1}}; |
|
13
|
|
|
|
|
34
|
|
2405
|
|
|
|
|
|
|
|
2406
|
13
|
|
|
|
|
23
|
for my $chr2 (sort keys %intermediate_hash){ |
2407
|
26
|
|
|
|
|
24
|
print $OUTPUT_FILE "\n"; |
2408
|
26
|
|
|
|
|
18
|
print $OUTPUT_FILE $chr1; |
2409
|
26
|
|
|
|
|
25
|
print $OUTPUT_FILE "\t"; |
2410
|
26
|
|
|
|
|
19
|
print $OUTPUT_FILE $chr2; |
2411
|
26
|
|
|
|
|
19
|
print $OUTPUT_FILE "\t"; |
2412
|
26
|
|
|
|
|
58
|
print $OUTPUT_FILE $chromosome_translocation_count_hash_ref->{$chr1}->{$chr2}; |
2413
|
|
|
|
|
|
|
} |
2414
|
|
|
|
|
|
|
} |
2415
|
|
|
|
|
|
|
}#use sort 'stable' |
2416
|
1
|
|
|
|
|
34
|
close ($OUTPUT_FILE); |
2417
|
|
|
|
|
|
|
|
2418
|
|
|
|
|
|
|
}#sub calculate_interchromosomal_translocation_rate |
2419
|
|
|
|
|
|
|
|
2420
|
|
|
|
|
|
|
|
2421
|
|
|
|
|
|
|
#=head2 Sub-Method: analyze_suspect_regions |
2422
|
|
|
|
|
|
|
|
2423
|
|
|
|
|
|
|
### analyze_suspect_regions ####################################################################### |
2424
|
|
|
|
|
|
|
# Description: |
2425
|
|
|
|
|
|
|
# Produces the final report output file, that includes the chromothriptic scores for each of |
2426
|
|
|
|
|
|
|
# the highly mutated regions |
2427
|
|
|
|
|
|
|
# |
2428
|
|
|
|
|
|
|
# Input variables: |
2429
|
|
|
|
|
|
|
# $output_directory: stores path to the output directory |
2430
|
|
|
|
|
|
|
# $suspect_regions_array_ref: reference to array storing the chromosome, |
2431
|
|
|
|
|
|
|
# start, and end position of highly mutated |
2432
|
|
|
|
|
|
|
# regions |
2433
|
|
|
|
|
|
|
# $genome_mutation_density_hash_ref: stores the average mutation density of each |
2434
|
|
|
|
|
|
|
# chromosome |
2435
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref: stores the position of CNV mutations on |
2436
|
|
|
|
|
|
|
# each chromosome |
2437
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: stores the position of translocation events |
2438
|
|
|
|
|
|
|
# on each chromosome |
2439
|
|
|
|
|
|
|
# $genome_trans_insertion_breakpoints_hash_ref: stores the position of insertions on each |
2440
|
|
|
|
|
|
|
# chromosome |
2441
|
|
|
|
|
|
|
# $bin_size: stores the size of a single bin |
2442
|
|
|
|
|
|
|
# $localization_window_size: stores the number of bins to include in a |
2443
|
|
|
|
|
|
|
# window |
2444
|
|
|
|
|
|
|
# $tp53_mutated: stores whether the TP53 gene is mutatated |
2445
|
|
|
|
|
|
|
# or not |
2446
|
|
|
|
|
|
|
# $tp53_mutation_found: stores whether or not a mutation was found |
2447
|
|
|
|
|
|
|
# in the TP53 loci |
2448
|
|
|
|
|
|
|
# $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on |
2449
|
|
|
|
|
|
|
# each chromosome |
2450
|
|
|
|
|
|
|
# $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on |
2451
|
|
|
|
|
|
|
# each chromosome |
2452
|
|
|
|
|
|
|
# |
2453
|
|
|
|
|
|
|
|
2454
|
|
|
|
|
|
|
#=cut |
2455
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
sub analyze_suspect_regions { |
2457
|
|
|
|
|
|
|
|
2458
|
|
|
|
|
|
|
#parse parameters |
2459
|
0
|
|
|
0
|
0
|
0
|
my $output_directory = shift; |
2460
|
|
|
|
|
|
|
|
2461
|
0
|
|
|
|
|
0
|
my $suspect_regions_array_ref = shift; |
2462
|
0
|
|
|
|
|
0
|
my @suspect_regions = @$suspect_regions_array_ref; |
2463
|
|
|
|
|
|
|
|
2464
|
0
|
|
|
|
|
0
|
my $genome_mutation_density_hash_ref = shift; |
2465
|
0
|
|
|
|
|
0
|
my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
2466
|
|
|
|
|
|
|
|
2467
|
0
|
|
|
|
|
0
|
my $genome_cnv_data_hash_ref = shift; |
2468
|
0
|
|
|
|
|
0
|
my $genome_trans_data_hash_ref = shift; |
2469
|
0
|
|
|
|
|
0
|
my $genome_trans_insertion_breakpoints_hash_ref = shift; |
2470
|
|
|
|
|
|
|
|
2471
|
0
|
|
|
|
|
0
|
my $bin_size = shift; |
2472
|
0
|
|
|
|
|
0
|
my $localization_window_size = shift; |
2473
|
|
|
|
|
|
|
|
2474
|
0
|
|
|
|
|
0
|
my $tp53_mutated = shift; |
2475
|
0
|
|
|
|
|
0
|
my $tp53_mutation_found = shift; |
2476
|
|
|
|
|
|
|
|
2477
|
0
|
|
|
|
|
0
|
my $chromosome_cnv_breakpoints_hash_ref = shift; |
2478
|
0
|
|
|
|
|
0
|
my $chromosome_loh_breakpoints_hash_ref = shift; |
2479
|
|
|
|
|
|
|
|
2480
|
|
|
|
|
|
|
|
2481
|
0
|
|
|
|
|
0
|
my $suspect_regions_size = @suspect_regions; |
2482
|
|
|
|
|
|
|
|
2483
|
0
|
|
|
|
|
0
|
my $OUTPUT_FILE; |
2484
|
|
|
|
|
|
|
|
2485
|
0
|
|
|
|
|
0
|
my @suspect_region_data = (); |
2486
|
|
|
|
|
|
|
|
2487
|
0
|
|
|
|
|
0
|
my $header_string; |
2488
|
|
|
|
|
|
|
|
2489
|
|
|
|
|
|
|
#check that the suspect region data array is not malformed, should contain sets of 3 elements |
2490
|
0
|
0
|
|
|
|
0
|
if($suspect_regions_size % 3 != 0){ |
2491
|
0
|
|
|
|
|
0
|
die "ERROR: suspect_regions_array has $suspect_regions_size entries. Value must be divisible by 3.\n"; |
2492
|
|
|
|
|
|
|
} |
2493
|
|
|
|
|
|
|
|
2494
|
|
|
|
|
|
|
#create output directory for report files |
2495
|
0
|
0
|
|
|
|
0
|
mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions"); |
2496
|
0
|
0
|
|
|
|
0
|
if(!(-e "$output_directory"."suspect_regions")){ |
2497
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory"."suspect_regions"; |
2498
|
|
|
|
|
|
|
} |
2499
|
|
|
|
|
|
|
|
2500
|
|
|
|
|
|
|
#open final report output file |
2501
|
0
|
0
|
|
|
|
0
|
open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/suspect_regions.yml") or die "ERROR: could not create file: $output_directory"."suspect_regions/suspect_regions.yml\n"; |
2502
|
|
|
|
|
|
|
|
2503
|
|
|
|
|
|
|
#construct and print header |
2504
|
0
|
|
|
|
|
0
|
$header_string = "file: Suspect Chromothriptic Regions\n"; |
2505
|
0
|
|
|
|
|
0
|
$header_string .= "bin_size:\t\t\t$bin_size\n"; |
2506
|
0
|
|
|
|
|
0
|
$header_string .= "localization_window_size:\t$localization_window_size\n"; |
2507
|
0
|
|
|
|
|
0
|
$header_string .= "\n"; |
2508
|
0
|
|
|
|
|
0
|
$header_string .= "genome_localization_score_weight:\t$genome_localization_weight\n"; |
2509
|
0
|
|
|
|
|
0
|
$header_string .= "chromosome_localization_score_weight:\t$chromosome_localization_weight\n"; |
2510
|
0
|
|
|
|
|
0
|
$header_string .= "cnv_score_weight:\t\t\t$cnv_weight\n"; |
2511
|
0
|
|
|
|
|
0
|
$header_string .= "translocation_score_weight:\t\t$translocation_weight\n"; |
2512
|
0
|
|
|
|
|
0
|
$header_string .= "insertion_breakpoint_score_weight:\t$insertion_breakpoint_weight\n"; |
2513
|
0
|
|
|
|
|
0
|
$header_string .= "loh_score_weight:\t\t\t$loh_weight\n"; |
2514
|
0
|
|
|
|
|
0
|
$header_string .= "tp53_mutation_score_weight:\t\t$tp53_mutated_weight\n"; |
2515
|
0
|
|
|
|
|
0
|
$header_string .= "\n"; |
2516
|
0
|
|
|
|
|
0
|
$header_string .= "min_mutation_density_z_score:\t$outlier_deviation\n"; |
2517
|
0
|
|
|
|
|
0
|
$header_string .= "---\n"; |
2518
|
0
|
|
|
|
|
0
|
$header_string .= "\n"; |
2519
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE $header_string; |
2520
|
|
|
|
|
|
|
|
2521
|
|
|
|
|
|
|
#calculate a chromothripsis score for each region that was present in the suspect region array |
2522
|
|
|
|
|
|
|
#store the results of the score calculation in a 2d array where elements in the first dimension correspond to each suspect region |
2523
|
|
|
|
|
|
|
#and the elements in the second dimension are the results of the score calculation |
2524
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < $suspect_regions_size; $i+=3){ |
2525
|
0
|
|
|
|
|
0
|
my @region_data = (); |
2526
|
0
|
|
|
|
|
0
|
$region_data[0] = $suspect_regions[$i]; #chr |
2527
|
0
|
|
|
|
|
0
|
$region_data[1] = $suspect_regions[$i+1]; #start |
2528
|
0
|
|
|
|
|
0
|
$region_data[2] = $suspect_regions[$i+2]; #end |
2529
|
|
|
|
|
|
|
|
2530
|
0
|
|
|
|
|
0
|
($region_data[3], $region_data[4], $region_data[5], $region_data[6], $region_data[7], $region_data[8], $region_data[9], $region_data[10], $region_data[11], $region_data[12], $region_data[13]) = calculate_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $tp53_mutated, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref, $bin_size); |
2531
|
|
|
|
|
|
|
|
2532
|
|
|
|
|
|
|
#add the results of the score calculation for this region to the array storing all the results |
2533
|
0
|
|
|
|
|
0
|
push @suspect_region_data, [@region_data]; |
2534
|
|
|
|
|
|
|
} |
2535
|
|
|
|
|
|
|
|
2536
|
|
|
|
|
|
|
#sort the results so that the region with the highest chromothriptic score will be printed |
2537
|
|
|
|
|
|
|
#to the final report output file first |
2538
|
9
|
|
|
9
|
|
6023
|
{use sort 'stable'; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
39
|
|
|
0
|
|
|
|
|
0
|
|
2539
|
0
|
|
|
|
|
0
|
@suspect_region_data = sort {$b->[3] <=> $a->[3] } @suspect_region_data; |
|
0
|
|
|
|
|
0
|
|
2540
|
|
|
|
|
|
|
}#use sort 'stable'A |
2541
|
|
|
|
|
|
|
|
2542
|
|
|
|
|
|
|
#for each score that is generated print the score and the related statistics for that region |
2543
|
0
|
|
|
|
|
0
|
foreach my $score_data (@suspect_region_data){ |
2544
|
0
|
|
|
|
|
0
|
my $chr = $score_data->[0]; #chr |
2545
|
0
|
|
|
|
|
0
|
my $start = $score_data->[1]; #start |
2546
|
0
|
|
|
|
|
0
|
my $end = $score_data->[2]; #end |
2547
|
|
|
|
|
|
|
|
2548
|
0
|
|
|
|
|
0
|
my $score = sprintf("%.5f",$score_data->[3]); |
2549
|
0
|
|
|
|
|
0
|
my $chr_z_score = $score_data->[4]; |
2550
|
0
|
|
|
|
|
0
|
my $region_density = sprintf("%e",$score_data->[5]); |
2551
|
|
|
|
|
|
|
|
2552
|
0
|
|
|
|
|
0
|
my $cnv_number_hash_ref = $score_data->[6]; |
2553
|
0
|
|
|
|
|
0
|
my %cnv_number_hash; |
2554
|
|
|
|
|
|
|
my $num_copy_num; |
2555
|
0
|
0
|
|
|
|
0
|
if(defined($cnv_number_hash_ref)){ |
2556
|
0
|
|
|
|
|
0
|
%cnv_number_hash = %{$cnv_number_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
2557
|
0
|
|
|
|
|
0
|
$num_copy_num = keys %cnv_number_hash; |
2558
|
|
|
|
|
|
|
} |
2559
|
|
|
|
|
|
|
else{ |
2560
|
0
|
|
|
|
|
0
|
$num_copy_num = 0; |
2561
|
|
|
|
|
|
|
} |
2562
|
|
|
|
|
|
|
|
2563
|
0
|
|
|
|
|
0
|
my $cnv_density = sprintf("%e",$score_data->[7]); |
2564
|
|
|
|
|
|
|
|
2565
|
0
|
|
|
|
|
0
|
my $intertranslocation_hash_ref = $score_data->[8]; |
2566
|
0
|
|
|
|
|
0
|
my $translocation_density = $score_data->[9]; |
2567
|
0
|
|
|
|
|
0
|
my %intertranslocation_hash; |
2568
|
|
|
|
|
|
|
my $num_trans_chr; |
2569
|
0
|
0
|
|
|
|
0
|
if(defined($intertranslocation_hash_ref)){ |
2570
|
0
|
|
|
|
|
0
|
%intertranslocation_hash = %{$intertranslocation_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
2571
|
0
|
|
|
|
|
0
|
$num_trans_chr = keys %intertranslocation_hash; |
2572
|
|
|
|
|
|
|
} |
2573
|
|
|
|
|
|
|
else{ |
2574
|
0
|
|
|
|
|
0
|
$num_trans_chr = 0; |
2575
|
|
|
|
|
|
|
} |
2576
|
|
|
|
|
|
|
|
2577
|
0
|
|
|
|
|
0
|
my $breakpoint_insertions_array_ref = $score_data->[10]; |
2578
|
0
|
|
|
|
|
0
|
my @breakpoint_insertions_array; |
2579
|
|
|
|
|
|
|
my $breakpoint_percentage; |
2580
|
0
|
0
|
|
|
|
0
|
if(defined($breakpoint_insertions_array_ref)){ |
2581
|
0
|
|
|
|
|
0
|
@breakpoint_insertions_array = @$breakpoint_insertions_array_ref; |
2582
|
0
|
|
|
|
|
0
|
$breakpoint_percentage = sprintf("%.2f",($breakpoint_insertions_array[0]/$breakpoint_insertions_array[1])*100); |
2583
|
|
|
|
|
|
|
} |
2584
|
|
|
|
|
|
|
|
2585
|
0
|
|
|
|
|
0
|
my $loh_size = $score_data->[11]; |
2586
|
0
|
|
|
|
|
0
|
my $hz_size = $score_data->[12]; |
2587
|
0
|
|
|
|
|
0
|
my $percent_hz_lost; |
2588
|
0
|
0
|
0
|
|
|
0
|
if(defined($loh_size) && defined($hz_size)){ |
2589
|
0
|
|
|
|
|
0
|
$percent_hz_lost = sprintf("%.2f",($loh_size/$hz_size)*100); |
2590
|
|
|
|
|
|
|
} |
2591
|
|
|
|
|
|
|
|
2592
|
0
|
|
|
|
|
0
|
my @score_array = @{$score_data->[13]}; |
|
0
|
|
|
|
|
0
|
|
2593
|
|
|
|
|
|
|
|
2594
|
0
|
|
|
|
|
0
|
my $chr_density = $genome_mutation_density_hash{$chr}; |
2595
|
|
|
|
|
|
|
|
2596
|
0
|
|
|
|
|
0
|
my $print_string; |
2597
|
|
|
|
|
|
|
|
2598
|
0
|
|
|
|
|
0
|
$print_string = "chromosome:\t$chr\n"; |
2599
|
0
|
|
|
|
|
0
|
$print_string .= "start:\t\t$start\n"; |
2600
|
0
|
|
|
|
|
0
|
$print_string .= "end:\t\t$end\n"; |
2601
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2602
|
|
|
|
|
|
|
|
2603
|
0
|
|
|
|
|
0
|
$print_string .= "final_score:\t\t\t$score\n"; |
2604
|
0
|
|
|
|
|
0
|
$print_string .= "genome_localization_score:\t".$score_array[2]*$genome_localization_weight."\t(".$score_array[2].")"."\n"; |
2605
|
0
|
|
|
|
|
0
|
$print_string .= "chromosome_localization_score:\t".$score_array[1]*$chromosome_localization_weight."\t(".$score_array[1].")"."\n"; |
2606
|
0
|
|
|
|
|
0
|
$print_string .= "cnv_score:\t\t\t".$score_array[0]*$cnv_weight."\t(".$score_array[0].")"."\n"; |
2607
|
0
|
|
|
|
|
0
|
$print_string .= "translocation_score:\t\t".$score_array[3]*$translocation_weight."\t(".$score_array[3].")"."\n"; |
2608
|
0
|
|
|
|
|
0
|
$print_string .= "insertion_breakpoint_score:\t".$score_array[4]*$insertion_breakpoint_weight."\t(".$score_array[4].")"."\n"; |
2609
|
0
|
|
|
|
|
0
|
$print_string .= "loh_score:\t\t\t".$score_array[5]*$loh_weight."\t(".$score_array[5].")"."\n"; |
2610
|
0
|
|
|
|
|
0
|
$print_string .= "tp53_score:\t\t\t".$score_array[6]*$tp53_mutated_weight."\t(".$score_array[6].")\n"; |
2611
|
|
|
|
|
|
|
|
2612
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2613
|
|
|
|
|
|
|
|
2614
|
0
|
|
|
|
|
0
|
$print_string .= "mutation_density_of_region:\t$region_density\n"; |
2615
|
0
|
|
|
|
|
0
|
$print_string .= "mutation_density_of_chromosome:\t$chr_density\n"; |
2616
|
0
|
|
|
|
|
0
|
$print_string .= "standard_deviations_from_mean_of_chromosome_mutation_density:\t$chr_z_score\n"; |
2617
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2618
|
|
|
|
|
|
|
|
2619
|
0
|
|
|
|
|
0
|
$print_string .= "density_of_copy_number_switches: $cnv_density\n"; |
2620
|
0
|
|
|
|
|
0
|
$print_string .= "number_of_aberrant_copy_number_states:\t$num_copy_num\n"; |
2621
|
0
|
0
|
|
|
|
0
|
if($num_copy_num>0){ |
2622
|
0
|
|
|
|
|
0
|
$print_string .= "aberrant_copy_number_states:\n"; |
2623
|
9
|
|
|
9
|
|
7085
|
{use sort 'stable'; |
|
9
|
|
|
|
|
30
|
|
|
9
|
|
|
|
|
47
|
|
|
0
|
|
|
|
|
0
|
|
2624
|
0
|
|
|
|
|
0
|
foreach my $key (sort {$cnv_number_hash{$b} <=> $cnv_number_hash{$a} } keys %cnv_number_hash){ |
|
0
|
|
|
|
|
0
|
|
2625
|
0
|
|
|
|
|
0
|
$print_string .= "\t$key:\t$cnv_number_hash{$key}\n"; |
2626
|
|
|
|
|
|
|
} |
2627
|
|
|
|
|
|
|
}#use sort 'stable' |
2628
|
|
|
|
|
|
|
} |
2629
|
|
|
|
|
|
|
|
2630
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2631
|
|
|
|
|
|
|
|
2632
|
0
|
|
|
|
|
0
|
$print_string .= "density_of_translocation_breakpoints: $translocation_density\n"; |
2633
|
0
|
|
|
|
|
0
|
$print_string .= "number_of_translocation_chromosomes:\t$num_trans_chr\n"; |
2634
|
|
|
|
|
|
|
|
2635
|
0
|
0
|
|
|
|
0
|
if($num_trans_chr>0){ |
2636
|
0
|
|
|
|
|
0
|
$print_string .= "translocation_chromosomes:\n"; |
2637
|
9
|
|
|
9
|
|
1488
|
{use sort 'stable'; |
|
9
|
|
|
|
|
14
|
|
|
9
|
|
|
|
|
34
|
|
|
0
|
|
|
|
|
0
|
|
2638
|
0
|
|
|
|
|
0
|
foreach my $key (sort {$intertranslocation_hash{$b} <=> $intertranslocation_hash{$a} } keys %intertranslocation_hash){ |
|
0
|
|
|
|
|
0
|
|
2639
|
0
|
|
|
|
|
0
|
$print_string .= "\t$key:\t$intertranslocation_hash{$key}\n"; |
2640
|
|
|
|
|
|
|
} |
2641
|
|
|
|
|
|
|
}#use sort 'stable' |
2642
|
|
|
|
|
|
|
} |
2643
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2644
|
|
|
|
|
|
|
|
2645
|
0
|
0
|
|
|
|
0
|
if(defined($breakpoint_insertions_array_ref)){ |
2646
|
0
|
|
|
|
|
0
|
$print_string .= "insertion_data:\n"; |
2647
|
0
|
|
|
|
|
0
|
$print_string .= "\tinsertions_found_at_translocation_breakpoints:\t$breakpoint_insertions_array[0]\n"; |
2648
|
0
|
|
|
|
|
0
|
$print_string .= "\ttotal_translocation_breakpoints:\t$breakpoint_insertions_array[1]\n"; |
2649
|
0
|
|
|
|
|
0
|
$print_string .= "\tpercentage:\t$breakpoint_percentage"."%\n"; |
2650
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2651
|
|
|
|
|
|
|
} |
2652
|
|
|
|
|
|
|
|
2653
|
0
|
0
|
|
|
|
0
|
if($loh_size!=-1){ |
2654
|
0
|
|
|
|
|
0
|
$print_string .= "loh_data:\n"; |
2655
|
0
|
|
|
|
|
0
|
$print_string .= "\ttotal_size_of_loh:\t$loh_size\n"; |
2656
|
0
|
|
|
|
|
0
|
$print_string .= "\ttotal_size_of_original_heterozygosity:\t$hz_size\n"; |
2657
|
0
|
|
|
|
|
0
|
$print_string .= "\tpercent_heterozygosity_lost:\t$percent_hz_lost"."%\n"; |
2658
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2659
|
|
|
|
|
|
|
} |
2660
|
|
|
|
|
|
|
|
2661
|
0
|
0
|
0
|
|
|
0
|
if($tp53_mutated && $tp53_mutation_found){ |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2662
|
0
|
|
|
|
|
0
|
$print_string .= "tp53_mutation_present:\t1 (forced and mutations found)\n"; |
2663
|
|
|
|
|
|
|
} |
2664
|
|
|
|
|
|
|
elsif($tp53_mutated){ |
2665
|
0
|
|
|
|
|
0
|
$print_string .= "tp53_mutation_present:\t1 (forced)\n"; |
2666
|
|
|
|
|
|
|
} |
2667
|
|
|
|
|
|
|
elsif($tp53_mutation_found){ |
2668
|
0
|
|
|
|
|
0
|
$print_string .= "tp53_mutation_present:\t1 (mutations found)\n"; |
2669
|
|
|
|
|
|
|
} |
2670
|
|
|
|
|
|
|
else{ |
2671
|
0
|
|
|
|
|
0
|
$print_string .= "tp53_mutation_present:\t0\n"; |
2672
|
|
|
|
|
|
|
} |
2673
|
|
|
|
|
|
|
|
2674
|
|
|
|
|
|
|
|
2675
|
0
|
|
|
|
|
0
|
$print_string .= "---\n"; |
2676
|
0
|
|
|
|
|
0
|
$print_string .= "\n"; |
2677
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE $print_string; |
2678
|
0
|
|
|
|
|
0
|
$print_string = ""; |
2679
|
|
|
|
|
|
|
} |
2680
|
|
|
|
|
|
|
|
2681
|
0
|
|
|
|
|
0
|
print $OUTPUT_FILE "..."; |
2682
|
0
|
|
|
|
|
0
|
close($OUTPUT_FILE); |
2683
|
|
|
|
|
|
|
}#sub analyze_suspect_regions |
2684
|
|
|
|
|
|
|
|
2685
|
|
|
|
|
|
|
|
2686
|
|
|
|
|
|
|
#=head2 Sub-Method: analyze_likely_regions |
2687
|
|
|
|
|
|
|
|
2688
|
|
|
|
|
|
|
### analyze_likely_regions ######################################################################## |
2689
|
|
|
|
|
|
|
# Description: |
2690
|
|
|
|
|
|
|
# Generates an output file that lists the regions that have a mutation density that is less |
2691
|
|
|
|
|
|
|
# than the outlier cut off but greater than 1 - the outlier cut off |
2692
|
|
|
|
|
|
|
# |
2693
|
|
|
|
|
|
|
# Input variables: |
2694
|
|
|
|
|
|
|
# $output_directory: stores path to the output directory |
2695
|
|
|
|
|
|
|
# $likely_regions_array_ref: reference to array storing the chromosome, start, |
2696
|
|
|
|
|
|
|
# and end position of highly mutated regions |
2697
|
|
|
|
|
|
|
# $genome_mutation_density_hash_ref: stores the average mutation density of each |
2698
|
|
|
|
|
|
|
# chromosome |
2699
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref: stores the position of CNV mutations on each |
2700
|
|
|
|
|
|
|
# chromosome |
2701
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: stores the position of translocation events on each |
2702
|
|
|
|
|
|
|
# chromosome |
2703
|
|
|
|
|
|
|
# $bin_size: stores the size of a single bin |
2704
|
|
|
|
|
|
|
# |
2705
|
|
|
|
|
|
|
|
2706
|
|
|
|
|
|
|
#=cut |
2707
|
|
|
|
|
|
|
|
2708
|
|
|
|
|
|
|
sub analyze_likely_regions { |
2709
|
|
|
|
|
|
|
|
2710
|
|
|
|
|
|
|
#parse parameters |
2711
|
1
|
|
|
1
|
0
|
9
|
my $output_directory = shift; |
2712
|
|
|
|
|
|
|
|
2713
|
1
|
|
|
|
|
2
|
my $likely_regions_array_ref = shift; |
2714
|
1
|
|
|
|
|
18
|
my @likely_regions = @$likely_regions_array_ref; |
2715
|
|
|
|
|
|
|
|
2716
|
1
|
|
|
|
|
1
|
my $genome_mutation_density_hash_ref = shift; |
2717
|
1
|
|
|
|
|
3
|
my $genome_cnv_data_hash_ref = shift; |
2718
|
1
|
|
|
|
|
1
|
my $genome_trans_data_hash_ref = shift; |
2719
|
1
|
|
|
|
|
1
|
my $bin_size = shift; |
2720
|
|
|
|
|
|
|
|
2721
|
1
|
|
|
|
|
2
|
my $likely_regions_size = @likely_regions; |
2722
|
|
|
|
|
|
|
|
2723
|
1
|
|
|
|
|
1
|
my $OUTPUT_FILE; |
2724
|
|
|
|
|
|
|
|
2725
|
|
|
|
|
|
|
my @return_vals; |
2726
|
|
|
|
|
|
|
|
2727
|
1
|
|
|
|
|
2
|
my @likely_region_data = (); #stores start, end, chromsome and mutation density for each region |
2728
|
|
|
|
|
|
|
|
2729
|
|
|
|
|
|
|
#check that the likely region array is not malformed, should contain sets of 3 elements |
2730
|
1
|
50
|
|
|
|
4
|
if($likely_regions_size % 3 != 0){ |
2731
|
0
|
|
|
|
|
0
|
die "ERROR: suspect_regions_array has $likely_regions_size entries. Value must be divisible by 3.\n"; |
2732
|
|
|
|
|
|
|
} |
2733
|
|
|
|
|
|
|
|
2734
|
|
|
|
|
|
|
|
2735
|
|
|
|
|
|
|
#create output directory |
2736
|
1
|
50
|
|
|
|
116
|
mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions"); |
2737
|
1
|
50
|
|
|
|
13
|
if(!(-e "$output_directory"."suspect_regions")){ |
2738
|
0
|
|
|
|
|
0
|
die "ERROR: could not create folder $output_directory"."suspect_regions"; |
2739
|
|
|
|
|
|
|
} |
2740
|
|
|
|
|
|
|
|
2741
|
|
|
|
|
|
|
#create output file |
2742
|
1
|
50
|
|
|
|
75
|
open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/likely_regions.log") or die "ERROR: could not create file: $output_directory"."suspect_regions/likely_regions.log\n"; |
2743
|
|
|
|
|
|
|
|
2744
|
|
|
|
|
|
|
#print file header |
2745
|
1
|
|
|
|
|
11
|
print $OUTPUT_FILE "Likely Chromothriptic Regions\n"; |
2746
|
1
|
|
|
|
|
4
|
print $OUTPUT_FILE "High Mutation Density Z-Score:\t$outlier_deviation\n"; |
2747
|
1
|
|
|
|
|
2
|
print $OUTPUT_FILE "Min Mutation Density Z-Score:\t$outlier_deviation-1\n"; |
2748
|
1
|
|
|
|
|
2
|
print $OUTPUT_FILE "---------------------------------------\n"; |
2749
|
1
|
|
|
|
|
1
|
print $OUTPUT_FILE "#chr\tstart\tend\tmutation_density"; |
2750
|
|
|
|
|
|
|
|
2751
|
|
|
|
|
|
|
#for each likely region calculate the mutation density for the region and store it in the likely_region_data array |
2752
|
1
|
|
|
|
|
5
|
for (my $i = 0; $i < $likely_regions_size; $i+=3){ |
2753
|
13
|
|
|
|
|
22
|
my @region_data = (); |
2754
|
13
|
|
|
|
|
34
|
$region_data[0] = $likely_regions[$i]; #chr |
2755
|
13
|
|
|
|
|
21
|
$region_data[1] = $likely_regions[$i+1]; #start |
2756
|
13
|
|
|
|
|
23
|
$region_data[2] = $likely_regions[$i+2]; #end |
2757
|
|
|
|
|
|
|
|
2758
|
13
|
|
|
|
|
45
|
($region_data[3],$region_data[4]) = calculate_region_mutation_density_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size); |
2759
|
|
|
|
|
|
|
|
2760
|
13
|
|
|
|
|
161
|
push @likely_region_data, [@region_data]; |
2761
|
|
|
|
|
|
|
} |
2762
|
|
|
|
|
|
|
|
2763
|
|
|
|
|
|
|
#sort the regions by density, largest to smallest |
2764
|
9
|
|
|
9
|
|
7599
|
{use sort 'stable'; |
|
9
|
|
|
|
|
24
|
|
|
9
|
|
|
|
|
49
|
|
|
1
|
|
|
|
|
4
|
|
2765
|
1
|
|
|
|
|
8
|
@likely_region_data = sort {$b->[3] <=> $a->[3] } @likely_region_data; |
|
34
|
|
|
|
|
51
|
|
2766
|
|
|
|
|
|
|
}#use sort 'stable' |
2767
|
|
|
|
|
|
|
|
2768
|
|
|
|
|
|
|
#print the density for each region to the output file |
2769
|
1
|
|
|
|
|
5
|
foreach my $i (@likely_region_data){ |
2770
|
13
|
|
|
|
|
17
|
my $chr = $i->[0]; #chr |
2771
|
13
|
|
|
|
|
9
|
my $start = $i->[1]; #start |
2772
|
13
|
|
|
|
|
11
|
my $end = $i->[2]; #end |
2773
|
|
|
|
|
|
|
|
2774
|
13
|
|
|
|
|
12
|
my $region_density = $i->[3]; #mutation density |
2775
|
|
|
|
|
|
|
|
2776
|
13
|
|
|
|
|
13
|
print $OUTPUT_FILE "\n"; |
2777
|
13
|
|
|
|
|
45
|
print $OUTPUT_FILE "$chr\t$start\t$end\t$region_density"; |
2778
|
13
|
|
|
|
|
33
|
push (@return_vals,$chr,$start,$end,$region_density); |
2779
|
|
|
|
|
|
|
} |
2780
|
|
|
|
|
|
|
|
2781
|
1
|
|
|
|
|
98
|
close($OUTPUT_FILE); |
2782
|
1
|
|
|
|
|
23
|
return(\@return_vals); |
2783
|
|
|
|
|
|
|
}#sub analyze_likely_regions |
2784
|
|
|
|
|
|
|
|
2785
|
|
|
|
|
|
|
|
2786
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_score |
2787
|
|
|
|
|
|
|
|
2788
|
|
|
|
|
|
|
### calculate_score ############################################################################### |
2789
|
|
|
|
|
|
|
# Description: |
2790
|
|
|
|
|
|
|
# Calculates the chromothripic score for the given region. Calls sub methods to generate the |
2791
|
|
|
|
|
|
|
# score for each hallmark |
2792
|
|
|
|
|
|
|
# |
2793
|
|
|
|
|
|
|
# Input variables: |
2794
|
|
|
|
|
|
|
# $chr: stores the chromosome on which the region |
2795
|
|
|
|
|
|
|
# is found |
2796
|
|
|
|
|
|
|
# $start: stores the start base pair of the region |
2797
|
|
|
|
|
|
|
# $end: stores the end base pair of the region |
2798
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref: stores the position of CNV mutations on |
2799
|
|
|
|
|
|
|
# each chromosome |
2800
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: stores the position of translocation events |
2801
|
|
|
|
|
|
|
# on each chromosome |
2802
|
|
|
|
|
|
|
# $genome_mutation_density_hash_ref: stores the average mutation density of each |
2803
|
|
|
|
|
|
|
# chromosome |
2804
|
|
|
|
|
|
|
# $genome_trans_insertion_breakpoints_hash_ref: stores the position of insertions on each |
2805
|
|
|
|
|
|
|
# chromosome |
2806
|
|
|
|
|
|
|
# $tp53_mutated: stores whether the TP53 gene is mutatated |
2807
|
|
|
|
|
|
|
# or not |
2808
|
|
|
|
|
|
|
# $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on |
2809
|
|
|
|
|
|
|
# each chromosome |
2810
|
|
|
|
|
|
|
# $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on |
2811
|
|
|
|
|
|
|
# each chromosome |
2812
|
|
|
|
|
|
|
# $bin_size: stores the size of a single bin |
2813
|
|
|
|
|
|
|
# |
2814
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
#=cut |
2816
|
|
|
|
|
|
|
|
2817
|
|
|
|
|
|
|
sub calculate_score{ |
2818
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
#parse parameters |
2820
|
0
|
|
|
0
|
0
|
0
|
my $chr = shift; |
2821
|
0
|
|
|
|
|
0
|
my $start = shift; |
2822
|
0
|
|
|
|
|
0
|
my $end = shift; |
2823
|
|
|
|
|
|
|
|
2824
|
0
|
|
|
|
|
0
|
my $genome_cnv_data_hash_ref = shift; |
2825
|
0
|
|
|
|
|
0
|
my $genome_trans_data_hash_ref = shift; |
2826
|
0
|
|
|
|
|
0
|
my $genome_mutation_density_hash_ref = shift; |
2827
|
0
|
|
|
|
|
0
|
my $genome_trans_insertion_breakpoints_hash_ref = shift; |
2828
|
0
|
|
|
|
|
0
|
my $tp53_mutated = shift; |
2829
|
0
|
|
|
|
|
0
|
my $chromosome_cnv_breakpoints_hash_ref = shift; |
2830
|
0
|
|
|
|
|
0
|
my $chromosome_loh_breakpoints_hash_ref = shift; |
2831
|
0
|
|
|
|
|
0
|
my $bin_size = shift; |
2832
|
|
|
|
|
|
|
|
2833
|
|
|
|
|
|
|
#initialize variable to store scores for each hallmark |
2834
|
0
|
|
|
|
|
0
|
my $cnv_score = 0; |
2835
|
0
|
|
|
|
|
0
|
my $mutation_density_score = 0; |
2836
|
0
|
|
|
|
|
0
|
my $genome_localization_score = 0; |
2837
|
0
|
|
|
|
|
0
|
my $translocation_score = 0; |
2838
|
0
|
|
|
|
|
0
|
my $insertion_breakpoint_score = 0; |
2839
|
0
|
|
|
|
|
0
|
my $loh_score = 0; |
2840
|
0
|
|
|
|
|
0
|
my $final_score = 0; |
2841
|
|
|
|
|
|
|
|
2842
|
0
|
|
|
|
|
0
|
my @score_array; #array in which hallmark scores will be returned |
2843
|
|
|
|
|
|
|
|
2844
|
|
|
|
|
|
|
my $chr_mutation_density; #stores the average mutation density of the chromosome where the region is found |
2845
|
0
|
|
|
|
|
0
|
my $chr_z_score; #stores the z_score of the mutation density of the chromosome where the region is found vs |
2846
|
|
|
|
|
|
|
#all the other chromosomes |
2847
|
0
|
|
|
|
|
0
|
my $cnv_number_hash_ref; #stores a hash that contains the number of regions of each abberant copy-number |
2848
|
0
|
|
|
|
|
0
|
my $cnv_density; #stores the density of cnv mutations in the region |
2849
|
0
|
|
|
|
|
0
|
my $translocation_density; #stores the density of translocation mutations in the region |
2850
|
0
|
|
|
|
|
0
|
my $mutation_density; #stores the density of all mutations in the region |
2851
|
0
|
|
|
|
|
0
|
my $intertranslocation_hash_ref; #stores the number of translocations between all other chromosomes and the region |
2852
|
0
|
|
|
|
|
0
|
my $breakpoint_insertions_array_ref; #stores the total number of translocation breakpoints, and the number that have insertions nearby |
2853
|
0
|
|
|
|
|
0
|
my $loh_size = -1; #stores the amount of heterozygosity that was lost in the region |
2854
|
0
|
|
|
|
|
0
|
my $heterozygous_size; #stores the original amount of heterozygosity in the region |
2855
|
|
|
|
|
|
|
|
2856
|
0
|
|
|
|
|
0
|
($cnv_score, $cnv_number_hash_ref, $cnv_density) = calculate_copy_number_scores($chr, $start, $end, $genome_cnv_data_hash_ref, $bin_size); |
2857
|
0
|
|
|
|
|
0
|
($genome_localization_score, $chr_z_score, $chr_mutation_density) = calculate_genome_localization_score($chr, $genome_mutation_density_hash_ref); |
2858
|
0
|
|
|
|
|
0
|
($translocation_score, $intertranslocation_hash_ref, $translocation_density) = calculate_translocation_score($chr, $start, $end, $genome_trans_data_hash_ref, $bin_size); |
2859
|
|
|
|
|
|
|
|
2860
|
0
|
0
|
|
|
|
0
|
if(defined($genome_trans_insertion_breakpoints_hash_ref)){ |
2861
|
0
|
|
|
|
|
0
|
($insertion_breakpoint_score, $breakpoint_insertions_array_ref) = calculate_insertion_breakpoint_score($chr, $start, $end, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size); |
2862
|
|
|
|
|
|
|
} |
2863
|
|
|
|
|
|
|
|
2864
|
0
|
|
|
|
|
0
|
($mutation_density, $mutation_density_score) = calculate_region_mutation_density_score($chr, $start, $end, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size); |
2865
|
|
|
|
|
|
|
|
2866
|
0
|
0
|
|
|
|
0
|
if(defined($chromosome_loh_breakpoints_hash_ref)){ |
2867
|
0
|
|
|
|
|
0
|
($loh_score, $loh_size, $heterozygous_size) = calculate_loh_score($chr, $start, $end, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref); |
2868
|
|
|
|
|
|
|
} |
2869
|
|
|
|
|
|
|
|
2870
|
|
|
|
|
|
|
#calculate overall score for region based on hallmark weights and scores |
2871
|
0
|
|
|
|
|
0
|
$final_score = ($cnv_score*$cnv_weight) + ($mutation_density_score*$chromosome_localization_weight) + ($genome_localization_score*$genome_localization_weight) + ($translocation_score*$translocation_weight) + ($insertion_breakpoint_score*$insertion_breakpoint_weight) + ($tp53_mutated*$tp53_mutated_weight) + ($loh_score*$loh_weight); |
2872
|
|
|
|
|
|
|
|
2873
|
|
|
|
|
|
|
#push the hallmark scores into the score array |
2874
|
0
|
|
|
|
|
0
|
push (@score_array, ($cnv_score, $mutation_density_score, $genome_localization_score, $translocation_score, $insertion_breakpoint_score, $loh_score, $tp53_mutated)); |
2875
|
|
|
|
|
|
|
|
2876
|
|
|
|
|
|
|
#return the scores and other region statistics |
2877
|
0
|
|
|
|
|
0
|
return ($final_score, $chr_z_score, $mutation_density, $cnv_number_hash_ref, $cnv_density, $intertranslocation_hash_ref , $translocation_density, $breakpoint_insertions_array_ref, $loh_size, $heterozygous_size, \@score_array); |
2878
|
|
|
|
|
|
|
}#sub calculate_score |
2879
|
|
|
|
|
|
|
|
2880
|
|
|
|
|
|
|
|
2881
|
|
|
|
|
|
|
|
2882
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_copy_number_score |
2883
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
### calculate_copy_number_score ################################################################## |
2885
|
|
|
|
|
|
|
# Description: |
2886
|
|
|
|
|
|
|
# Calculates the score for the copy-number variation hallmark |
2887
|
|
|
|
|
|
|
# |
2888
|
|
|
|
|
|
|
# Input variables: |
2889
|
|
|
|
|
|
|
# $chr: stores the chromsome where the region is located |
2890
|
|
|
|
|
|
|
# $start: stores the starting location of the region |
2891
|
|
|
|
|
|
|
# $end: stores the end location of the region |
2892
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref: stores the position of CNV mutations on each chromosome |
2893
|
|
|
|
|
|
|
# $bin_size: stores the size of single bin |
2894
|
|
|
|
|
|
|
# |
2895
|
|
|
|
|
|
|
|
2896
|
|
|
|
|
|
|
#=cut |
2897
|
|
|
|
|
|
|
|
2898
|
|
|
|
|
|
|
sub calculate_copy_number_scores { |
2899
|
|
|
|
|
|
|
|
2900
|
|
|
|
|
|
|
#parse parameters |
2901
|
0
|
|
|
0
|
0
|
0
|
my $chr = shift; |
2902
|
0
|
|
|
|
|
0
|
my $start = shift; |
2903
|
0
|
|
|
|
|
0
|
my $end = shift; |
2904
|
|
|
|
|
|
|
|
2905
|
0
|
|
|
|
|
0
|
my $genome_cnv_data_hash_ref = shift; |
2906
|
0
|
|
|
|
|
0
|
my %genome_cnv_data_hash = %$genome_cnv_data_hash_ref; |
2907
|
|
|
|
|
|
|
|
2908
|
0
|
|
|
|
|
0
|
my $bin_size = shift; |
2909
|
|
|
|
|
|
|
|
2910
|
|
|
|
|
|
|
#calculate array index where the data for the first and last bins of the region are located |
2911
|
0
|
|
|
|
|
0
|
my $start_index = $start / ($bin_size); |
2912
|
0
|
|
|
|
|
0
|
my $end_index = $end / ($bin_size); |
2913
|
|
|
|
|
|
|
|
2914
|
0
|
|
|
|
|
0
|
my $cnv_score = 0; #stores final score to return |
2915
|
|
|
|
|
|
|
|
2916
|
0
|
|
|
|
|
0
|
my %cnv_number_hash; #hash |
2917
|
|
|
|
|
|
|
#key: copy number eg 0,1,3,4 |
2918
|
|
|
|
|
|
|
#value: the number of regions with the given copy number |
2919
|
|
|
|
|
|
|
|
2920
|
0
|
|
|
|
|
0
|
my $cnv_switch_count = 0; #number of switches between different copy numbers |
2921
|
0
|
|
|
|
|
0
|
my $cnv_switch_density = 0; #density of cnv events in the region |
2922
|
|
|
|
|
|
|
|
2923
|
0
|
|
|
|
|
0
|
my @chr_data; #stores all the bins for the chromosome where the region is located |
2924
|
|
|
|
|
|
|
my %cnv_hash; #stores the cnv hash from each bin |
2925
|
|
|
|
|
|
|
|
2926
|
0
|
|
|
|
|
0
|
my $mean = 0; #stores the average number of regions of aberrant copy-number |
2927
|
0
|
|
|
|
|
0
|
my $SD = 0; #stores the standard deviation of the number of regions of aberrant copy-number |
2928
|
|
|
|
|
|
|
|
2929
|
0
|
|
|
|
|
0
|
my %cnv_significant; #hash |
2930
|
|
|
|
|
|
|
#key: copy number (but only significant ones are stored) |
2931
|
|
|
|
|
|
|
#value: the number of regions with the given copy number |
2932
|
|
|
|
|
|
|
|
2933
|
0
|
|
|
|
|
0
|
my $significant_count = 0; #stores the number of unique significant copy-numbers |
2934
|
|
|
|
|
|
|
|
2935
|
|
|
|
|
|
|
#check if there is cnv data for the chromosome |
2936
|
0
|
0
|
|
|
|
0
|
if(defined($genome_cnv_data_hash{$chr})){ |
2937
|
|
|
|
|
|
|
|
2938
|
|
|
|
|
|
|
#extract the bin data for the chromosome |
2939
|
0
|
|
|
|
|
0
|
@chr_data = @{$genome_cnv_data_hash{$chr}}; |
|
0
|
|
|
|
|
0
|
|
2940
|
|
|
|
|
|
|
|
2941
|
|
|
|
|
|
|
#collect the data from the bins that contain the region |
2942
|
0
|
|
|
|
|
0
|
for (my $i = $start_index; $i < $end_index+1; $i++){ |
2943
|
0
|
0
|
|
|
|
0
|
if(!defined($chr_data[$i])){ |
2944
|
0
|
|
|
|
|
0
|
next; |
2945
|
|
|
|
|
|
|
} |
2946
|
0
|
|
|
|
|
0
|
%cnv_hash = %{$chr_data[$i]}; |
|
0
|
|
|
|
|
0
|
|
2947
|
|
|
|
|
|
|
|
2948
|
0
|
|
|
|
|
0
|
for my $key (keys %cnv_hash){ |
2949
|
0
|
0
|
|
|
|
0
|
if($key eq 'BPcount'){ |
2950
|
0
|
|
|
|
|
0
|
$cnv_switch_count += $cnv_hash{$key}; |
2951
|
|
|
|
|
|
|
} |
2952
|
|
|
|
|
|
|
else{ |
2953
|
0
|
|
|
|
|
0
|
$cnv_number_hash{$key}+= $cnv_hash{$key}; |
2954
|
|
|
|
|
|
|
} |
2955
|
|
|
|
|
|
|
} |
2956
|
|
|
|
|
|
|
} |
2957
|
|
|
|
|
|
|
|
2958
|
|
|
|
|
|
|
#calculate the breakpoint density of cnv mutations for the region |
2959
|
0
|
|
|
|
|
0
|
$cnv_switch_density = $cnv_switch_count/ ($end-$start); |
2960
|
|
|
|
|
|
|
|
2961
|
|
|
|
|
|
|
#calculate the number of cnv events in the region (half the number of breakpoints) |
2962
|
0
|
|
|
|
|
0
|
for my $key (keys %cnv_number_hash){ |
2963
|
0
|
|
|
|
|
0
|
$cnv_number_hash{$key} = POSIX::ceil($cnv_number_hash{$key}); |
2964
|
0
|
|
|
|
|
0
|
$mean += $cnv_number_hash{$key}; |
2965
|
|
|
|
|
|
|
} |
2966
|
|
|
|
|
|
|
|
2967
|
|
|
|
|
|
|
#if no cnv mutations were found return a score of 0 |
2968
|
0
|
0
|
|
|
|
0
|
if(scalar(keys %cnv_number_hash)==0){ |
2969
|
0
|
|
|
|
|
0
|
$cnv_score = 0; |
2970
|
0
|
|
|
|
|
0
|
return ($cnv_score, \%cnv_number_hash, $cnv_switch_density); |
2971
|
|
|
|
|
|
|
} |
2972
|
|
|
|
|
|
|
|
2973
|
|
|
|
|
|
|
#calculate the mean of the number regions of each copy-number |
2974
|
0
|
|
|
|
|
0
|
$mean = $mean/(scalar(keys %cnv_number_hash)); |
2975
|
|
|
|
|
|
|
|
2976
|
|
|
|
|
|
|
#calculate the standard deviation of the number of regions of each copy-number |
2977
|
0
|
|
|
|
|
0
|
for my $key (keys %cnv_number_hash){ |
2978
|
0
|
|
|
|
|
0
|
$SD += ($cnv_number_hash{$key}-$mean)**2; |
2979
|
|
|
|
|
|
|
} |
2980
|
0
|
|
|
|
|
0
|
$SD = $SD/(scalar(keys %cnv_number_hash)); |
2981
|
0
|
|
|
|
|
0
|
$SD = $SD**0.5; |
2982
|
|
|
|
|
|
|
|
2983
|
|
|
|
|
|
|
#determine which copy-numbers are significant (ie are not low out liers) |
2984
|
0
|
|
|
|
|
0
|
for my $key (keys %cnv_number_hash){ |
2985
|
0
|
0
|
0
|
|
|
0
|
if( |
2986
|
|
|
|
|
|
|
( $SD==0 )|| |
2987
|
|
|
|
|
|
|
( (($cnv_number_hash{$key}-$mean)/$SD) >= -1*$outlier_deviation ) |
2988
|
|
|
|
|
|
|
){ |
2989
|
0
|
|
|
|
|
0
|
$cnv_significant{$key} = $cnv_number_hash{$key}; |
2990
|
0
|
|
|
|
|
0
|
$cnv_score = $cnv_score + $cnv_significant{$key}**2; |
2991
|
|
|
|
|
|
|
} |
2992
|
|
|
|
|
|
|
} |
2993
|
|
|
|
|
|
|
|
2994
|
|
|
|
|
|
|
#score calculation |
2995
|
0
|
|
|
|
|
0
|
$cnv_score = $cnv_score / (scalar(keys %cnv_significant)); |
2996
|
0
|
|
|
|
|
0
|
$cnv_score = log($cnv_score)/log(2); |
2997
|
0
|
|
|
|
|
0
|
$cnv_score += 1; |
2998
|
0
|
|
|
|
|
0
|
$cnv_score = 1 - (1/$cnv_score); |
2999
|
0
|
|
|
|
|
0
|
$cnv_score = $cnv_score/(scalar(keys %cnv_significant)); |
3000
|
|
|
|
|
|
|
#$cnv_score = (1/(scalar(keys %cnv_significant)))*(0.25) + (1-(1/(1+log($cnv_score/(scalar(keys %cnv_significant)))/log(2))))*(0.75); |
3001
|
|
|
|
|
|
|
} |
3002
|
|
|
|
|
|
|
|
3003
|
0
|
|
|
|
|
0
|
return ($cnv_score, \%cnv_number_hash, $cnv_switch_density); |
3004
|
|
|
|
|
|
|
|
3005
|
|
|
|
|
|
|
}#sub calculate_copy_number_scores |
3006
|
|
|
|
|
|
|
|
3007
|
|
|
|
|
|
|
|
3008
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_genome_localization_score |
3009
|
|
|
|
|
|
|
|
3010
|
|
|
|
|
|
|
### calculate_genome_localization_score ########################################################## |
3011
|
|
|
|
|
|
|
# Description: |
3012
|
|
|
|
|
|
|
# Calculates the genome localization hallmark score |
3013
|
|
|
|
|
|
|
# |
3014
|
|
|
|
|
|
|
# Input variables: |
3015
|
|
|
|
|
|
|
# $chr: store the chromosome where the region is located |
3016
|
|
|
|
|
|
|
# $genome_mutation_density_hash_ref: stores the average mutation density of each |
3017
|
|
|
|
|
|
|
# chromosome |
3018
|
|
|
|
|
|
|
# |
3019
|
|
|
|
|
|
|
|
3020
|
|
|
|
|
|
|
#=cut |
3021
|
|
|
|
|
|
|
|
3022
|
|
|
|
|
|
|
sub calculate_genome_localization_score { |
3023
|
|
|
|
|
|
|
|
3024
|
|
|
|
|
|
|
#parse parameters |
3025
|
0
|
|
|
0
|
0
|
0
|
my $chr = shift; |
3026
|
|
|
|
|
|
|
|
3027
|
0
|
|
|
|
|
0
|
my $genome_mutation_density_hash_ref = shift; |
3028
|
0
|
|
|
|
|
0
|
my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
3029
|
|
|
|
|
|
|
|
3030
|
|
|
|
|
|
|
#read mutation density for the chromosome |
3031
|
0
|
|
|
|
|
0
|
my $chr_mutation_density = $genome_mutation_density_hash{$chr}; |
3032
|
|
|
|
|
|
|
|
3033
|
0
|
|
|
|
|
0
|
my $mean_density = 0; #stores the average density of mutations for the chromosomes |
3034
|
|
|
|
|
|
|
|
3035
|
0
|
|
|
|
|
0
|
my $standard_deviation = 0; #stores the standard deviation of the mutation densities of the chromosomes |
3036
|
|
|
|
|
|
|
|
3037
|
0
|
|
|
|
|
0
|
my $z_score = 0; #stores the z-score for the suspect chromosome |
3038
|
0
|
|
|
|
|
0
|
my $p_val = 0; #stores the p-value calculated from the z-score for the suspect chromosome |
3039
|
|
|
|
|
|
|
|
3040
|
|
|
|
|
|
|
#sum mutation densities of all the chromosomes |
3041
|
0
|
|
|
|
|
0
|
for my $key (keys %genome_mutation_density_hash){ |
3042
|
0
|
|
|
|
|
0
|
$mean_density += $genome_mutation_density_hash{$key}/$chromosome_length{$chr}; |
3043
|
|
|
|
|
|
|
} |
3044
|
|
|
|
|
|
|
|
3045
|
|
|
|
|
|
|
#calculate the mean |
3046
|
0
|
|
|
|
|
0
|
$mean_density = $mean_density / 24; |
3047
|
|
|
|
|
|
|
|
3048
|
|
|
|
|
|
|
#calculate the standard deviation |
3049
|
0
|
|
|
|
|
0
|
for my $key (keys %genome_mutation_density_hash){ |
3050
|
0
|
|
|
|
|
0
|
$standard_deviation += ((($genome_mutation_density_hash{$key}) - ($mean_density) )**2); |
3051
|
|
|
|
|
|
|
} |
3052
|
0
|
|
|
|
|
0
|
$standard_deviation = $standard_deviation / 24; |
3053
|
0
|
|
|
|
|
0
|
$standard_deviation = ($standard_deviation)**0.5; |
3054
|
|
|
|
|
|
|
|
3055
|
|
|
|
|
|
|
#check for case where the standard deviation or mean comes back as 0 |
3056
|
0
|
0
|
0
|
|
|
0
|
if($mean_density == 0 || $standard_deviation == 0){ |
3057
|
0
|
|
|
|
|
0
|
return (0, 0, $chr_mutation_density); |
3058
|
|
|
|
|
|
|
} |
3059
|
|
|
|
|
|
|
|
3060
|
|
|
|
|
|
|
#calculate the z-score for suspect chromosome |
3061
|
0
|
|
|
|
|
0
|
$z_score = ($chr_mutation_density - $mean_density) / $standard_deviation; |
3062
|
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
#calculate the p-value from the z-score |
3064
|
0
|
|
|
|
|
0
|
$p_val = Statistics::Distributions::uprob($z_score); |
3065
|
|
|
|
|
|
|
#only consider top tail |
3066
|
0
|
|
|
|
|
0
|
$p_val = 0.5-$p_val; |
3067
|
0
|
|
|
|
|
0
|
$p_val = $p_val/0.5; |
3068
|
|
|
|
|
|
|
|
3069
|
|
|
|
|
|
|
#check for case where z-score comes back as 0 |
3070
|
0
|
0
|
|
|
|
0
|
if($z_score < 0){ |
3071
|
0
|
|
|
|
|
0
|
$p_val = 0; |
3072
|
|
|
|
|
|
|
} |
3073
|
|
|
|
|
|
|
|
3074
|
|
|
|
|
|
|
#p_val is the score for this hallmark |
3075
|
0
|
|
|
|
|
0
|
return ($p_val, $z_score, $chr_mutation_density); |
3076
|
|
|
|
|
|
|
|
3077
|
|
|
|
|
|
|
}#sub calculate_genome_localization_score |
3078
|
|
|
|
|
|
|
|
3079
|
|
|
|
|
|
|
|
3080
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_region_mutation_density_score |
3081
|
|
|
|
|
|
|
|
3082
|
|
|
|
|
|
|
### calculate_region_mutation_density_score ###################################################### |
3083
|
|
|
|
|
|
|
# Description: |
3084
|
|
|
|
|
|
|
# Calculates the chromosome localization hallmark score |
3085
|
|
|
|
|
|
|
# |
3086
|
|
|
|
|
|
|
# Input variables: |
3087
|
|
|
|
|
|
|
# $chr: chromosome where the region is located |
3088
|
|
|
|
|
|
|
# $start: starting location of the region |
3089
|
|
|
|
|
|
|
# $end: end location of the region |
3090
|
|
|
|
|
|
|
# $genome_cnv_data_hash_ref: stores the position of CNV mutations on each |
3091
|
|
|
|
|
|
|
# chromosome |
3092
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: stores the position of translocation events |
3093
|
|
|
|
|
|
|
# on each chromosome |
3094
|
|
|
|
|
|
|
# $genome_mutation_density_hash_ref: stores the average mutation density of each |
3095
|
|
|
|
|
|
|
# chromosome |
3096
|
|
|
|
|
|
|
# $bin_size: stores the size of single bin |
3097
|
|
|
|
|
|
|
# |
3098
|
|
|
|
|
|
|
|
3099
|
|
|
|
|
|
|
#=cut |
3100
|
|
|
|
|
|
|
|
3101
|
|
|
|
|
|
|
sub calculate_region_mutation_density_score { |
3102
|
|
|
|
|
|
|
|
3103
|
|
|
|
|
|
|
#parse parameters |
3104
|
13
|
|
|
13
|
0
|
25
|
my $chr = shift; |
3105
|
13
|
|
|
|
|
22
|
my $start = shift; |
3106
|
13
|
|
|
|
|
21
|
my $end = shift; |
3107
|
|
|
|
|
|
|
|
3108
|
13
|
|
|
|
|
14
|
my $genome_cnv_data_hash_ref = shift; |
3109
|
13
|
|
|
|
|
18
|
my %genome_cnv_data_hash = %{$genome_cnv_data_hash_ref}; |
|
13
|
|
|
|
|
189
|
|
3110
|
|
|
|
|
|
|
|
3111
|
13
|
|
|
|
|
25
|
my $genome_trans_data_hash_ref = shift; |
3112
|
13
|
|
|
|
|
21
|
my %genome_trans_data_hash = %{$genome_trans_data_hash_ref}; |
|
13
|
|
|
|
|
88
|
|
3113
|
|
|
|
|
|
|
|
3114
|
13
|
|
|
|
|
19
|
my $genome_mutation_density_hash_ref = shift; |
3115
|
13
|
|
|
|
|
21
|
my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref}; |
|
13
|
|
|
|
|
161
|
|
3116
|
|
|
|
|
|
|
|
3117
|
13
|
|
|
|
|
24
|
my $bin_size = shift; |
3118
|
|
|
|
|
|
|
|
3119
|
|
|
|
|
|
|
#calculate array index where the data for the first and last bins of the region are located |
3120
|
13
|
|
|
|
|
29
|
my $start_index = $start / $bin_size; |
3121
|
13
|
|
|
|
|
16
|
my $end_index = $end / $bin_size; |
3122
|
|
|
|
|
|
|
|
3123
|
|
|
|
|
|
|
#get the mean mutation density for the suspect chromosome |
3124
|
13
|
|
|
|
|
25
|
my $mean_chr_mutation_density = $genome_mutation_density_hash{$chr}; |
3125
|
|
|
|
|
|
|
|
3126
|
13
|
|
|
|
|
11
|
my $chis_stat; #stores the chi squared statistic value |
3127
|
|
|
|
|
|
|
|
3128
|
13
|
|
|
|
|
14
|
my $mutation_count = 0; #stores the total mutation count in the region |
3129
|
13
|
|
|
|
|
14
|
my $cnv_count = 0; #stores the number of cnv breakpoints in the region |
3130
|
13
|
|
|
|
|
18
|
my $trans_count = 0; #stores the number of translocation breakpoints in the region |
3131
|
13
|
|
|
|
|
13
|
my $mutation_density = 0; #store the mutation density of the region |
3132
|
13
|
|
|
|
|
14
|
my $mutation_density_score; #stores the final score for the hallmark |
3133
|
|
|
|
|
|
|
|
3134
|
|
|
|
|
|
|
my @chr_data; #stores the bin data for the chromosome |
3135
|
0
|
|
|
|
|
0
|
my %cnv_hash; #stores the cnv hash for each bin |
3136
|
0
|
|
|
|
|
0
|
my %trans_hash; #stores the translocation hash for each bin |
3137
|
|
|
|
|
|
|
|
3138
|
|
|
|
|
|
|
#check that there is cnv data for the chromosome |
3139
|
13
|
100
|
|
|
|
59
|
if(defined($genome_cnv_data_hash{$chr})){ |
3140
|
|
|
|
|
|
|
#get the cnv data for the chromosome |
3141
|
12
|
|
|
|
|
10
|
@chr_data = @{$genome_cnv_data_hash{$chr}}; |
|
12
|
|
|
|
|
41384
|
|
3142
|
|
|
|
|
|
|
|
3143
|
|
|
|
|
|
|
#sum the number of cnv breakpoints in the suspect region |
3144
|
12
|
|
|
|
|
115
|
for (my $i = $start_index; $i < $end_index+1; $i++){ |
3145
|
145335
|
100
|
|
|
|
175498
|
if(!defined($chr_data[$i])){ |
3146
|
145301
|
|
|
|
|
181774
|
next; |
3147
|
|
|
|
|
|
|
} |
3148
|
|
|
|
|
|
|
|
3149
|
34
|
|
|
|
|
38
|
%cnv_hash = %{$chr_data[$i]}; |
|
34
|
|
|
|
|
186
|
|
3150
|
34
|
|
|
|
|
89
|
$cnv_count += $cnv_hash{'BPcount'}; |
3151
|
|
|
|
|
|
|
} |
3152
|
|
|
|
|
|
|
} |
3153
|
|
|
|
|
|
|
|
3154
|
13
|
|
|
|
|
20176
|
@chr_data = (); |
3155
|
|
|
|
|
|
|
|
3156
|
|
|
|
|
|
|
#check that there is translocation data for the chromosome |
3157
|
13
|
50
|
|
|
|
155
|
if(defined($genome_trans_data_hash{$chr})){ |
3158
|
|
|
|
|
|
|
#get the translocation data for the chromosome |
3159
|
13
|
|
|
|
|
21
|
@chr_data = @{$genome_trans_data_hash{$chr}}; |
|
13
|
|
|
|
|
25997
|
|
3160
|
|
|
|
|
|
|
|
3161
|
|
|
|
|
|
|
#sum the number of translocation breakpoints in the suspect region |
3162
|
13
|
|
|
|
|
108
|
for (my $i = $start_index; $i < $end_index+1; $i++){ |
3163
|
163406
|
100
|
|
|
|
201253
|
if(!defined($chr_data[$i])){ |
3164
|
163313
|
|
|
|
|
211797
|
next; |
3165
|
|
|
|
|
|
|
} |
3166
|
93
|
|
|
|
|
68
|
%trans_hash = %{$chr_data[$i]}; |
|
93
|
|
|
|
|
404
|
|
3167
|
93
|
|
|
|
|
214
|
$trans_count += $trans_hash{'BPcount'}; |
3168
|
|
|
|
|
|
|
} |
3169
|
|
|
|
|
|
|
} |
3170
|
|
|
|
|
|
|
|
3171
|
|
|
|
|
|
|
#divide the count by 2 to get the number of translocation events |
3172
|
13
|
|
|
|
|
116
|
$trans_count = POSIX::ceil($trans_count/2); |
3173
|
|
|
|
|
|
|
|
3174
|
|
|
|
|
|
|
#calculate the total number events in the region |
3175
|
13
|
|
|
|
|
37
|
$mutation_count = $cnv_count + $trans_count; |
3176
|
|
|
|
|
|
|
|
3177
|
|
|
|
|
|
|
#calculate the mutation density for the region |
3178
|
13
|
|
|
|
|
34
|
$mutation_density = $mutation_count / ($end-$start); |
3179
|
|
|
|
|
|
|
|
3180
|
|
|
|
|
|
|
#calculate the chi squared statistic |
3181
|
13
|
|
|
|
|
101
|
$chis_stat = abs(((log($mutation_density)-log($mean_chr_mutation_density))**2)/(log($mean_chr_mutation_density))); |
3182
|
|
|
|
|
|
|
|
3183
|
|
|
|
|
|
|
#generate a p-value using the chi squared test |
3184
|
13
|
|
|
|
|
97
|
$mutation_density_score = 1-(Statistics::Distributions::chisqrprob(1,$chis_stat)); |
3185
|
|
|
|
|
|
|
|
3186
|
13
|
|
|
|
|
17643
|
return ($mutation_density, $mutation_density_score); |
3187
|
|
|
|
|
|
|
}#calculate_region_mutation_density_score |
3188
|
|
|
|
|
|
|
|
3189
|
|
|
|
|
|
|
|
3190
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_translocation_score |
3191
|
|
|
|
|
|
|
|
3192
|
|
|
|
|
|
|
### calculate_translocation_score ################################################################# |
3193
|
|
|
|
|
|
|
# Description: |
3194
|
|
|
|
|
|
|
# Calculates the translocation hallmark score |
3195
|
|
|
|
|
|
|
# |
3196
|
|
|
|
|
|
|
# Input variables: |
3197
|
|
|
|
|
|
|
# $chr: chromosome where the region is located |
3198
|
|
|
|
|
|
|
# $start: starting location of the region |
3199
|
|
|
|
|
|
|
# $end: end location of the region |
3200
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: stores the position of translocation events on each |
3201
|
|
|
|
|
|
|
# chromosome |
3202
|
|
|
|
|
|
|
# $bin_size: stores the size of single bin |
3203
|
|
|
|
|
|
|
# |
3204
|
|
|
|
|
|
|
|
3205
|
|
|
|
|
|
|
#=cut |
3206
|
|
|
|
|
|
|
|
3207
|
|
|
|
|
|
|
sub calculate_translocation_score { |
3208
|
|
|
|
|
|
|
|
3209
|
|
|
|
|
|
|
#parse parameters |
3210
|
0
|
|
|
0
|
0
|
0
|
my $chr = shift; |
3211
|
0
|
|
|
|
|
0
|
my $start = shift; |
3212
|
0
|
|
|
|
|
0
|
my $end = shift; |
3213
|
|
|
|
|
|
|
|
3214
|
0
|
|
|
|
|
0
|
my $genome_trans_data_hash_ref = shift; |
3215
|
0
|
|
|
|
|
0
|
my %genome_trans_data_hash = %{$genome_trans_data_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
3216
|
|
|
|
|
|
|
|
3217
|
0
|
|
|
|
|
0
|
my $bin_size = shift; |
3218
|
|
|
|
|
|
|
|
3219
|
|
|
|
|
|
|
#calculate array index where the data for the first and last bins of the region are located |
3220
|
0
|
|
|
|
|
0
|
my $start_index = $start / ($bin_size); |
3221
|
0
|
|
|
|
|
0
|
my $end_index = $end / ($bin_size); |
3222
|
|
|
|
|
|
|
|
3223
|
0
|
|
|
|
|
0
|
my $translocation_density = 0; #stores the density of translocation events in the region |
3224
|
|
|
|
|
|
|
|
3225
|
0
|
|
|
|
|
0
|
my @chr_data; #stores the bin data for the chromosome |
3226
|
|
|
|
|
|
|
my %trans_breakpoints; #hash |
3227
|
|
|
|
|
|
|
#key: chromosome eg 1,2,X,Y |
3228
|
|
|
|
|
|
|
#value: an array storing the position of the translocation breakpoints |
3229
|
|
|
|
|
|
|
|
3230
|
0
|
|
|
|
|
0
|
my %trans_breakpoint_spreads; #stores the average distance between translocation breakpoints |
3231
|
0
|
|
|
|
|
0
|
my @significant_chrs; #stores a list of chromosomes that have a significant number of |
3232
|
|
|
|
|
|
|
#translocation to or from the region |
3233
|
0
|
|
|
|
|
0
|
my @diffs; #stores the distance between adjacent translocation breakpoints on |
3234
|
|
|
|
|
|
|
#one chromosome |
3235
|
0
|
|
|
|
|
0
|
my $diff_sum; #stores the sum of the distances |
3236
|
0
|
|
|
|
|
0
|
my $diff_count; #store the number regions between translocation breakpoints |
3237
|
|
|
|
|
|
|
|
3238
|
0
|
|
|
|
|
0
|
my %trans_number_hash; #hash |
3239
|
|
|
|
|
|
|
#key: chromosome eg 1,2,X,Y |
3240
|
|
|
|
|
|
|
#value: the number of events between the chromosome and the region |
3241
|
|
|
|
|
|
|
|
3242
|
0
|
|
|
|
|
0
|
my $mean = 0; #stores the average number of translocations between each chromosome |
3243
|
|
|
|
|
|
|
#and the region |
3244
|
0
|
|
|
|
|
0
|
my $SD = 0; #stores the standard deviation of the above value |
3245
|
|
|
|
|
|
|
|
3246
|
0
|
|
|
|
|
0
|
my $weighted_sum = 0.00; #component of the score calculation |
3247
|
|
|
|
|
|
|
|
3248
|
0
|
|
|
|
|
0
|
my $size = 0.00; #intermediate variable used to collect sums |
3249
|
0
|
|
|
|
|
0
|
my $count = 0; #intermeidate variable |
3250
|
|
|
|
|
|
|
|
3251
|
0
|
|
|
|
|
0
|
my $translocation_score = 0; #final hallmark score |
3252
|
|
|
|
|
|
|
|
3253
|
0
|
|
|
|
|
0
|
my $spread_factor = 0; |
3254
|
|
|
|
|
|
|
|
3255
|
0
|
|
|
|
|
0
|
my $translocation_count = 0; #stores the total number of translocations from significant chromosomes |
3256
|
|
|
|
|
|
|
|
3257
|
|
|
|
|
|
|
#check that there is translocation data for the chromosome |
3258
|
0
|
0
|
|
|
|
0
|
if(defined($genome_trans_data_hash{$chr})){ |
3259
|
|
|
|
|
|
|
#get the translocation data for the chromosome |
3260
|
0
|
|
|
|
|
0
|
@chr_data = @{$genome_trans_data_hash{$chr}}; |
|
0
|
|
|
|
|
0
|
|
3261
|
|
|
|
|
|
|
|
3262
|
|
|
|
|
|
|
#for each bin sum the number of translocation events |
3263
|
|
|
|
|
|
|
#and record the position of breakpoints in the trans_breakpoints hash |
3264
|
0
|
|
|
|
|
0
|
for (my $i = $start_index; $i < $end_index+1; $i++){ |
3265
|
0
|
0
|
|
|
|
0
|
if(!defined($chr_data[$i])){ |
3266
|
0
|
|
|
|
|
0
|
next; |
3267
|
|
|
|
|
|
|
} |
3268
|
0
|
|
|
|
|
0
|
my %trans_hash = %{$chr_data[$i]}; |
|
0
|
|
|
|
|
0
|
|
3269
|
0
|
|
|
|
|
0
|
my %trans_hash_in; |
3270
|
|
|
|
|
|
|
my %trans_hash_out; |
3271
|
|
|
|
|
|
|
|
3272
|
|
|
|
|
|
|
#analyze the breakpoints from translocation into the region |
3273
|
0
|
0
|
|
|
|
0
|
if(defined($trans_hash{'in'})){ |
3274
|
0
|
|
|
|
|
0
|
%trans_hash_in = %{$trans_hash{'in'}}; |
|
0
|
|
|
|
|
0
|
|
3275
|
|
|
|
|
|
|
|
3276
|
0
|
|
|
|
|
0
|
for my $key (keys %trans_hash_in){ |
3277
|
0
|
|
|
|
|
0
|
$size = @{$trans_hash_in{$key}}; |
|
0
|
|
|
|
|
0
|
|
3278
|
|
|
|
|
|
|
|
3279
|
|
|
|
|
|
|
#calculate the number of translocation events |
3280
|
0
|
|
|
|
|
0
|
$size = $size/2; |
3281
|
|
|
|
|
|
|
|
3282
|
|
|
|
|
|
|
#add the count to the appropriate hash |
3283
|
0
|
|
|
|
|
0
|
$trans_number_hash{$key} += $size; |
3284
|
|
|
|
|
|
|
|
3285
|
|
|
|
|
|
|
#add the breakpoints to the trans_breakpoints hash |
3286
|
0
|
|
|
|
|
0
|
push(@{$trans_breakpoints{$key}},(@{$trans_hash_in{$key}})); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
3287
|
|
|
|
|
|
|
} |
3288
|
|
|
|
|
|
|
} |
3289
|
|
|
|
|
|
|
|
3290
|
|
|
|
|
|
|
#analyze the breakpoints from translocation out of the region |
3291
|
0
|
0
|
|
|
|
0
|
if(defined($trans_hash{'out'})){ |
3292
|
0
|
|
|
|
|
0
|
%trans_hash_out = %{$trans_hash{'out'}}; |
|
0
|
|
|
|
|
0
|
|
3293
|
|
|
|
|
|
|
|
3294
|
0
|
|
|
|
|
0
|
for my $key (keys %trans_hash_out){ |
3295
|
0
|
|
|
|
|
0
|
$size = @{$trans_hash_out{$key}}; |
|
0
|
|
|
|
|
0
|
|
3296
|
0
|
|
|
|
|
0
|
$count = $size; |
3297
|
0
|
0
|
|
|
|
0
|
if($key eq $chr){ |
3298
|
0
|
|
|
|
|
0
|
foreach my $val (@{$trans_hash_out{$key}}){ |
|
0
|
|
|
|
|
0
|
|
3299
|
0
|
0
|
0
|
|
|
0
|
if($val > $start && $val < $end){ |
3300
|
0
|
|
|
|
|
0
|
$count--; |
3301
|
|
|
|
|
|
|
} |
3302
|
|
|
|
|
|
|
} |
3303
|
|
|
|
|
|
|
} |
3304
|
|
|
|
|
|
|
#calculate the number of translocation events |
3305
|
0
|
|
|
|
|
0
|
$count = $count/2; |
3306
|
|
|
|
|
|
|
|
3307
|
|
|
|
|
|
|
#add the count to the appropriate hash |
3308
|
0
|
|
|
|
|
0
|
$trans_number_hash{$key} += $count; |
3309
|
|
|
|
|
|
|
|
3310
|
|
|
|
|
|
|
#add the breakpoints to the trans_breakpoints hash |
3311
|
0
|
|
|
|
|
0
|
push(@{$trans_breakpoints{$key}},(@{$trans_hash_out{$key}})); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
3312
|
|
|
|
|
|
|
} |
3313
|
|
|
|
|
|
|
} |
3314
|
|
|
|
|
|
|
} |
3315
|
|
|
|
|
|
|
|
3316
|
0
|
|
|
|
|
0
|
$count = 0; |
3317
|
|
|
|
|
|
|
|
3318
|
|
|
|
|
|
|
#check that some translocation events were found else return a score of 0 |
3319
|
0
|
0
|
|
|
|
0
|
if (keys(%trans_number_hash) == 0){ |
3320
|
0
|
|
|
|
|
0
|
$translocation_score = 0; |
3321
|
0
|
|
|
|
|
0
|
$translocation_density = 0; |
3322
|
0
|
|
|
|
|
0
|
return ($translocation_score, \%trans_number_hash, $translocation_density); |
3323
|
|
|
|
|
|
|
} |
3324
|
|
|
|
|
|
|
|
3325
|
0
|
|
|
|
|
0
|
for my $key (keys %trans_number_hash){ |
3326
|
|
|
|
|
|
|
#round the event count up since if only one breakpoint was present at the end |
3327
|
|
|
|
|
|
|
#of the region we still count that as a whole translocation event |
3328
|
0
|
|
|
|
|
0
|
$trans_number_hash{$key} = POSIX::ceil($trans_number_hash{$key}); |
3329
|
|
|
|
|
|
|
|
3330
|
|
|
|
|
|
|
#sum the number of translocation events in the region |
3331
|
0
|
|
|
|
|
0
|
$count += $trans_number_hash{$key}; |
3332
|
|
|
|
|
|
|
} |
3333
|
|
|
|
|
|
|
|
3334
|
|
|
|
|
|
|
#calculate the translocation density for the region |
3335
|
0
|
|
|
|
|
0
|
$translocation_density = $count / ($end-$start); |
3336
|
|
|
|
|
|
|
|
3337
|
|
|
|
|
|
|
#calculate the mean and standard deviation of the number of translocation between the region |
3338
|
|
|
|
|
|
|
#and each chromosome |
3339
|
0
|
|
|
|
|
0
|
($SD, $mean) = standard_deviation_and_mean(\%trans_number_hash,0); |
3340
|
|
|
|
|
|
|
|
3341
|
|
|
|
|
|
|
#identify chromosomes that have a high number of translocations to or from the region and |
3342
|
|
|
|
|
|
|
#add them to the significant chromosome list |
3343
|
0
|
|
|
|
|
0
|
for my $key (keys %trans_number_hash){ |
3344
|
0
|
0
|
0
|
|
|
0
|
if( |
3345
|
|
|
|
|
|
|
( $SD==0 )|| |
3346
|
|
|
|
|
|
|
#( (($trans_number_hash{$key}-$mean)/$SD)>-2*$outlier_deviation) |
3347
|
|
|
|
|
|
|
( (($trans_number_hash{$key}-$mean)/$SD)>-1*$outlier_deviation) |
3348
|
|
|
|
|
|
|
){ |
3349
|
0
|
|
|
|
|
0
|
push (@significant_chrs, $key); |
3350
|
|
|
|
|
|
|
} |
3351
|
|
|
|
|
|
|
} |
3352
|
|
|
|
|
|
|
|
3353
|
|
|
|
|
|
|
#calculate the total number of translocations from significant chromosomes |
3354
|
0
|
|
|
|
|
0
|
foreach my $key (@significant_chrs){ |
3355
|
0
|
|
|
|
|
0
|
$translocation_count += $trans_number_hash{$key}; |
3356
|
|
|
|
|
|
|
} |
3357
|
|
|
|
|
|
|
|
3358
|
|
|
|
|
|
|
#for each significant chromosome calculate the spread between translocation events |
3359
|
0
|
|
|
|
|
0
|
foreach my $key (@significant_chrs){ |
3360
|
|
|
|
|
|
|
#sort the breakpoints |
3361
|
9
|
|
|
9
|
|
22217
|
{use sort 'stable'; |
|
9
|
|
|
|
|
21
|
|
|
9
|
|
|
|
|
51
|
|
|
0
|
|
|
|
|
0
|
|
3362
|
0
|
|
|
|
|
0
|
@{$trans_breakpoints{$key}} = sort {$a <=> $b} @{$trans_breakpoints{$key}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
3363
|
|
|
|
|
|
|
}#use sort 'stable' |
3364
|
|
|
|
|
|
|
|
3365
|
0
|
|
|
|
|
0
|
$size = @{$trans_breakpoints{$key}}; |
|
0
|
|
|
|
|
0
|
|
3366
|
0
|
|
|
|
|
0
|
@diffs = (); |
3367
|
0
|
|
|
|
|
0
|
$diff_sum = 0; |
3368
|
0
|
|
|
|
|
0
|
$diff_count = 0; |
3369
|
|
|
|
|
|
|
|
3370
|
|
|
|
|
|
|
#calculate and store the distance between adjacent breakpoints |
3371
|
0
|
|
|
|
|
0
|
for (my $i = 1; $i<$size; $i++){ |
3372
|
0
|
|
|
|
|
0
|
push (@diffs, @{$trans_breakpoints{$key}}[$i]-@{$trans_breakpoints{$key}}[$i-1]); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
3373
|
|
|
|
|
|
|
} |
3374
|
|
|
|
|
|
|
|
3375
|
|
|
|
|
|
|
#check that more that one distance was calculated |
3376
|
0
|
0
|
|
|
|
0
|
if($size==1){ |
3377
|
0
|
|
|
|
|
0
|
$trans_breakpoint_spreads{$key} = 0; |
3378
|
0
|
|
|
|
|
0
|
$diff_count = 1; |
3379
|
|
|
|
|
|
|
} |
3380
|
|
|
|
|
|
|
else{ #calculate the standard deviation and mean for the distance between breakpoints |
3381
|
0
|
|
|
|
|
0
|
($SD,$mean) = standard_deviation_and_mean(\@diffs,1); |
3382
|
|
|
|
|
|
|
|
3383
|
|
|
|
|
|
|
#sum the distances that are not high outliers, indicating distance between 2 translocation |
3384
|
|
|
|
|
|
|
#events and not distance between breakpoints of the same event |
3385
|
0
|
|
|
|
|
0
|
foreach my $val (@diffs){ |
3386
|
0
|
0
|
0
|
|
|
0
|
if( |
3387
|
|
|
|
|
|
|
( $SD==0 )|| |
3388
|
|
|
|
|
|
|
( (($val-$mean)/$SD)<$outlier_deviation ) |
3389
|
|
|
|
|
|
|
){ |
3390
|
0
|
|
|
|
|
0
|
$diff_sum += $val; |
3391
|
0
|
|
|
|
|
0
|
$diff_count++; |
3392
|
|
|
|
|
|
|
} |
3393
|
|
|
|
|
|
|
} |
3394
|
|
|
|
|
|
|
} |
3395
|
|
|
|
|
|
|
|
3396
|
|
|
|
|
|
|
#calculate the average spread of translocation breakpoints |
3397
|
0
|
|
|
|
|
0
|
$trans_breakpoint_spreads{$key} = $diff_sum / $diff_count; |
3398
|
|
|
|
|
|
|
|
3399
|
|
|
|
|
|
|
#calculate the spread factor for the chromosome |
3400
|
|
|
|
|
|
|
#my $spread_factor = (log($trans_breakpoint_spreads{$key}+1)/log(10))/((log($expected_mutation_density)/log(10))*-1); |
3401
|
0
|
|
|
|
|
0
|
$spread_factor = (log($trans_breakpoint_spreads{$key}+1)/log(10))/((log($expected_mutation_density)/log(10))*-1); |
3402
|
0
|
0
|
|
|
|
0
|
if($spread_factor==0){ |
3403
|
0
|
|
|
|
|
0
|
$spread_factor = 1; |
3404
|
|
|
|
|
|
|
} |
3405
|
|
|
|
|
|
|
|
3406
|
|
|
|
|
|
|
|
3407
|
|
|
|
|
|
|
#increase the weighted sum based on the number of translocation events and their spread multiplied by the proportion of translocations |
3408
|
|
|
|
|
|
|
#from this specific chromosome relative to the total number of translocation events |
3409
|
0
|
|
|
|
|
0
|
$weighted_sum += ($trans_number_hash{$key}/$spread_factor)*($trans_number_hash{$key}/$translocation_count); |
3410
|
|
|
|
|
|
|
}#foreach my $key (@significant_chrs) |
3411
|
|
|
|
|
|
|
|
3412
|
0
|
|
|
|
|
0
|
my $t2 = (1-(1/(log(1+$weighted_sum)/log(2)))); |
3413
|
|
|
|
|
|
|
#final hallmark score calculation |
3414
|
0
|
|
|
|
|
0
|
$size = @significant_chrs; |
3415
|
|
|
|
|
|
|
|
3416
|
|
|
|
|
|
|
#calculate second term of score |
3417
|
0
|
|
|
|
|
0
|
$translocation_score = (1-(1/(log(1+$weighted_sum)/log(2)))); |
3418
|
|
|
|
|
|
|
|
3419
|
|
|
|
|
|
|
#calculate first term of score |
3420
|
0
|
0
|
0
|
|
|
0
|
if($size<$translocation_cut_off_count && $size>2){ |
3421
|
0
|
|
|
|
|
0
|
$translocation_score = (1-(0.10*($size-2)))*$translocation_score; |
3422
|
|
|
|
|
|
|
} |
3423
|
0
|
0
|
|
|
|
0
|
if($size>=$translocation_cut_off_count){ |
3424
|
0
|
|
|
|
|
0
|
$translocation_score = 0; |
3425
|
|
|
|
|
|
|
} |
3426
|
|
|
|
|
|
|
|
3427
|
0
|
0
|
|
|
|
0
|
if($translocation_score>=1){ |
3428
|
0
|
|
|
|
|
0
|
print "score: ".$translocation_score."\n"; |
3429
|
0
|
|
|
|
|
0
|
print "ws: ".$weighted_sum."\n"; |
3430
|
0
|
|
|
|
|
0
|
print "sf: ".$spread_factor."\n"; |
3431
|
0
|
|
|
|
|
0
|
print "term 1: "; |
3432
|
0
|
|
|
|
|
0
|
print (1/(1+(log($size)/log(4)))); |
3433
|
0
|
|
|
|
|
0
|
print "\n"; |
3434
|
0
|
|
|
|
|
0
|
print "term 2: "; |
3435
|
0
|
|
|
|
|
0
|
print (1-(1/(log($weighted_sum)/log(2)))); |
3436
|
0
|
|
|
|
|
0
|
print "\n"; |
3437
|
|
|
|
|
|
|
} |
3438
|
|
|
|
|
|
|
|
3439
|
|
|
|
|
|
|
}#if(defined($genome_trans_data_hash{$chr})) |
3440
|
|
|
|
|
|
|
|
3441
|
0
|
|
|
|
|
0
|
return ($translocation_score, \%trans_number_hash, $translocation_density); |
3442
|
|
|
|
|
|
|
|
3443
|
|
|
|
|
|
|
}#sub calculate_translocation_score |
3444
|
|
|
|
|
|
|
|
3445
|
|
|
|
|
|
|
|
3446
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_insertion_breakpoint_score |
3447
|
|
|
|
|
|
|
|
3448
|
|
|
|
|
|
|
### calculate_insertion_breakpoint_score ########################################################## |
3449
|
|
|
|
|
|
|
# Description: |
3450
|
|
|
|
|
|
|
# Calculates the insertions at translocation breakpoints hallmark score |
3451
|
|
|
|
|
|
|
# |
3452
|
|
|
|
|
|
|
# Input variables: |
3453
|
|
|
|
|
|
|
# $chr: chromosome where the region is located |
3454
|
|
|
|
|
|
|
# $start: starting location of the region |
3455
|
|
|
|
|
|
|
# $end: end location of the region |
3456
|
|
|
|
|
|
|
# $genome_trans_data_hash_ref: stores the position of translocation events |
3457
|
|
|
|
|
|
|
# on each chromosome |
3458
|
|
|
|
|
|
|
# $genome_trans_insertion_breakpoints_hash_ref: stores the position of breakpoints with |
3459
|
|
|
|
|
|
|
# insertions nearby |
3460
|
|
|
|
|
|
|
# $bin_size: stores the size of single bin |
3461
|
|
|
|
|
|
|
# |
3462
|
|
|
|
|
|
|
|
3463
|
|
|
|
|
|
|
#=cut |
3464
|
|
|
|
|
|
|
|
3465
|
|
|
|
|
|
|
sub calculate_insertion_breakpoint_score { |
3466
|
|
|
|
|
|
|
|
3467
|
|
|
|
|
|
|
#parse parameters |
3468
|
0
|
|
|
0
|
0
|
0
|
my $chr = shift; |
3469
|
0
|
|
|
|
|
0
|
my $start = shift; |
3470
|
0
|
|
|
|
|
0
|
my $end = shift; |
3471
|
|
|
|
|
|
|
|
3472
|
0
|
|
|
|
|
0
|
my $genome_trans_data_hash_ref = shift; |
3473
|
0
|
|
|
|
|
0
|
my %genome_trans_data_hash = %{$genome_trans_data_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
3474
|
|
|
|
|
|
|
|
3475
|
0
|
|
|
|
|
0
|
my $genome_trans_insertion_breakpoints_hash_ref = shift; |
3476
|
0
|
|
|
|
|
0
|
my %genome_trans_insertion_breakpoints_hash = %{$genome_trans_insertion_breakpoints_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
3477
|
|
|
|
|
|
|
|
3478
|
0
|
|
|
|
|
0
|
my $bin_size = shift; |
3479
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
#calculate array index where the data for the first and last bins of the region are located |
3481
|
0
|
|
|
|
|
0
|
my $start_index = $start / ($bin_size); |
3482
|
0
|
|
|
|
|
0
|
my $end_index = $end / ($bin_size); |
3483
|
|
|
|
|
|
|
|
3484
|
0
|
|
|
|
|
0
|
my $total_breakpoints = 0; #total number of breakpoints in the region |
3485
|
0
|
|
|
|
|
0
|
my $inserted_breakpoints = 0; #total number of breakpoints with nearby insertions in the region |
3486
|
|
|
|
|
|
|
|
3487
|
0
|
|
|
|
|
0
|
my $insertion_breakpoint_score = 0; |
3488
|
|
|
|
|
|
|
|
3489
|
0
|
|
|
|
|
0
|
my @chr_data; #stores the bin data for the chromosome |
3490
|
|
|
|
|
|
|
my %trans_hash; #stores translocation hash for each bin |
3491
|
|
|
|
|
|
|
|
3492
|
0
|
|
|
|
|
0
|
my @inserted_breakpoint_list; #stores the breakpoints that have insertions nearby |
3493
|
0
|
|
|
|
|
0
|
my @breakpoint_data; #stores $total_breakpoints and $inserted_breakpoints for return |
3494
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
#check if there is translocation data for the region |
3496
|
0
|
0
|
|
|
|
0
|
if(defined($genome_trans_data_hash{$chr})){ |
3497
|
|
|
|
|
|
|
|
3498
|
|
|
|
|
|
|
#get the translocation data |
3499
|
0
|
|
|
|
|
0
|
@chr_data = @{$genome_trans_data_hash{$chr}}; |
|
0
|
|
|
|
|
0
|
|
3500
|
|
|
|
|
|
|
|
3501
|
|
|
|
|
|
|
#for each bin in the region sum the number of breakpoints |
3502
|
0
|
|
|
|
|
0
|
for (my $i = $start_index; $i < $end_index+1; $i++){ |
3503
|
0
|
0
|
|
|
|
0
|
if(!defined($chr_data[$i])){ |
3504
|
0
|
|
|
|
|
0
|
next; |
3505
|
|
|
|
|
|
|
} |
3506
|
0
|
|
|
|
|
0
|
%trans_hash = %{$chr_data[$i]}; |
|
0
|
|
|
|
|
0
|
|
3507
|
0
|
|
|
|
|
0
|
$total_breakpoints += $trans_hash{'BPcount'}; |
3508
|
|
|
|
|
|
|
} |
3509
|
|
|
|
|
|
|
|
3510
|
|
|
|
|
|
|
#get the list of breakpoints with insertions nearby on the chromosome |
3511
|
0
|
|
|
|
|
0
|
@inserted_breakpoint_list = @{$genome_trans_insertion_breakpoints_hash{$chr}}; |
|
0
|
|
|
|
|
0
|
|
3512
|
|
|
|
|
|
|
|
3513
|
|
|
|
|
|
|
#sort the above list |
3514
|
9
|
|
|
9
|
|
7784
|
{use sort 'stable'; |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
39
|
|
|
0
|
|
|
|
|
0
|
|
3515
|
0
|
|
|
|
|
0
|
@inserted_breakpoint_list = sort {$a <=> $b} @inserted_breakpoint_list; |
|
0
|
|
|
|
|
0
|
|
3516
|
|
|
|
|
|
|
}#use sort 'stable' |
3517
|
|
|
|
|
|
|
|
3518
|
|
|
|
|
|
|
#calculate how many of the breakpoints with insertions are in the region |
3519
|
0
|
|
|
|
|
0
|
foreach my $breakpoint (@inserted_breakpoint_list){ |
3520
|
0
|
0
|
|
|
|
0
|
if($breakpoint > $end){ |
3521
|
0
|
|
|
|
|
0
|
last; |
3522
|
|
|
|
|
|
|
} |
3523
|
0
|
0
|
|
|
|
0
|
if($breakpoint > $start){ |
3524
|
0
|
|
|
|
|
0
|
$inserted_breakpoints++; |
3525
|
|
|
|
|
|
|
} |
3526
|
|
|
|
|
|
|
} |
3527
|
|
|
|
|
|
|
|
3528
|
|
|
|
|
|
|
#calculate the hallmark score |
3529
|
0
|
0
|
|
|
|
0
|
if ($total_breakpoints > 0) { |
3530
|
0
|
|
|
|
|
0
|
$insertion_breakpoint_score = $inserted_breakpoints/$total_breakpoints; |
3531
|
|
|
|
|
|
|
} |
3532
|
|
|
|
|
|
|
|
3533
|
0
|
0
|
|
|
|
0
|
if($insertion_breakpoint_score > 1){ |
3534
|
0
|
|
|
|
|
0
|
die "ERROR: found a insertion_breakpoint_score greater than 1\n"; |
3535
|
|
|
|
|
|
|
} |
3536
|
|
|
|
|
|
|
|
3537
|
|
|
|
|
|
|
} |
3538
|
|
|
|
|
|
|
|
3539
|
0
|
|
|
|
|
0
|
push (@breakpoint_data, $inserted_breakpoints); |
3540
|
0
|
|
|
|
|
0
|
push (@breakpoint_data, $total_breakpoints); |
3541
|
|
|
|
|
|
|
|
3542
|
0
|
|
|
|
|
0
|
return ($insertion_breakpoint_score, \@breakpoint_data); |
3543
|
|
|
|
|
|
|
} |
3544
|
|
|
|
|
|
|
|
3545
|
|
|
|
|
|
|
|
3546
|
|
|
|
|
|
|
#=head2 Sub-Method: calculate_loh_score |
3547
|
|
|
|
|
|
|
|
3548
|
|
|
|
|
|
|
### calculate_loh_score ########################################################################### |
3549
|
|
|
|
|
|
|
# Description: |
3550
|
|
|
|
|
|
|
# Calculates the loss of heterozgozity hallmark score |
3551
|
|
|
|
|
|
|
# |
3552
|
|
|
|
|
|
|
# Input variables: |
3553
|
|
|
|
|
|
|
# $chr: chromosome where the region is located |
3554
|
|
|
|
|
|
|
# $start: starting location of the region |
3555
|
|
|
|
|
|
|
# $end: end location of the region |
3556
|
|
|
|
|
|
|
# $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on each |
3557
|
|
|
|
|
|
|
# chromosome |
3558
|
|
|
|
|
|
|
# $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on each |
3559
|
|
|
|
|
|
|
# chromosome |
3560
|
|
|
|
|
|
|
# |
3561
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
#=cut |
3563
|
|
|
|
|
|
|
|
3564
|
|
|
|
|
|
|
sub calculate_loh_score { |
3565
|
|
|
|
|
|
|
|
3566
|
|
|
|
|
|
|
#parse parameters |
3567
|
0
|
|
|
0
|
0
|
0
|
my $chr = shift; |
3568
|
0
|
|
|
|
|
0
|
my $start = shift; |
3569
|
0
|
|
|
|
|
0
|
my $end = shift; |
3570
|
|
|
|
|
|
|
|
3571
|
0
|
|
|
|
|
0
|
my $chromosome_cnv_breakpoints_hash_ref = shift; |
3572
|
0
|
|
|
|
|
0
|
my %chromosome_cnv_breakpoints_hash = %{$chromosome_cnv_breakpoints_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
3573
|
|
|
|
|
|
|
|
3574
|
0
|
|
|
|
|
0
|
my $chromosome_loh_breakpoints_hash_ref = shift; |
3575
|
0
|
|
|
|
|
0
|
my %chromosome_loh_breakpoints_hash = %{$chromosome_loh_breakpoints_hash_ref}; |
|
0
|
|
|
|
|
0
|
|
3576
|
|
|
|
|
|
|
|
3577
|
0
|
|
|
|
|
0
|
my @cnv_breakpoints; #stores cnv breakpoints in the region |
3578
|
|
|
|
|
|
|
my $cnv_breakpoints_size; #stores the number of cnv breakpoints in the region |
3579
|
|
|
|
|
|
|
|
3580
|
0
|
|
|
|
|
0
|
my @loh_breakpoints; #stores the LOH breakpoints in the region |
3581
|
0
|
|
|
|
|
0
|
my $loh_breakpoints_size; #stores the number of LOH breakpoints in the region |
3582
|
|
|
|
|
|
|
|
3583
|
|
|
|
|
|
|
#calculate maximum potential amount of heterozygosity |
3584
|
0
|
|
|
|
|
0
|
my $original_heterozygous_size = $end - $start; |
3585
|
|
|
|
|
|
|
|
3586
|
|
|
|
|
|
|
#calculate maximum pontentail amount of heterozygosity that can remain |
3587
|
0
|
|
|
|
|
0
|
my $remaining_heterozygous_size = $end - $start; |
3588
|
|
|
|
|
|
|
|
3589
|
0
|
|
|
|
|
0
|
my $loh_size = 0; #stores the size of all LOH regions in the region |
3590
|
|
|
|
|
|
|
|
3591
|
0
|
|
|
|
|
0
|
my $loh_score = 0; #final hallmark score |
3592
|
|
|
|
|
|
|
|
3593
|
|
|
|
|
|
|
#check if there is any cnv data for the chromosome |
3594
|
0
|
0
|
0
|
|
|
0
|
if( |
3595
|
|
|
|
|
|
|
( !defined($chromosome_cnv_breakpoints_hash{$chr}) ) || |
3596
|
|
|
|
|
|
|
( !defined($chromosome_loh_breakpoints_hash{$chr}) ) |
3597
|
|
|
|
|
|
|
){ |
3598
|
0
|
|
|
|
|
0
|
$loh_score = 0; |
3599
|
0
|
|
|
|
|
0
|
$loh_size = -1; |
3600
|
0
|
|
|
|
|
0
|
$remaining_heterozygous_size = -1; |
3601
|
0
|
|
|
|
|
0
|
return($loh_score, $loh_size, $remaining_heterozygous_size); |
3602
|
|
|
|
|
|
|
} |
3603
|
|
|
|
|
|
|
|
3604
|
|
|
|
|
|
|
#get the cnv breakpoints for the chromosome |
3605
|
0
|
|
|
|
|
0
|
@cnv_breakpoints = @{$chromosome_cnv_breakpoints_hash{$chr}}; |
|
0
|
|
|
|
|
0
|
|
3606
|
|
|
|
|
|
|
|
3607
|
|
|
|
|
|
|
#sort the list |
3608
|
9
|
|
|
9
|
|
3714
|
{use sort 'stable'; |
|
9
|
|
|
|
|
21
|
|
|
9
|
|
|
|
|
36
|
|
|
0
|
|
|
|
|
0
|
|
3609
|
0
|
|
|
|
|
0
|
@cnv_breakpoints = sort {$a <=> $b} @cnv_breakpoints; |
|
0
|
|
|
|
|
0
|
|
3610
|
|
|
|
|
|
|
}#use sort 'stable' |
3611
|
|
|
|
|
|
|
|
3612
|
|
|
|
|
|
|
#get the number of breakpoints |
3613
|
0
|
|
|
|
|
0
|
$cnv_breakpoints_size = @cnv_breakpoints; |
3614
|
|
|
|
|
|
|
|
3615
|
|
|
|
|
|
|
#find all the cnv events that occur in the region and subtract the size of these regions |
3616
|
|
|
|
|
|
|
#from the $original_heterozygous_size value |
3617
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i< $cnv_breakpoints_size; $i+=2){ |
3618
|
0
|
|
|
|
|
0
|
my $cnv_start = $i; |
3619
|
0
|
|
|
|
|
0
|
my $cnv_end = $i+1; |
3620
|
|
|
|
|
|
|
|
3621
|
0
|
|
|
|
|
0
|
my $end_overlap = 0; |
3622
|
0
|
|
|
|
|
0
|
my $start_overlap = 0; |
3623
|
|
|
|
|
|
|
|
3624
|
0
|
0
|
|
|
|
0
|
if($cnv_breakpoints[$cnv_start] > $end){ |
3625
|
0
|
|
|
|
|
0
|
last; |
3626
|
|
|
|
|
|
|
} |
3627
|
|
|
|
|
|
|
|
3628
|
|
|
|
|
|
|
#Check if the end point of the cnv region is within the suspect region |
3629
|
0
|
0
|
0
|
|
|
0
|
if($cnv_breakpoints[$cnv_end] >= $start && $cnv_breakpoints[$cnv_end] <= $end){ |
3630
|
0
|
|
|
|
|
0
|
$end_overlap = 1; |
3631
|
|
|
|
|
|
|
} |
3632
|
|
|
|
|
|
|
|
3633
|
|
|
|
|
|
|
#Check if the start point of the region cnv is within the suspect region |
3634
|
0
|
0
|
0
|
|
|
0
|
if($cnv_breakpoints[$cnv_start] >= $start && $cnv_breakpoints[$cnv_start] <= $end){ |
3635
|
0
|
|
|
|
|
0
|
$start_overlap = 1; |
3636
|
|
|
|
|
|
|
} |
3637
|
|
|
|
|
|
|
|
3638
|
|
|
|
|
|
|
#If an overlap was detected |
3639
|
0
|
0
|
0
|
|
|
0
|
if($start_overlap==1 && $end_overlap==1) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3640
|
0
|
|
|
|
|
0
|
$remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]); |
3641
|
|
|
|
|
|
|
} |
3642
|
|
|
|
|
|
|
elsif($start_overlap==1){ |
3643
|
0
|
|
|
|
|
0
|
$remaining_heterozygous_size -= ($end-$cnv_breakpoints[$cnv_start]); |
3644
|
|
|
|
|
|
|
} |
3645
|
|
|
|
|
|
|
elsif($end_overlap==1){ |
3646
|
0
|
|
|
|
|
0
|
$remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end]-$start); |
3647
|
|
|
|
|
|
|
} |
3648
|
|
|
|
|
|
|
elsif($cnv_breakpoints[$cnv_start] < $start && $cnv_breakpoints[$cnv_end] > $end){ |
3649
|
0
|
|
|
|
|
0
|
$remaining_heterozygous_size = 0; |
3650
|
|
|
|
|
|
|
} |
3651
|
|
|
|
|
|
|
} |
3652
|
|
|
|
|
|
|
|
3653
|
|
|
|
|
|
|
#check if there is no potential heterozygous regions in the suspect region or if there are no cnv events |
3654
|
|
|
|
|
|
|
#in the suspect region, if either is the case return a score of 0 |
3655
|
0
|
0
|
0
|
|
|
0
|
if($remaining_heterozygous_size == 0 || $remaining_heterozygous_size == $original_heterozygous_size){ |
3656
|
0
|
|
|
|
|
0
|
$loh_score = 0; |
3657
|
0
|
|
|
|
|
0
|
$loh_size = -1; |
3658
|
0
|
|
|
|
|
0
|
$remaining_heterozygous_size = -1; |
3659
|
0
|
|
|
|
|
0
|
return($loh_score, $loh_size, $remaining_heterozygous_size); |
3660
|
|
|
|
|
|
|
} |
3661
|
|
|
|
|
|
|
|
3662
|
|
|
|
|
|
|
#get a list of the LOH breakpoints on the chromosome |
3663
|
0
|
|
|
|
|
0
|
@loh_breakpoints = @{$chromosome_loh_breakpoints_hash{$chr}}; |
|
0
|
|
|
|
|
0
|
|
3664
|
|
|
|
|
|
|
|
3665
|
|
|
|
|
|
|
#sort the list |
3666
|
9
|
|
|
9
|
|
2594
|
{use sort 'stable'; |
|
9
|
|
|
|
|
19
|
|
|
9
|
|
|
|
|
38
|
|
|
0
|
|
|
|
|
0
|
|
3667
|
0
|
|
|
|
|
0
|
@loh_breakpoints = sort {$a <=> $b} @loh_breakpoints; |
|
0
|
|
|
|
|
0
|
|
3668
|
|
|
|
|
|
|
}#use sort 'stable' |
3669
|
|
|
|
|
|
|
|
3670
|
|
|
|
|
|
|
#get the number of breakpoints |
3671
|
0
|
|
|
|
|
0
|
$loh_breakpoints_size = @loh_breakpoints; |
3672
|
|
|
|
|
|
|
|
3673
|
|
|
|
|
|
|
#determine which LOH regions are in the suspect region |
3674
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i< $loh_breakpoints_size; $i+=2){ |
3675
|
0
|
|
|
|
|
0
|
my $start_overlap_region_loh = 0; |
3676
|
0
|
|
|
|
|
0
|
my $end_overlap_region_loh = 0; |
3677
|
|
|
|
|
|
|
|
3678
|
0
|
|
|
|
|
0
|
my $loh_start = $i; |
3679
|
0
|
|
|
|
|
0
|
my $loh_end = $i+1; |
3680
|
|
|
|
|
|
|
|
3681
|
0
|
|
|
|
|
0
|
my $loh_start_breakpoint = $loh_breakpoints[$loh_start]; |
3682
|
0
|
|
|
|
|
0
|
my $loh_end_breakpoint = $loh_breakpoints[$loh_end]; |
3683
|
0
|
|
|
|
|
0
|
my $loh_region_size; |
3684
|
|
|
|
|
|
|
|
3685
|
0
|
0
|
|
|
|
0
|
if($loh_breakpoints[$loh_start] > $end){ |
3686
|
0
|
|
|
|
|
0
|
last; |
3687
|
|
|
|
|
|
|
} |
3688
|
|
|
|
|
|
|
|
3689
|
|
|
|
|
|
|
#Check if the end point of the cnv region is within the loh region |
3690
|
0
|
0
|
0
|
|
|
0
|
if($loh_breakpoints[$loh_end] >= $start && $loh_breakpoints[$loh_end] <= $end){ |
3691
|
0
|
|
|
|
|
0
|
$end_overlap_region_loh = 1; |
3692
|
|
|
|
|
|
|
} |
3693
|
|
|
|
|
|
|
|
3694
|
|
|
|
|
|
|
#Check if the start point of region 1 is within region 2 |
3695
|
0
|
0
|
0
|
|
|
0
|
if($loh_breakpoints[$loh_start] >= $start && $loh_breakpoints[$loh_start] <= $end){ |
3696
|
0
|
|
|
|
|
0
|
$start_overlap_region_loh = 1; |
3697
|
|
|
|
|
|
|
} |
3698
|
|
|
|
|
|
|
|
3699
|
|
|
|
|
|
|
#If an overlap was detected |
3700
|
0
|
0
|
0
|
|
|
0
|
if($start_overlap_region_loh==1 && $end_overlap_region_loh!=1){ |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
3701
|
0
|
|
|
|
|
0
|
$loh_end_breakpoint = $end; |
3702
|
|
|
|
|
|
|
} |
3703
|
|
|
|
|
|
|
elsif($end_overlap_region_loh==1 && $start_overlap_region_loh!=1){ |
3704
|
0
|
|
|
|
|
0
|
$loh_start_breakpoint = $start; |
3705
|
|
|
|
|
|
|
} |
3706
|
|
|
|
|
|
|
elsif($loh_breakpoints[$loh_start] < $start && $loh_breakpoints[$loh_end] > $end){ |
3707
|
0
|
|
|
|
|
0
|
$loh_start_breakpoint = $start; |
3708
|
0
|
|
|
|
|
0
|
$loh_end_breakpoint = $end; |
3709
|
0
|
|
|
|
|
0
|
$start_overlap_region_loh=1; |
3710
|
0
|
|
|
|
|
0
|
$end_overlap_region_loh=1; |
3711
|
|
|
|
|
|
|
} |
3712
|
0
|
0
|
0
|
|
|
0
|
if($start_overlap_region_loh != 1 && $end_overlap_region_loh != 1){ #if the loh region is not in the suspect region go to the next loh region |
3713
|
0
|
|
|
|
|
0
|
next; |
3714
|
|
|
|
|
|
|
} |
3715
|
|
|
|
|
|
|
|
3716
|
|
|
|
|
|
|
#if the loh region is in the suspect region then reduce the size of the loh by subtracting the size of cnv regions that over lap with it |
3717
|
|
|
|
|
|
|
#this will tell us how much original heterozygosity remains |
3718
|
0
|
|
|
|
|
0
|
$loh_region_size = $loh_end_breakpoint - $loh_start_breakpoint; |
3719
|
|
|
|
|
|
|
|
3720
|
|
|
|
|
|
|
#check for overlaps between the LOH region and cnv regions |
3721
|
0
|
|
|
|
|
0
|
for (my $k = 0; $k< $cnv_breakpoints_size; $k+=2){ |
3722
|
0
|
|
|
|
|
0
|
my $start_overlap_loh_cnv = 0; |
3723
|
0
|
|
|
|
|
0
|
my $end_overlap_loh_cnv = 0; |
3724
|
|
|
|
|
|
|
|
3725
|
0
|
|
|
|
|
0
|
my $cnv_start = $k; |
3726
|
0
|
|
|
|
|
0
|
my $cnv_end = $k+1; |
3727
|
|
|
|
|
|
|
|
3728
|
0
|
0
|
|
|
|
0
|
if($cnv_breakpoints[$cnv_start] > $loh_end_breakpoint){ |
3729
|
0
|
|
|
|
|
0
|
last; |
3730
|
|
|
|
|
|
|
} |
3731
|
|
|
|
|
|
|
|
3732
|
|
|
|
|
|
|
#Check if the end point of the cnv region is within the loh region |
3733
|
0
|
0
|
0
|
|
|
0
|
if($cnv_breakpoints[$cnv_end] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] <= $loh_end_breakpoint){ |
3734
|
0
|
|
|
|
|
0
|
$end_overlap_loh_cnv = 1; |
3735
|
|
|
|
|
|
|
} |
3736
|
|
|
|
|
|
|
|
3737
|
|
|
|
|
|
|
#Check if the start point of region 1 is within region 2 |
3738
|
0
|
0
|
0
|
|
|
0
|
if($cnv_breakpoints[$cnv_start] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_start] <= $loh_end_breakpoint){ |
3739
|
0
|
|
|
|
|
0
|
$start_overlap_loh_cnv = 1; |
3740
|
|
|
|
|
|
|
} |
3741
|
|
|
|
|
|
|
|
3742
|
|
|
|
|
|
|
#If an overlap was detected |
3743
|
0
|
0
|
0
|
|
|
0
|
if($start_overlap_loh_cnv==1 && $end_overlap_loh_cnv==1) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3744
|
0
|
|
|
|
|
0
|
$loh_region_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]); |
3745
|
|
|
|
|
|
|
} |
3746
|
|
|
|
|
|
|
elsif($start_overlap_loh_cnv==1){ |
3747
|
0
|
|
|
|
|
0
|
$loh_region_size -= ($loh_end_breakpoint-$cnv_breakpoints[$cnv_start]); |
3748
|
|
|
|
|
|
|
} |
3749
|
|
|
|
|
|
|
elsif($end_overlap_loh_cnv==1){ |
3750
|
0
|
|
|
|
|
0
|
$loh_region_size -= ($cnv_breakpoints[$cnv_end]-$loh_start_breakpoint); |
3751
|
|
|
|
|
|
|
} |
3752
|
|
|
|
|
|
|
elsif($cnv_breakpoints[$cnv_start] < $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] > $loh_end_breakpoint){ |
3753
|
0
|
|
|
|
|
0
|
$loh_region_size = 0; |
3754
|
|
|
|
|
|
|
} |
3755
|
|
|
|
|
|
|
} |
3756
|
0
|
|
|
|
|
0
|
$loh_size += $loh_region_size; |
3757
|
|
|
|
|
|
|
} |
3758
|
|
|
|
|
|
|
|
3759
|
|
|
|
|
|
|
|
3760
|
|
|
|
|
|
|
#calculate the LOH score |
3761
|
0
|
|
|
|
|
0
|
$loh_score = 1 - ($loh_size/$remaining_heterozygous_size); |
3762
|
|
|
|
|
|
|
|
3763
|
0
|
0
|
|
|
|
0
|
if($loh_size> $remaining_heterozygous_size){ |
3764
|
0
|
|
|
|
|
0
|
die "ERROR: invalid LOH size value found\n"; |
3765
|
|
|
|
|
|
|
} |
3766
|
|
|
|
|
|
|
|
3767
|
0
|
|
|
|
|
0
|
return($loh_score, $loh_size, $remaining_heterozygous_size); |
3768
|
|
|
|
|
|
|
|
3769
|
|
|
|
|
|
|
}#sub calculate_loh_score |
3770
|
|
|
|
|
|
|
|
3771
|
|
|
|
|
|
|
|
3772
|
|
|
|
|
|
|
#=head2 Sub-Method: standard_deviation_and_mean |
3773
|
|
|
|
|
|
|
|
3774
|
|
|
|
|
|
|
### standard_deviation_and_mean ################################################################### |
3775
|
|
|
|
|
|
|
# Description: |
3776
|
|
|
|
|
|
|
# Calculates the standard deviation and mean for a given set of values |
3777
|
|
|
|
|
|
|
# |
3778
|
|
|
|
|
|
|
# Input variables: |
3779
|
|
|
|
|
|
|
# $data_ref: reference to either a hash or an array |
3780
|
|
|
|
|
|
|
# $type: 0 indicates a hash, 1 indicates an array |
3781
|
|
|
|
|
|
|
# |
3782
|
|
|
|
|
|
|
|
3783
|
|
|
|
|
|
|
#=cut |
3784
|
|
|
|
|
|
|
|
3785
|
|
|
|
|
|
|
sub standard_deviation_and_mean{ |
3786
|
|
|
|
|
|
|
|
3787
|
|
|
|
|
|
|
#parse parameters |
3788
|
11
|
|
|
11
|
0
|
2881
|
my $data_ref = shift; |
3789
|
11
|
|
|
|
|
12
|
my $type = shift; |
3790
|
|
|
|
|
|
|
|
3791
|
11
|
|
|
|
|
9
|
my %hash; |
3792
|
|
|
|
|
|
|
my @array; |
3793
|
0
|
|
|
|
|
0
|
my $size; |
3794
|
|
|
|
|
|
|
|
3795
|
11
|
|
|
|
|
9
|
my $mean = 0; |
3796
|
11
|
|
|
|
|
11
|
my $SD = 0; |
3797
|
|
|
|
|
|
|
|
3798
|
11
|
100
|
|
|
|
24
|
if($type==0){ |
|
|
100
|
|
|
|
|
|
3799
|
5
|
|
|
|
|
5
|
%hash = %{$data_ref}; |
|
5
|
|
|
|
|
27
|
|
3800
|
|
|
|
|
|
|
|
3801
|
3
|
100
|
|
|
|
10
|
if((scalar(keys %hash))==0){ |
3802
|
1
|
|
|
|
|
5
|
die"Found sample size of 0 when calculating SD-hash\n"; |
3803
|
|
|
|
|
|
|
} |
3804
|
|
|
|
|
|
|
|
3805
|
|
|
|
|
|
|
#calculate mean |
3806
|
2
|
|
|
|
|
6
|
for my $key (keys %hash){ |
3807
|
11
|
|
|
|
|
12
|
$mean += $hash{$key}; |
3808
|
|
|
|
|
|
|
} |
3809
|
|
|
|
|
|
|
|
3810
|
2
|
|
|
|
|
5
|
$mean = $mean/(scalar(keys %hash)); |
3811
|
|
|
|
|
|
|
|
3812
|
|
|
|
|
|
|
#calculate sum of squared differences |
3813
|
2
|
|
|
|
|
3
|
for my $key (keys %hash){ |
3814
|
11
|
|
|
|
|
16
|
$SD += ($hash{$key}-$mean)**2; |
3815
|
|
|
|
|
|
|
} |
3816
|
|
|
|
|
|
|
|
3817
|
|
|
|
|
|
|
#calculate final standard deviation value |
3818
|
2
|
|
|
|
|
4
|
$SD = $SD/(scalar(keys %hash)); |
3819
|
2
|
|
|
|
|
4
|
$SD = $SD**0.5; |
3820
|
|
|
|
|
|
|
} |
3821
|
|
|
|
|
|
|
elsif($type==1){ |
3822
|
5
|
|
|
|
|
5
|
@array = @{$data_ref}; |
|
5
|
|
|
|
|
24
|
|
3823
|
3
|
|
|
|
|
2
|
$size = @array; |
3824
|
|
|
|
|
|
|
|
3825
|
3
|
100
|
|
|
|
8
|
if($size==0){ |
3826
|
1
|
|
|
|
|
8
|
die"Found sample size of 0 when calculating SD-array\n"; |
3827
|
|
|
|
|
|
|
} |
3828
|
|
|
|
|
|
|
|
3829
|
|
|
|
|
|
|
#calculate mean |
3830
|
2
|
|
|
|
|
3
|
foreach my $val (@array){ |
3831
|
11
|
|
|
|
|
12
|
$mean += $val; |
3832
|
|
|
|
|
|
|
} |
3833
|
|
|
|
|
|
|
|
3834
|
2
|
|
|
|
|
3
|
$mean = $mean/$size; |
3835
|
|
|
|
|
|
|
|
3836
|
|
|
|
|
|
|
#calculate sum of squared differences |
3837
|
2
|
|
|
|
|
3
|
foreach my $val (@array){ |
3838
|
11
|
|
|
|
|
22
|
$SD += ($val-$mean)**2; |
3839
|
|
|
|
|
|
|
} |
3840
|
|
|
|
|
|
|
|
3841
|
|
|
|
|
|
|
#calculate final standard deviation value |
3842
|
2
|
|
|
|
|
4
|
$SD = $SD/$size; |
3843
|
2
|
|
|
|
|
10
|
$SD = $SD**0.5; |
3844
|
|
|
|
|
|
|
|
3845
|
|
|
|
|
|
|
} |
3846
|
|
|
|
|
|
|
else{ |
3847
|
1
|
|
|
|
|
6
|
die"ERROR: invalid SD/mean type found\n"; |
3848
|
|
|
|
|
|
|
} |
3849
|
4
|
|
|
|
|
13
|
return ($SD, $mean); |
3850
|
|
|
|
|
|
|
|
3851
|
|
|
|
|
|
|
}#sub standard_deviation_and_mean |
3852
|
|
|
|
|
|
|
|
3853
|
|
|
|
|
|
|
|
3854
|
|
|
|
|
|
|
### next_arg ###################################################################################### |
3855
|
|
|
|
|
|
|
# Parse the next arguement from the command line |
3856
|
|
|
|
|
|
|
# |
3857
|
|
|
|
|
|
|
sub next_arg { |
3858
|
0
|
|
|
0
|
0
|
0
|
my $code = shift; |
3859
|
0
|
|
|
|
|
0
|
$pos++; |
3860
|
0
|
0
|
|
|
|
0
|
if($pos == $ARGC){ |
3861
|
0
|
|
|
|
|
0
|
usage($code); |
3862
|
|
|
|
|
|
|
} |
3863
|
|
|
|
|
|
|
}#sub next_arg |
3864
|
|
|
|
|
|
|
|
3865
|
|
|
|
|
|
|
### man_text ###################################################################################### |
3866
|
|
|
|
|
|
|
# Print the manual help text |
3867
|
|
|
|
|
|
|
# |
3868
|
|
|
|
|
|
|
sub man_text { |
3869
|
0
|
|
|
0
|
0
|
0
|
print "Main Usage:\n"; |
3870
|
0
|
|
|
|
|
0
|
print "\tperl -w shatterproof.pl --cnv --trans [--insrt ] [--loh ] [--tp53] --config --output \n"; |
3871
|
0
|
|
|
|
|
0
|
print "\n"; |
3872
|
0
|
|
|
|
|
0
|
print "\tArguments:\n"; |
3873
|
0
|
|
|
|
|
0
|
print "\t\t--cnv\t\tDefine the path to the directory containing the CNV input files\n"; |
3874
|
0
|
|
|
|
|
0
|
print "\t\t--trans\t\tDefine the path to the directory containing the Translocation input files\n"; |
3875
|
0
|
|
|
|
|
0
|
print "\t\t--insrt\t\tDefine the path to the directory containing the insertion VCF input files\n"; |
3876
|
0
|
|
|
|
|
0
|
print "\t\t--loh\t\tDefine the path to the directory containing the LOH input files\n"; |
3877
|
0
|
|
|
|
|
0
|
print "\t\t--tp53\t\tIndicate that TP53 should be considered mutated, regardless of data\n"; |
3878
|
0
|
|
|
|
|
0
|
print "\t\t--config\tDefine the path to the ShatterProof config file\n"; |
3879
|
0
|
|
|
|
|
0
|
print "\t\t--output\tDefine the path to the directory where output should be placed\n"; |
3880
|
0
|
|
|
|
|
0
|
print "\t\tdir\t\tPath to a directory\n"; |
3881
|
0
|
|
|
|
|
0
|
print "\t\tpath\t\tPath to a file\n"; |
3882
|
0
|
|
|
|
|
0
|
print "\n"; |
3883
|
0
|
|
|
|
|
0
|
print "Help Usage:\n"; |
3884
|
0
|
|
|
|
|
0
|
print "\tperl -w shatterproof.pl --help\t\tThis help message.\n"; |
3885
|
0
|
|
|
|
|
0
|
print "\n"; |
3886
|
0
|
|
|
|
|
0
|
exit 0; |
3887
|
|
|
|
|
|
|
}#sub man_text |
3888
|
|
|
|
|
|
|
|
3889
|
|
|
|
|
|
|
### usage ######################################################################################### |
3890
|
|
|
|
|
|
|
# Prints an error message when invalid command line arguements are found |
3891
|
|
|
|
|
|
|
# |
3892
|
|
|
|
|
|
|
sub usage { |
3893
|
0
|
|
|
0
|
0
|
0
|
my $usage_msg = shift; |
3894
|
|
|
|
|
|
|
|
3895
|
0
|
|
|
|
|
0
|
print "u $usage_msg \n"; |
3896
|
|
|
|
|
|
|
|
3897
|
0
|
|
|
|
|
0
|
given($usage_msg){ |
3898
|
0
|
|
|
|
|
0
|
when (/^0/) { print "ERROR: missing arguments\n"; } |
|
0
|
|
|
|
|
0
|
|
3899
|
0
|
|
|
|
|
0
|
when (/^1/) { print "ERROR: 2nd argument missing\n"; } |
|
0
|
|
|
|
|
0
|
|
3900
|
0
|
|
|
|
|
0
|
when (/^2/) { print "ERROR: CNV directory missing\n"; } |
|
0
|
|
|
|
|
0
|
|
3901
|
0
|
|
|
|
|
0
|
when (/^3/) { print "ERROR: --trans option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3902
|
0
|
|
|
|
|
0
|
when (/^4/) { print "ERROR: --cnv option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3903
|
0
|
|
|
|
|
0
|
when (/^5/) { print "ERROR: Translocation directory missing\n" } |
|
0
|
|
|
|
|
0
|
|
3904
|
0
|
|
|
|
|
0
|
when (/^6/) { print "ERROR: --config option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3905
|
0
|
|
|
|
|
0
|
when (/^7/) { print "ERROR: --trans option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3906
|
0
|
|
|
|
|
0
|
when (/^8/) { print "ERROR: insertion directory missing\n" } |
|
0
|
|
|
|
|
0
|
|
3907
|
0
|
|
|
|
|
0
|
when (/^9/) { print "ERROR: --config option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3908
|
0
|
|
|
|
|
0
|
when (/^10/) { print "ERROR: LOH directory missing \n" } |
|
0
|
|
|
|
|
0
|
|
3909
|
0
|
|
|
|
|
0
|
when (/^11/) { print "ERROR: --config option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3910
|
0
|
|
|
|
|
0
|
when (/^12/) { print "ERROR: --config option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3911
|
0
|
|
|
|
|
0
|
when (/^13/) { print "ERROR: Path to config file missing\n" } |
|
0
|
|
|
|
|
0
|
|
3912
|
0
|
|
|
|
|
0
|
when (/^14/) { print "ERROR: --output option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3913
|
0
|
|
|
|
|
0
|
when (/^15/) { print "ERROR: --config option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3914
|
0
|
|
|
|
|
0
|
when (/^16/) { print "ERROR: Output directory missing\n" } |
|
0
|
|
|
|
|
0
|
|
3915
|
0
|
|
|
|
|
0
|
when (/^17/) { print "ERROR: --output option missing\n" } |
|
0
|
|
|
|
|
0
|
|
3916
|
0
|
|
|
|
|
0
|
when (/^18/) { print "ERROR: too many arguments\n" } |
|
0
|
|
|
|
|
0
|
|
3917
|
|
|
|
|
|
|
} |
3918
|
0
|
|
|
|
|
0
|
print "Try perl -w shatteproof.pl --help\n"; |
3919
|
0
|
|
|
|
|
0
|
exit 0; |
3920
|
|
|
|
|
|
|
}#sub usage |
3921
|
|
|
|
|
|
|
|
3922
|
|
|
|
|
|
|
### initialize_genome_hash ######################################################################## |
3923
|
|
|
|
|
|
|
# Description: |
3924
|
|
|
|
|
|
|
# Initializes a hash to store an array for each chromosome |
3925
|
|
|
|
|
|
|
# |
3926
|
|
|
|
|
|
|
sub initialize_genome_hash { |
3927
|
|
|
|
|
|
|
|
3928
|
0
|
|
|
0
|
0
|
0
|
my %genome_region_data = ( #{chr}[region_num]->%region_data |
3929
|
|
|
|
|
|
|
X => [], |
3930
|
|
|
|
|
|
|
Y => [], |
3931
|
|
|
|
|
|
|
1 => [], |
3932
|
|
|
|
|
|
|
2 => [], |
3933
|
|
|
|
|
|
|
3 => [], |
3934
|
|
|
|
|
|
|
4 => [], |
3935
|
|
|
|
|
|
|
5 => [], |
3936
|
|
|
|
|
|
|
6 => [], |
3937
|
|
|
|
|
|
|
7 => [], |
3938
|
|
|
|
|
|
|
8 => [], |
3939
|
|
|
|
|
|
|
9 => [], |
3940
|
|
|
|
|
|
|
10 => [], |
3941
|
|
|
|
|
|
|
11 => [], |
3942
|
|
|
|
|
|
|
12 => [], |
3943
|
|
|
|
|
|
|
13 => [], |
3944
|
|
|
|
|
|
|
14 => [], |
3945
|
|
|
|
|
|
|
15 => [], |
3946
|
|
|
|
|
|
|
16 => [], |
3947
|
|
|
|
|
|
|
17 => [], |
3948
|
|
|
|
|
|
|
18 => [], |
3949
|
|
|
|
|
|
|
19 => [], |
3950
|
|
|
|
|
|
|
20 => [], |
3951
|
|
|
|
|
|
|
21 => [], |
3952
|
|
|
|
|
|
|
22 => [] |
3953
|
|
|
|
|
|
|
); |
3954
|
|
|
|
|
|
|
|
3955
|
0
|
|
|
|
|
0
|
return (\%genome_region_data); |
3956
|
|
|
|
|
|
|
}#sub initialize_genome_hash |
3957
|
|
|
|
|
|
|
|
3958
|
|
|
|
|
|
|
### load_config_file ######################################################################################### |
3959
|
|
|
|
|
|
|
# Description: |
3960
|
|
|
|
|
|
|
# Opens the config file and reads the parameter values from it |
3961
|
|
|
|
|
|
|
# |
3962
|
|
|
|
|
|
|
# Input variables: |
3963
|
|
|
|
|
|
|
# $path: path to config file |
3964
|
|
|
|
|
|
|
# |
3965
|
|
|
|
|
|
|
sub load_config_file { |
3966
|
|
|
|
|
|
|
|
3967
|
8
|
|
|
8
|
0
|
1730
|
my $path = shift; |
3968
|
8
|
|
|
|
|
520
|
print "\nLoading configuration file"; |
3969
|
|
|
|
|
|
|
|
3970
|
|
|
|
|
|
|
#Load the configuration file config.pl |
3971
|
8
|
|
|
|
|
19
|
my $CONFIG; |
3972
|
8
|
50
|
|
|
|
386
|
open($CONFIG, "<","$path") or die "COULD NOT OPEN CONFIG FILE at path: $path \n"; |
3973
|
8
|
|
|
|
|
1009
|
eval (<$CONFIG>) while (!eof($CONFIG)); |
3974
|
8
|
|
|
|
|
93
|
close($CONFIG); |
3975
|
|
|
|
|
|
|
|
3976
|
8
|
|
|
|
|
728
|
print " - Done\n"; |
3977
|
8
|
|
|
|
|
87
|
1; |
3978
|
|
|
|
|
|
|
}#sub load_config_file |
3979
|
|
|
|
|
|
|
|
3980
|
|
|
|
|
|
|
=head1 NAME |
3981
|
|
|
|
|
|
|
|
3982
|
|
|
|
|
|
|
ShatterProof - a script for analyzing next-generation sequencing data |
3983
|
|
|
|
|
|
|
|
3984
|
|
|
|
|
|
|
=head1 SYNOPSIS |
3985
|
|
|
|
|
|
|
|
3986
|
|
|
|
|
|
|
use Shatterproof |
3987
|
|
|
|
|
|
|
|
3988
|
|
|
|
|
|
|
See "shatterproof.pl" in the scripts directory for a simple perl script which calls the ShatterProof module |
3989
|
|
|
|
|
|
|
|
3990
|
|
|
|
|
|
|
Call ShatterProof via: |
3991
|
|
|
|
|
|
|
|
3992
|
|
|
|
|
|
|
ShatterProof::run(\@ARGV); |
3993
|
|
|
|
|
|
|
|
3994
|
|
|
|
|
|
|
=head1 DESCRIPTION |
3995
|
|
|
|
|
|
|
|
3996
|
|
|
|
|
|
|
ShatterProof is a tool that can be used to analyze next generation sequencing data for signs of chromothripsis. ShatterProof is implemented as a Perl module that processes input files and produces output files in both tab-delimited and YAML format. Perl version 5.0 or greater is required to run ShatterProof. Link to publication will be posted soon. |
3997
|
|
|
|
|
|
|
|
3998
|
|
|
|
|
|
|
=head1 README |
3999
|
|
|
|
|
|
|
|
4000
|
|
|
|
|
|
|
=head2 Installing ShatterProof |
4001
|
|
|
|
|
|
|
|
4002
|
|
|
|
|
|
|
To install this module type the following: |
4003
|
|
|
|
|
|
|
|
4004
|
|
|
|
|
|
|
perl Makefile.PL |
4005
|
|
|
|
|
|
|
make |
4006
|
|
|
|
|
|
|
make test |
4007
|
|
|
|
|
|
|
make install |
4008
|
|
|
|
|
|
|
|
4009
|
|
|
|
|
|
|
Make sure that you have admin permission rights when running the previous commands. |
4010
|
|
|
|
|
|
|
|
4011
|
|
|
|
|
|
|
=head2 Input File Types |
4012
|
|
|
|
|
|
|
|
4013
|
|
|
|
|
|
|
ShatterProof bases its analysis of genomic data on calls of translocations, copy number variations (CNV), loss of heterozygosity (LOH) and insertions. |
4014
|
|
|
|
|
|
|
ShatterProof can takes as input 4 different types of input files. See the scripts/conversion_scripts directory for some Perl scripts which will convert some common tools' output to the required input formats. |
4015
|
|
|
|
|
|
|
|
4016
|
|
|
|
|
|
|
=head3 Translocation Input Files (.spt) |
4017
|
|
|
|
|
|
|
|
4018
|
|
|
|
|
|
|
Tab delimited columns |
4019
|
|
|
|
|
|
|
First line is header line: |
4020
|
|
|
|
|
|
|
#chr1 start end chr2 start end quality |
4021
|
|
|
|
|
|
|
|
4022
|
|
|
|
|
|
|
Example data entry line: |
4023
|
|
|
|
|
|
|
|
4024
|
|
|
|
|
|
|
1 1000 2000 4 4000 5000 78 |
4025
|
|
|
|
|
|
|
|
4026
|
|
|
|
|
|
|
If no value is available for quality, use a "." eg.: |
4027
|
|
|
|
|
|
|
|
4028
|
|
|
|
|
|
|
1 1000 2000 4 4000 5000 . |
4029
|
|
|
|
|
|
|
|
4030
|
|
|
|
|
|
|
|
4031
|
|
|
|
|
|
|
=head3 Copy-Number Input Files (.spc) |
4032
|
|
|
|
|
|
|
|
4033
|
|
|
|
|
|
|
Tab delimited columns |
4034
|
|
|
|
|
|
|
First line is header line: |
4035
|
|
|
|
|
|
|
#chr start end number quality |
4036
|
|
|
|
|
|
|
|
4037
|
|
|
|
|
|
|
|
4038
|
|
|
|
|
|
|
Example data entry line: |
4039
|
|
|
|
|
|
|
12 2000 3000 2 63 |
4040
|
|
|
|
|
|
|
|
4041
|
|
|
|
|
|
|
If no value is available for quality, use a "." eg.: |
4042
|
|
|
|
|
|
|
|
4043
|
|
|
|
|
|
|
12 2000 3000 2 . |
4044
|
|
|
|
|
|
|
|
4045
|
|
|
|
|
|
|
=head3 Loss of Heterozygozity Input Files (.spl) |
4046
|
|
|
|
|
|
|
|
4047
|
|
|
|
|
|
|
Tab delimited columns |
4048
|
|
|
|
|
|
|
First line is header line: |
4049
|
|
|
|
|
|
|
#chr start end quality |
4050
|
|
|
|
|
|
|
|
4051
|
|
|
|
|
|
|
|
4052
|
|
|
|
|
|
|
Example data entry line: |
4053
|
|
|
|
|
|
|
|
4054
|
|
|
|
|
|
|
12 2000 3000 63 |
4055
|
|
|
|
|
|
|
|
4056
|
|
|
|
|
|
|
If no value is available for quality, use a "." eg.: |
4057
|
|
|
|
|
|
|
|
4058
|
|
|
|
|
|
|
12 2000 3000 . |
4059
|
|
|
|
|
|
|
|
4060
|
|
|
|
|
|
|
=head3 Insertion Input Files (.vcf) |
4061
|
|
|
|
|
|
|
|
4062
|
|
|
|
|
|
|
Additionally, ShatterProof accepts insertion calls in VCF files as input. See http://www.1000genomes.org/node/101 for details on the VCF file format. |
4063
|
|
|
|
|
|
|
ShatterProof analyzes the CHROM and POS fields of these files. |
4064
|
|
|
|
|
|
|
|
4065
|
|
|
|
|
|
|
|
4066
|
|
|
|
|
|
|
=head2 Configuring ShatterProof |
4067
|
|
|
|
|
|
|
|
4068
|
|
|
|
|
|
|
See the config.pl file in the scripts directory for a sample ShatterProof configuration file. |
4069
|
|
|
|
|
|
|
|
4070
|
|
|
|
|
|
|
$bin_size: number (integer) of base pairs to include in each bin of the sliding window analysis |
4071
|
|
|
|
|
|
|
|
4072
|
|
|
|
|
|
|
$localization_window_size: number (integer) of bins to include in each window of the sliding window analysis |
4073
|
|
|
|
|
|
|
|
4074
|
|
|
|
|
|
|
$expected_mutation_density: a reference value (double) used in determining if the concentration of translocation events on a particular chromosome is higher than expected. |
4075
|
|
|
|
|
|
|
|
4076
|
|
|
|
|
|
|
$collapse_regions: |
4077
|
|
|
|
|
|
|
|
4078
|
|
|
|
|
|
|
flag variable |
4079
|
|
|
|
|
|
|
|
4080
|
|
|
|
|
|
|
value 1: merge overlapping CNV regions that have the same copy number |
4081
|
|
|
|
|
|
|
|
4082
|
|
|
|
|
|
|
value 0: do not merge overlapping CNV regions that have the same copy number. If such regions are found an error is thrown |
4083
|
|
|
|
|
|
|
|
4084
|
|
|
|
|
|
|
$outlier_deviations: the number of standard deviations away from the mean a value has to be in order to be considered non-significant. Used to identify highly mutated regions. |
4085
|
|
|
|
|
|
|
|
4086
|
|
|
|
|
|
|
$translocation_cut_off_count: the maximum number of translocation chromosomes to tolerate before the translocation score for a region is set to 0. |
4087
|
|
|
|
|
|
|
|
4088
|
|
|
|
|
|
|
$genome_localization_weight: weight given to the localization of mutations to one chromosome hallmark |
4089
|
|
|
|
|
|
|
|
4090
|
|
|
|
|
|
|
$chromosome_localization_weight: weight given to the localization of mutations to one area of a particular chromosome hallmark |
4091
|
|
|
|
|
|
|
|
4092
|
|
|
|
|
|
|
$cnv_weight: weight given to the concentrated CNV hallmark |
4093
|
|
|
|
|
|
|
|
4094
|
|
|
|
|
|
|
$translocation_weight: weight give to the concentrated translocations hallmark |
4095
|
|
|
|
|
|
|
|
4096
|
|
|
|
|
|
|
$insertion_breakpoint_weight: weight given the the short breakpoint insertions hallmark |
4097
|
|
|
|
|
|
|
|
4098
|
|
|
|
|
|
|
$loh_weight: weight given to the loss/retention of heterozygosity hallmark |
4099
|
|
|
|
|
|
|
|
4100
|
|
|
|
|
|
|
$tp53_mutated_weight: weight given to the TP53 mutation hallmark |
4101
|
|
|
|
|
|
|
|
4102
|
|
|
|
|
|
|
|
4103
|
|
|
|
|
|
|
=head2 Running ShatterProof |
4104
|
|
|
|
|
|
|
|
4105
|
|
|
|
|
|
|
From the scripts directory run execute the shatterproof.pl file using Perl. |
4106
|
|
|
|
|
|
|
|
4107
|
|
|
|
|
|
|
Main Usage: |
4108
|
|
|
|
|
|
|
|
4109
|
|
|
|
|
|
|
perl -w shatterproof.pl --cnv --trans [--insrt ] [--loh ] [--tp53] --config --output |
4110
|
|
|
|
|
|
|
|
4111
|
|
|
|
|
|
|
Arguments: |
4112
|
|
|
|
|
|
|
|
4113
|
|
|
|
|
|
|
--cnv Define the path to the directory containing the CNV input files |
4114
|
|
|
|
|
|
|
|
4115
|
|
|
|
|
|
|
--trans Define the path to the directory containing the Translocation input files |
4116
|
|
|
|
|
|
|
|
4117
|
|
|
|
|
|
|
--insrt Define the path to the directory containing the insertion VCF input files |
4118
|
|
|
|
|
|
|
|
4119
|
|
|
|
|
|
|
--loh Define the path to the directory containing the LOH input files |
4120
|
|
|
|
|
|
|
|
4121
|
|
|
|
|
|
|
--tp53 Indicate that TP53 should be considered mutated, regardless of data |
4122
|
|
|
|
|
|
|
|
4123
|
|
|
|
|
|
|
--config Define the path to the ShatterProof config file |
4124
|
|
|
|
|
|
|
|
4125
|
|
|
|
|
|
|
--output Define the path to the directory where output should be placed |
4126
|
|
|
|
|
|
|
|
4127
|
|
|
|
|
|
|
dir Path to a directory |
4128
|
|
|
|
|
|
|
|
4129
|
|
|
|
|
|
|
path Path to a file |
4130
|
|
|
|
|
|
|
|
4131
|
|
|
|
|
|
|
=head1 PREREQUISITES |
4132
|
|
|
|
|
|
|
|
4133
|
|
|
|
|
|
|
strict; |
4134
|
|
|
|
|
|
|
warnings; |
4135
|
|
|
|
|
|
|
Carp; |
4136
|
|
|
|
|
|
|
Switch; |
4137
|
|
|
|
|
|
|
File::Basename; |
4138
|
|
|
|
|
|
|
List::Util qw[min max]; |
4139
|
|
|
|
|
|
|
Statistics::Distributions; |
4140
|
|
|
|
|
|
|
POSIX |
4141
|
|
|
|
|
|
|
|
4142
|
|
|
|
|
|
|
=pod OSNAMES |
4143
|
|
|
|
|
|
|
|
4144
|
|
|
|
|
|
|
any |
4145
|
|
|
|
|
|
|
|
4146
|
|
|
|
|
|
|
=pod SCRIPT CATEGORIES |
4147
|
|
|
|
|
|
|
|
4148
|
|
|
|
|
|
|
CPAN |
4149
|
|
|
|
|
|
|
|
4150
|
|
|
|
|
|
|
=cut |
4151
|
|
|
|
|
|
|
|
4152
|
|
|
|
|
|
|
1; |
4153
|
|
|
|
|
|
|
__END__ |