| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::Tradis::Analysis::InsertSite; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Take in a bam file and plot the start position of each read |
|
3
|
|
|
|
|
|
|
$Bio::Tradis::Analysis::InsertSite::VERSION = '1.3.2'; |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
89207
|
use Moose; |
|
|
4
|
|
|
|
|
404035
|
|
|
|
4
|
|
|
|
|
36
|
|
|
7
|
4
|
|
|
4
|
|
25188
|
use Bio::Tradis::Parser::Bam; |
|
|
4
|
|
|
|
|
13
|
|
|
|
4
|
|
|
|
|
136
|
|
|
8
|
4
|
|
|
4
|
|
1587
|
use Bio::Tradis::Parser::Cigar; |
|
|
4
|
|
|
|
|
12
|
|
|
|
4
|
|
|
|
|
3840
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
has 'filename' => ( is => 'rw', isa => 'Str', required => 1 ); |
|
11
|
|
|
|
|
|
|
has 'output_base_filename' => ( is => 'rw', isa => 'Str', required => 1 ); |
|
12
|
|
|
|
|
|
|
has 'mapping_score' => ( is => 'ro', isa => 'Int', required => 1 ); |
|
13
|
|
|
|
|
|
|
has 'samtools_exec' => ( is => 'ro', isa => 'Str', default => 'samtools' ); |
|
14
|
|
|
|
|
|
|
has '_output_file_handles' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 ); |
|
15
|
|
|
|
|
|
|
has '_sequence_names' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 ); |
|
16
|
|
|
|
|
|
|
has '_sequence_base_counters' => |
|
17
|
|
|
|
|
|
|
( is => 'rw', isa => 'HashRef', lazy_build => 1 ); |
|
18
|
|
|
|
|
|
|
has '_sequence_information' => |
|
19
|
|
|
|
|
|
|
( is => 'rw', isa => 'HashRef', lazy_build => 1 ); |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
has '_frequency_of_read_start' => ( |
|
22
|
|
|
|
|
|
|
is => 'rw', |
|
23
|
|
|
|
|
|
|
isa => 'HashRef', |
|
24
|
|
|
|
|
|
|
lazy => 1, |
|
25
|
|
|
|
|
|
|
builder => '_build__frequency_of_read_start' |
|
26
|
|
|
|
|
|
|
); |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub _build__sequence_information { |
|
29
|
3
|
|
|
3
|
|
7
|
my ($self) = @_; |
|
30
|
3
|
|
|
|
|
72
|
my %all_sequences_info = |
|
31
|
|
|
|
|
|
|
Bio::Tradis::Parser::Bam->new( file => $self->filename, samtools_exec => $self->samtools_exec )->seq_info; |
|
32
|
3
|
|
|
|
|
258
|
return \%all_sequences_info; |
|
33
|
|
|
|
|
|
|
} |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub _build__sequence_names { |
|
36
|
3
|
|
|
3
|
|
13
|
my ($self) = @_; |
|
37
|
3
|
|
|
|
|
6
|
my @sequence_names = keys %{ $self->_sequence_information }; |
|
|
3
|
|
|
|
|
93
|
|
|
38
|
3
|
|
|
|
|
75
|
return \@sequence_names; |
|
39
|
|
|
|
|
|
|
} |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub _build__sequence_base_counters { |
|
42
|
0
|
|
|
0
|
|
0
|
my ($self) = @_; |
|
43
|
0
|
|
|
|
|
0
|
my %sequence_base_counters; |
|
44
|
0
|
|
|
|
|
0
|
for my $sequence_name ( @{ $self->_sequence_names } ) { |
|
|
0
|
|
|
|
|
0
|
|
|
45
|
0
|
|
|
|
|
0
|
$sequence_base_counters{$sequence_name} = 0; |
|
46
|
|
|
|
|
|
|
} |
|
47
|
0
|
|
|
|
|
0
|
return \%sequence_base_counters; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _build__output_file_handles { |
|
51
|
3
|
|
|
3
|
|
9
|
my ($self) = @_; |
|
52
|
3
|
|
|
|
|
88
|
my $out = $self->output_base_filename; |
|
53
|
3
|
|
|
|
|
10
|
chomp $out; |
|
54
|
|
|
|
|
|
|
|
|
55
|
3
|
|
|
|
|
5
|
my %output_file_handles; |
|
56
|
3
|
|
|
|
|
5
|
for my $sequence_name ( @{ $self->_sequence_names } ) { |
|
|
3
|
|
|
|
|
61
|
|
|
57
|
0
|
|
|
|
|
0
|
my $file_sequence_name = $sequence_name; |
|
58
|
0
|
|
|
|
|
0
|
$file_sequence_name =~ s/[^\w\d\.]/_/g; |
|
59
|
0
|
|
|
|
|
0
|
my $cmd = "gzip > $out.$file_sequence_name.insert_site_plot.gz"; |
|
60
|
0
|
0
|
|
|
|
0
|
open( $output_file_handles{$sequence_name}, '|-', $cmd ) |
|
61
|
|
|
|
|
|
|
|| Bio::Tradis::Analysis::Exceptions::FailedToCreateOutputFileHandle |
|
62
|
|
|
|
|
|
|
->throw( error => |
|
63
|
|
|
|
|
|
|
"Couldnt create output file handle for saving insertsite plot results for " |
|
64
|
|
|
|
|
|
|
. $sequence_name . " in " |
|
65
|
|
|
|
|
|
|
. $self->filename |
|
66
|
|
|
|
|
|
|
. " and output base " |
|
67
|
|
|
|
|
|
|
. $self->output_base_filename ); |
|
68
|
|
|
|
|
|
|
} |
|
69
|
|
|
|
|
|
|
|
|
70
|
3
|
|
|
|
|
63
|
return \%output_file_handles; |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub _number_of_forward_reads { |
|
74
|
0
|
|
|
0
|
|
0
|
my ( $self, $sequence_name, $read_coord ) = @_; |
|
75
|
0
|
|
|
|
|
0
|
return $self->_number_of_reads( $sequence_name, $read_coord, 1 ); |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub _number_of_reverse_reads { |
|
79
|
0
|
|
|
0
|
|
0
|
my ( $self, $sequence_name, $read_coord ) = @_; |
|
80
|
0
|
|
|
|
|
0
|
return $self->_number_of_reads( $sequence_name, $read_coord, -1 ); |
|
81
|
|
|
|
|
|
|
} |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub _number_of_reads { |
|
84
|
0
|
|
|
0
|
|
0
|
my ( $self, $sequence_name, $read_coord, $direction ) = @_; |
|
85
|
0
|
0
|
0
|
|
|
0
|
if ( |
|
86
|
|
|
|
|
|
|
defined( |
|
87
|
|
|
|
|
|
|
$self->_frequency_of_read_start->{$sequence_name}{$read_coord} |
|
88
|
|
|
|
|
|
|
) |
|
89
|
|
|
|
|
|
|
&& defined( |
|
90
|
|
|
|
|
|
|
$self->_frequency_of_read_start->{$sequence_name}{$read_coord} |
|
91
|
|
|
|
|
|
|
{$direction} |
|
92
|
|
|
|
|
|
|
) |
|
93
|
|
|
|
|
|
|
) |
|
94
|
|
|
|
|
|
|
{ |
|
95
|
|
|
|
|
|
|
return $self->_frequency_of_read_start->{$sequence_name}{$read_coord} |
|
96
|
0
|
|
|
|
|
0
|
{$direction}; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
0
|
|
|
|
|
0
|
return 0; |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# work out if padding is needed and return it as a formatted string |
|
102
|
|
|
|
|
|
|
sub _create_padding_string { |
|
103
|
0
|
|
|
0
|
|
0
|
my ( $self, $previous_counter, $current_counter ) = @_; |
|
104
|
0
|
|
|
|
|
0
|
my $padding_string = ""; |
|
105
|
0
|
|
|
|
|
0
|
for ( my $i = $previous_counter + 1 ; $i < $current_counter ; $i++ ) { |
|
106
|
0
|
|
|
|
|
0
|
$padding_string .= "0 0\n"; |
|
107
|
|
|
|
|
|
|
} |
|
108
|
0
|
|
|
|
|
0
|
return $padding_string; |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub _print_padding_at_end_of_sequence { |
|
112
|
3
|
|
|
3
|
|
12
|
my ($self) = @_; |
|
113
|
3
|
|
|
|
|
7
|
for my $sequence_name ( @{ $self->_sequence_names } ) { |
|
|
3
|
|
|
|
|
78
|
|
|
114
|
|
|
|
|
|
|
my $sequence_length = |
|
115
|
0
|
|
|
|
|
0
|
$self->_sequence_information->{$sequence_name}->{'LN'}; |
|
116
|
0
|
0
|
|
|
|
0
|
next unless ( $sequence_length =~ /^[\d]+$/ ); |
|
117
|
0
|
|
|
|
|
0
|
$sequence_length++; |
|
118
|
|
|
|
|
|
|
my $padding_string = |
|
119
|
|
|
|
|
|
|
$self->_create_padding_string( |
|
120
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name}, |
|
121
|
|
|
|
|
|
|
$sequence_length ); |
|
122
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name} = $sequence_length; |
|
123
|
0
|
|
|
|
|
0
|
print { $self->_output_file_handles->{$sequence_name} } $padding_string; |
|
|
0
|
|
|
|
|
0
|
|
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub _close_output_file_handles { |
|
128
|
3
|
|
|
3
|
|
10
|
my ($self) = @_; |
|
129
|
3
|
|
|
|
|
6
|
for my $output_file_handle ( values %{ $self->_output_file_handles } ) { |
|
|
3
|
|
|
|
|
72
|
|
|
130
|
0
|
|
|
|
|
0
|
close($output_file_handle); |
|
131
|
|
|
|
|
|
|
} |
|
132
|
3
|
|
|
|
|
7
|
return; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub _build__frequency_of_read_start { |
|
136
|
3
|
|
|
3
|
|
9
|
my ($self) = @_; |
|
137
|
3
|
|
|
|
|
6
|
my %frequency_of_read_start; |
|
138
|
3
|
|
|
|
|
66
|
my $samtools_command = join(' ', ($self->samtools_exec, 'view', '-F', 4, '-q', $self->mapping_score, $self->filename)); |
|
139
|
|
|
|
|
|
|
|
|
140
|
3
|
|
|
|
|
5505
|
open(my $samtools_view_fh,"-|" ,$samtools_command); |
|
141
|
3
|
|
|
|
|
251
|
while(<$samtools_view_fh>) |
|
142
|
|
|
|
|
|
|
{ |
|
143
|
0
|
|
|
|
|
0
|
my $sam_line = $_; |
|
144
|
|
|
|
|
|
|
|
|
145
|
0
|
|
|
|
|
0
|
my @read_details = split("\t", $sam_line); |
|
146
|
0
|
|
|
|
|
0
|
my $seqid = $read_details[2]; |
|
147
|
0
|
|
|
|
|
0
|
my $cigar_parser = Bio::Tradis::Parser::Cigar->new(cigar => $read_details[5], coordinate => $read_details[3]); |
|
148
|
0
|
|
|
|
|
0
|
my $strand = 1; |
|
149
|
0
|
0
|
|
|
|
0
|
$strand = -1 if(($read_details[1] & 0x10) == 0x10); |
|
150
|
|
|
|
|
|
|
|
|
151
|
0
|
0
|
|
|
|
0
|
if ( $strand == 1 ) { |
|
152
|
|
|
|
|
|
|
$frequency_of_read_start{$seqid}{ $cigar_parser->start } |
|
153
|
0
|
|
|
|
|
0
|
{ $strand }++; |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
else { |
|
156
|
|
|
|
|
|
|
$frequency_of_read_start{$seqid}{ $cigar_parser->end } |
|
157
|
0
|
|
|
|
|
0
|
{ $strand }++; |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
} |
|
160
|
3
|
|
|
|
|
192
|
return \%frequency_of_read_start; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub create_plots { |
|
164
|
3
|
|
|
3
|
0
|
7303
|
my ($self) = @_; |
|
165
|
3
|
|
|
|
|
7
|
my %read_starts = %{ $self->_frequency_of_read_start }; |
|
|
3
|
|
|
|
|
81
|
|
|
166
|
3
|
|
|
|
|
13
|
for my $sequence_name ( keys %read_starts ) { |
|
167
|
0
|
|
|
|
|
0
|
my %sequence_read_coords = %{ $read_starts{$sequence_name} }; |
|
|
0
|
|
|
|
|
0
|
|
|
168
|
0
|
|
|
|
|
0
|
for my $read_coord ( sort { $a <=> $b } ( keys %sequence_read_coords ) ) |
|
|
0
|
|
|
|
|
0
|
|
|
169
|
|
|
|
|
|
|
{ |
|
170
|
|
|
|
|
|
|
my $padding_string = |
|
171
|
|
|
|
|
|
|
$self->_create_padding_string( |
|
172
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name}, $read_coord ); |
|
173
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name} = $read_coord; |
|
174
|
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
0
|
my $forward_reads = |
|
176
|
|
|
|
|
|
|
$self->_number_of_forward_reads( $sequence_name, $read_coord ); |
|
177
|
0
|
|
|
|
|
0
|
my $reverse_reads = |
|
178
|
|
|
|
|
|
|
$self->_number_of_reverse_reads( $sequence_name, $read_coord ); |
|
179
|
|
|
|
|
|
|
|
|
180
|
0
|
|
|
|
|
0
|
print { $self->_output_file_handles->{$sequence_name} } |
|
|
0
|
|
|
|
|
0
|
|
|
181
|
|
|
|
|
|
|
$padding_string . $forward_reads . " " . $reverse_reads . "\n"; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
} |
|
184
|
|
|
|
|
|
|
|
|
185
|
3
|
|
|
|
|
16
|
$self->_print_padding_at_end_of_sequence; |
|
186
|
3
|
|
|
|
|
15
|
$self->_close_output_file_handles; |
|
187
|
3
|
|
|
|
|
21
|
return 1; |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
1; |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
__END__ |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=pod |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=encoding UTF-8 |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=head1 NAME |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Bio::Tradis::Analysis::InsertSite - Take in a bam file and plot the start position of each read |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=head1 VERSION |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
version 1.3.2 |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Takes in a mapped BAM file and plot the start position of each read |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
use Bio::Tradis::Analysis::InsertSite; |
|
211
|
|
|
|
|
|
|
my $insertsite_plots_from_bam = Bio::Tradis::Analysis::InsertSite->new( |
|
212
|
|
|
|
|
|
|
filename => 'my_file.bam', |
|
213
|
|
|
|
|
|
|
output_base_filename => 'my_output_file' |
|
214
|
|
|
|
|
|
|
); |
|
215
|
|
|
|
|
|
|
$insertsite_plots_from_bam->create_plots(); |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head1 NAME |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
InsertSite.pm - Take in a bam file and plot the start position of each read |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 AUTHOR |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Carla Cummins <path-help@sanger.ac.uk> |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
This is free software, licensed under: |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut |