line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Tradis::Analysis::InsertSite; |
2
|
|
|
|
|
|
|
# ABSTRACT: Take in a bam file and plot the start position of each read |
3
|
|
|
|
|
|
|
$Bio::Tradis::Analysis::InsertSite::VERSION = '1.3.3'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
90001
|
use Moose; |
|
4
|
|
|
|
|
411974
|
|
|
4
|
|
|
|
|
25
|
|
7
|
4
|
|
|
4
|
|
26240
|
use Bio::Tradis::Parser::Bam; |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
178
|
|
8
|
4
|
|
|
4
|
|
1944
|
use Bio::Tradis::Parser::Cigar; |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
3925
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
has 'filename' => ( is => 'rw', isa => 'Str', required => 1 ); |
11
|
|
|
|
|
|
|
has 'output_base_filename' => ( is => 'rw', isa => 'Str', required => 1 ); |
12
|
|
|
|
|
|
|
has 'mapping_score' => ( is => 'ro', isa => 'Int', required => 1 ); |
13
|
|
|
|
|
|
|
has 'samtools_exec' => ( is => 'ro', isa => 'Str', default => 'samtools' ); |
14
|
|
|
|
|
|
|
has '_output_file_handles' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 ); |
15
|
|
|
|
|
|
|
has '_sequence_names' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 ); |
16
|
|
|
|
|
|
|
has '_sequence_base_counters' => |
17
|
|
|
|
|
|
|
( is => 'rw', isa => 'HashRef', lazy_build => 1 ); |
18
|
|
|
|
|
|
|
has '_sequence_information' => |
19
|
|
|
|
|
|
|
( is => 'rw', isa => 'HashRef', lazy_build => 1 ); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
has '_frequency_of_read_start' => ( |
22
|
|
|
|
|
|
|
is => 'rw', |
23
|
|
|
|
|
|
|
isa => 'HashRef', |
24
|
|
|
|
|
|
|
lazy => 1, |
25
|
|
|
|
|
|
|
builder => '_build__frequency_of_read_start' |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub _build__sequence_information { |
29
|
3
|
|
|
3
|
|
11
|
my ($self) = @_; |
30
|
3
|
|
|
|
|
73
|
my %all_sequences_info = |
31
|
|
|
|
|
|
|
Bio::Tradis::Parser::Bam->new( file => $self->filename, samtools_exec => $self->samtools_exec )->seq_info; |
32
|
3
|
|
|
|
|
239
|
return \%all_sequences_info; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub _build__sequence_names { |
36
|
3
|
|
|
3
|
|
11
|
my ($self) = @_; |
37
|
3
|
|
|
|
|
8
|
my @sequence_names = keys %{ $self->_sequence_information }; |
|
3
|
|
|
|
|
80
|
|
38
|
3
|
|
|
|
|
77
|
return \@sequence_names; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub _build__sequence_base_counters { |
42
|
0
|
|
|
0
|
|
0
|
my ($self) = @_; |
43
|
0
|
|
|
|
|
0
|
my %sequence_base_counters; |
44
|
0
|
|
|
|
|
0
|
for my $sequence_name ( @{ $self->_sequence_names } ) { |
|
0
|
|
|
|
|
0
|
|
45
|
0
|
|
|
|
|
0
|
$sequence_base_counters{$sequence_name} = 0; |
46
|
|
|
|
|
|
|
} |
47
|
0
|
|
|
|
|
0
|
return \%sequence_base_counters; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _build__output_file_handles { |
51
|
3
|
|
|
3
|
|
10
|
my ($self) = @_; |
52
|
3
|
|
|
|
|
87
|
my $out = $self->output_base_filename; |
53
|
3
|
|
|
|
|
11
|
chomp $out; |
54
|
|
|
|
|
|
|
|
55
|
3
|
|
|
|
|
7
|
my %output_file_handles; |
56
|
3
|
|
|
|
|
8
|
for my $sequence_name ( @{ $self->_sequence_names } ) { |
|
3
|
|
|
|
|
68
|
|
57
|
0
|
|
|
|
|
0
|
my $file_sequence_name = $sequence_name; |
58
|
0
|
|
|
|
|
0
|
$file_sequence_name =~ s/[^\w\d\.]/_/g; |
59
|
0
|
|
|
|
|
0
|
my $cmd = "gzip > $out.$file_sequence_name.insert_site_plot.gz"; |
60
|
0
|
0
|
|
|
|
0
|
open( $output_file_handles{$sequence_name}, '|-', $cmd ) |
61
|
|
|
|
|
|
|
|| Bio::Tradis::Analysis::Exceptions::FailedToCreateOutputFileHandle |
62
|
|
|
|
|
|
|
->throw( error => |
63
|
|
|
|
|
|
|
"Couldnt create output file handle for saving insertsite plot results for " |
64
|
|
|
|
|
|
|
. $sequence_name . " in " |
65
|
|
|
|
|
|
|
. $self->filename |
66
|
|
|
|
|
|
|
. " and output base " |
67
|
|
|
|
|
|
|
. $self->output_base_filename ); |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
3
|
|
|
|
|
65
|
return \%output_file_handles; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub _number_of_forward_reads { |
74
|
0
|
|
|
0
|
|
0
|
my ( $self, $sequence_name, $read_coord ) = @_; |
75
|
0
|
|
|
|
|
0
|
return $self->_number_of_reads( $sequence_name, $read_coord, 1 ); |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub _number_of_reverse_reads { |
79
|
0
|
|
|
0
|
|
0
|
my ( $self, $sequence_name, $read_coord ) = @_; |
80
|
0
|
|
|
|
|
0
|
return $self->_number_of_reads( $sequence_name, $read_coord, -1 ); |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub _number_of_reads { |
84
|
0
|
|
|
0
|
|
0
|
my ( $self, $sequence_name, $read_coord, $direction ) = @_; |
85
|
0
|
0
|
0
|
|
|
0
|
if ( |
86
|
|
|
|
|
|
|
defined( |
87
|
|
|
|
|
|
|
$self->_frequency_of_read_start->{$sequence_name}{$read_coord} |
88
|
|
|
|
|
|
|
) |
89
|
|
|
|
|
|
|
&& defined( |
90
|
|
|
|
|
|
|
$self->_frequency_of_read_start->{$sequence_name}{$read_coord} |
91
|
|
|
|
|
|
|
{$direction} |
92
|
|
|
|
|
|
|
) |
93
|
|
|
|
|
|
|
) |
94
|
|
|
|
|
|
|
{ |
95
|
|
|
|
|
|
|
return $self->_frequency_of_read_start->{$sequence_name}{$read_coord} |
96
|
0
|
|
|
|
|
0
|
{$direction}; |
97
|
|
|
|
|
|
|
} |
98
|
0
|
|
|
|
|
0
|
return 0; |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# work out if padding is needed and return it as a formatted string |
102
|
|
|
|
|
|
|
sub _create_padding_string { |
103
|
0
|
|
|
0
|
|
0
|
my ( $self, $previous_counter, $current_counter ) = @_; |
104
|
0
|
|
|
|
|
0
|
my $padding_string = ""; |
105
|
0
|
|
|
|
|
0
|
for ( my $i = $previous_counter + 1 ; $i < $current_counter ; $i++ ) { |
106
|
0
|
|
|
|
|
0
|
$padding_string .= "0 0\n"; |
107
|
|
|
|
|
|
|
} |
108
|
0
|
|
|
|
|
0
|
return $padding_string; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub _print_padding_at_end_of_sequence { |
112
|
3
|
|
|
3
|
|
11
|
my ($self) = @_; |
113
|
3
|
|
|
|
|
8
|
for my $sequence_name ( @{ $self->_sequence_names } ) { |
|
3
|
|
|
|
|
87
|
|
114
|
|
|
|
|
|
|
my $sequence_length = |
115
|
0
|
|
|
|
|
0
|
$self->_sequence_information->{$sequence_name}->{'LN'}; |
116
|
0
|
0
|
|
|
|
0
|
next unless ( $sequence_length =~ /^[\d]+$/ ); |
117
|
0
|
|
|
|
|
0
|
$sequence_length++; |
118
|
|
|
|
|
|
|
my $padding_string = |
119
|
|
|
|
|
|
|
$self->_create_padding_string( |
120
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name}, |
121
|
|
|
|
|
|
|
$sequence_length ); |
122
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name} = $sequence_length; |
123
|
0
|
|
|
|
|
0
|
print { $self->_output_file_handles->{$sequence_name} } $padding_string; |
|
0
|
|
|
|
|
0
|
|
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub _close_output_file_handles { |
128
|
3
|
|
|
3
|
|
9
|
my ($self) = @_; |
129
|
3
|
|
|
|
|
9
|
for my $output_file_handle ( values %{ $self->_output_file_handles } ) { |
|
3
|
|
|
|
|
76
|
|
130
|
0
|
|
|
|
|
0
|
close($output_file_handle); |
131
|
|
|
|
|
|
|
} |
132
|
3
|
|
|
|
|
8
|
return; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub _build__frequency_of_read_start { |
136
|
3
|
|
|
3
|
|
7
|
my ($self) = @_; |
137
|
3
|
|
|
|
|
7
|
my %frequency_of_read_start; |
138
|
3
|
|
|
|
|
72
|
my $samtools_command = join(' ', ($self->samtools_exec, 'view', '-F', 4, '-q', $self->mapping_score, $self->filename)); |
139
|
|
|
|
|
|
|
|
140
|
3
|
|
|
|
|
5842
|
open(my $samtools_view_fh,"-|" ,$samtools_command); |
141
|
3
|
|
|
|
|
266
|
while(<$samtools_view_fh>) |
142
|
|
|
|
|
|
|
{ |
143
|
0
|
|
|
|
|
0
|
my $sam_line = $_; |
144
|
|
|
|
|
|
|
|
145
|
0
|
|
|
|
|
0
|
my @read_details = split("\t", $sam_line); |
146
|
0
|
|
|
|
|
0
|
my $seqid = $read_details[2]; |
147
|
0
|
|
|
|
|
0
|
my $cigar_parser = Bio::Tradis::Parser::Cigar->new(cigar => $read_details[5], coordinate => $read_details[3]); |
148
|
0
|
|
|
|
|
0
|
my $strand = 1; |
149
|
0
|
0
|
|
|
|
0
|
$strand = -1 if(($read_details[1] & 0x10) == 0x10); |
150
|
|
|
|
|
|
|
|
151
|
0
|
0
|
|
|
|
0
|
if ( $strand == 1 ) { |
152
|
|
|
|
|
|
|
$frequency_of_read_start{$seqid}{ $cigar_parser->start } |
153
|
0
|
|
|
|
|
0
|
{ $strand }++; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
else { |
156
|
|
|
|
|
|
|
$frequency_of_read_start{$seqid}{ $cigar_parser->end } |
157
|
0
|
|
|
|
|
0
|
{ $strand }++; |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
} |
160
|
3
|
|
|
|
|
206
|
return \%frequency_of_read_start; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub create_plots { |
164
|
3
|
|
|
3
|
0
|
7363
|
my ($self) = @_; |
165
|
3
|
|
|
|
|
7
|
my %read_starts = %{ $self->_frequency_of_read_start }; |
|
3
|
|
|
|
|
84
|
|
166
|
3
|
|
|
|
|
17
|
for my $sequence_name ( keys %read_starts ) { |
167
|
0
|
|
|
|
|
0
|
my %sequence_read_coords = %{ $read_starts{$sequence_name} }; |
|
0
|
|
|
|
|
0
|
|
168
|
0
|
|
|
|
|
0
|
for my $read_coord ( sort { $a <=> $b } ( keys %sequence_read_coords ) ) |
|
0
|
|
|
|
|
0
|
|
169
|
|
|
|
|
|
|
{ |
170
|
|
|
|
|
|
|
my $padding_string = |
171
|
|
|
|
|
|
|
$self->_create_padding_string( |
172
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name}, $read_coord ); |
173
|
0
|
|
|
|
|
0
|
$self->_sequence_base_counters->{$sequence_name} = $read_coord; |
174
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
0
|
my $forward_reads = |
176
|
|
|
|
|
|
|
$self->_number_of_forward_reads( $sequence_name, $read_coord ); |
177
|
0
|
|
|
|
|
0
|
my $reverse_reads = |
178
|
|
|
|
|
|
|
$self->_number_of_reverse_reads( $sequence_name, $read_coord ); |
179
|
|
|
|
|
|
|
|
180
|
0
|
|
|
|
|
0
|
print { $self->_output_file_handles->{$sequence_name} } |
|
0
|
|
|
|
|
0
|
|
181
|
|
|
|
|
|
|
$padding_string . $forward_reads . " " . $reverse_reads . "\n"; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
3
|
|
|
|
|
18
|
$self->_print_padding_at_end_of_sequence; |
186
|
3
|
|
|
|
|
15
|
$self->_close_output_file_handles; |
187
|
3
|
|
|
|
|
28
|
return 1; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
1; |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
__END__ |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=pod |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=encoding UTF-8 |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=head1 NAME |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Bio::Tradis::Analysis::InsertSite - Take in a bam file and plot the start position of each read |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=head1 VERSION |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
version 1.3.3 |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head1 SYNOPSIS |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Takes in a mapped BAM file and plot the start position of each read |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
use Bio::Tradis::Analysis::InsertSite; |
211
|
|
|
|
|
|
|
my $insertsite_plots_from_bam = Bio::Tradis::Analysis::InsertSite->new( |
212
|
|
|
|
|
|
|
filename => 'my_file.bam', |
213
|
|
|
|
|
|
|
output_base_filename => 'my_output_file' |
214
|
|
|
|
|
|
|
); |
215
|
|
|
|
|
|
|
$insertsite_plots_from_bam->create_plots(); |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head1 NAME |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
InsertSite.pm - Take in a bam file and plot the start position of each read |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 AUTHOR |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Carla Cummins <path-help@sanger.ac.uk> |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
This is free software, licensed under: |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut |