line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package FASTX::Reader; |
2
|
39
|
|
|
39
|
|
2559181
|
use 5.012; |
|
39
|
|
|
|
|
507
|
|
3
|
39
|
|
|
39
|
|
210
|
use warnings; |
|
39
|
|
|
|
|
74
|
|
|
39
|
|
|
|
|
1427
|
|
4
|
39
|
|
|
39
|
|
219
|
use Carp qw(confess); |
|
39
|
|
|
|
|
75
|
|
|
39
|
|
|
|
|
1773
|
|
5
|
39
|
|
|
39
|
|
19528
|
use Data::Dumper; |
|
39
|
|
|
|
|
204255
|
|
|
39
|
|
|
|
|
2141
|
|
6
|
39
|
|
|
39
|
|
18002
|
use PerlIO::encoding; |
|
39
|
|
|
|
|
438915
|
|
|
39
|
|
|
|
|
1488
|
|
7
|
|
|
|
|
|
|
$Data::Dumper::Sortkeys = 1; |
8
|
39
|
|
|
39
|
|
18215
|
use FASTX::Seq; |
|
39
|
|
|
|
|
110
|
|
|
39
|
|
|
|
|
1820
|
|
9
|
39
|
|
|
39
|
|
258
|
use File::Basename; |
|
39
|
|
|
|
|
84
|
|
|
39
|
|
|
|
|
3674
|
|
10
|
|
|
|
|
|
|
$FASTX::Reader::VERSION = '1.12.1'; |
11
|
|
|
|
|
|
|
require Exporter; |
12
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
#ABSTRACT: A simple module to parse FASTA and FASTQ files, supporting compressed files and paired-ends. |
15
|
|
|
|
|
|
|
|
16
|
39
|
|
|
39
|
|
252
|
use constant GZIP_SIGNATURE => pack('C3', 0x1f, 0x8b, 0x08); |
|
39
|
|
|
|
|
106
|
|
|
39
|
|
|
|
|
115251
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub new { |
20
|
|
|
|
|
|
|
# Instantiate object |
21
|
73
|
|
|
73
|
1
|
28432
|
my $class = shift @_; |
22
|
73
|
|
|
|
|
241
|
my $self = bless {} => $class; |
23
|
73
|
|
|
|
|
171
|
my $args = {}; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Named parameters: undefined $_[0] will read STDIN! |
26
|
73
|
100
|
100
|
|
|
792
|
if (defined $_[0] and substr($_[0], 0, 1) eq '-') { |
27
|
9
|
|
|
|
|
34
|
my %data = @_; |
28
|
|
|
|
|
|
|
# Try parsing |
29
|
9
|
|
|
|
|
35
|
for my $i (keys %data) { |
30
|
13
|
100
|
|
|
|
97
|
if ($i =~ /^-(file|filename)/i) { |
|
|
100
|
|
|
|
|
|
31
|
8
|
|
|
|
|
33
|
$args->{filename} = $data{$i}; |
32
|
|
|
|
|
|
|
} elsif ($i =~ /^-(loadseqs)/i) { |
33
|
4
|
|
|
|
|
13
|
$args->{loadseqs} = $data{$i}; |
34
|
|
|
|
|
|
|
} else { |
35
|
1
|
|
|
|
|
250
|
confess "[FASTX::Reader]: Unknown parameter `$i`\n"; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
} else { |
39
|
|
|
|
|
|
|
# Legacy instantiation |
40
|
64
|
|
|
|
|
221
|
($args) = @_; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
45
|
72
|
100
|
|
|
|
265
|
if (defined $args->{loadseqs}) { |
46
|
6
|
100
|
66
|
|
|
46
|
if ($args->{loadseqs} eq 'name' or $args->{loadseqs} eq 'names' ) { |
|
|
100
|
66
|
|
|
|
|
|
|
50
|
66
|
|
|
|
|
47
|
4
|
|
|
|
|
9
|
$args->{loadseqs} = 'name'; |
48
|
|
|
|
|
|
|
} elsif ($args->{loadseqs} eq 'seq' or $args->{loadseqs} eq 'seqs' or $args->{loadseqs} eq "1") { |
49
|
1
|
|
|
|
|
3
|
$args->{loadseqs} = 'seq'; |
50
|
|
|
|
|
|
|
} elsif ($args->{loadseqs} eq 'records') { |
51
|
1
|
|
|
|
|
2
|
$args->{loadseqs} = 'records'; |
52
|
|
|
|
|
|
|
} else { |
53
|
0
|
|
|
|
|
0
|
confess("attribute should be 'name' or 'seq' to specify the key of the hash."); |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
72
|
|
|
|
|
360
|
$self->{filename} = $args->{filename}; |
58
|
72
|
|
|
|
|
196
|
$self->{loadseqs} = $args->{loadseqs}; |
59
|
72
|
|
|
|
|
216
|
$self->{aux} = [undef]; |
60
|
72
|
|
|
|
|
171
|
$self->{compressed} = 0; |
61
|
72
|
|
|
|
|
141
|
$self->{fh} = undef; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Check if a filename was provided and not {{STDIN}} |
65
|
|
|
|
|
|
|
# uncoverable branch false |
66
|
|
|
|
|
|
|
|
67
|
72
|
50
|
66
|
|
|
1587
|
if ( defined $self->{filename} and -d $self->{filename} ) { |
68
|
0
|
|
|
|
|
0
|
confess __PACKAGE__, " Directory provide where file was expected\n"; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
72
|
100
|
66
|
|
|
631
|
if (defined $self->{filename} and $self->{filename} ne '{{STDIN}}') { |
72
|
70
|
100
|
|
|
|
4167
|
open my $initial_fh, '<', $self->{filename} or confess "Unable to read file ", $self->{filename}, "\n"; |
73
|
66
|
|
|
|
|
1915
|
read( $initial_fh, my $magic_byte, 4 ); |
74
|
66
|
|
|
|
|
1026
|
close $initial_fh; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# See: __BioX::Seq::Stream__ for GZIP (and other) compressed file reader |
77
|
66
|
100
|
|
|
|
1927
|
if (substr($magic_byte,0,3) eq GZIP_SIGNATURE) { |
|
|
50
|
|
|
|
|
|
78
|
33
|
|
|
|
|
98
|
$self->{compressed} = 1; |
79
|
33
|
|
|
|
|
205
|
our $GZIP_BIN = _which('pigz', 'gzip'); |
80
|
|
|
|
|
|
|
#close $fh; |
81
|
33
|
50
|
|
|
|
11782
|
if (! defined $GZIP_BIN) { |
82
|
0
|
|
|
|
|
0
|
$self->{decompressor} = 'IO::Uncompress::Gunzip'; |
83
|
0
|
|
|
|
|
0
|
require IO::Uncompress::Gunzip; |
84
|
0
|
|
|
|
|
0
|
my $fh = IO::Uncompress::Gunzip->new($self->{filename}, MultiStream => 1); |
85
|
0
|
|
|
|
|
0
|
$self->{fh} = $fh; |
86
|
|
|
|
|
|
|
} else { |
87
|
33
|
|
|
|
|
199
|
$self->{decompressor} = $GZIP_BIN; |
88
|
33
|
50
|
|
|
|
84730
|
open my $fh, '-|', "$GZIP_BIN -dc $self->{filename}" or confess "Error opening gzip file ", $self->{filename}, ": $!\n"; |
89
|
33
|
|
|
|
|
1686
|
$self->{fh} = $fh; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
} elsif (-B $self->{filename}) { |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# BINARY FILE NOT SUPPORTED? |
94
|
|
|
|
|
|
|
#close $fh; |
95
|
0
|
|
|
|
|
0
|
$self->{fh} = undef; |
96
|
0
|
|
|
|
|
0
|
$self->{status} = 1; |
97
|
0
|
|
|
|
|
0
|
$self->{message} = 'Binary file not supported'; |
98
|
|
|
|
|
|
|
} else { |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
#close $fh; |
101
|
33
|
50
|
|
|
|
1398
|
open (my $fh, '<:encoding(utf8):crlf', $self->{filename}) or confess "Unable to read file ", $self->{filename}, ": ", $!, "\n"; |
102
|
33
|
|
|
|
|
2870
|
$self->{fh} = $fh; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
} else { |
107
|
|
|
|
|
|
|
# Filename not provided, use STDIN |
108
|
2
|
|
|
|
|
6
|
$self->{fh} = \*STDIN; |
109
|
2
|
50
|
|
|
|
10
|
if ($self->{loadseqs}) { |
110
|
0
|
|
|
|
|
0
|
confess("Load sequences not supported for STDIN"); |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
117
|
68
|
100
|
|
|
|
524
|
if ($self->{loadseqs}) { |
118
|
6
|
|
|
|
|
39
|
_load_seqs($self); |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
68
|
|
|
|
|
1545
|
return $self; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub records { |
127
|
1
|
|
|
1
|
1
|
1209
|
my $self = shift; |
128
|
1
|
50
|
|
|
|
5
|
confess "No records loaded with -loadseqs => records!\n" unless $self->{loadseqs} eq 'records'; |
129
|
1
|
|
|
|
|
3
|
return $self->{seqs}; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub getRead { |
134
|
181
|
|
|
181
|
1
|
17002
|
my $self = shift; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
#@::::::: ::: |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
|
139
|
181
|
50
|
33
|
|
|
628
|
return if (defined $self->{status} and $self->{status} == 0); |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
#my $aux = $self->{aux}; |
142
|
181
|
|
|
|
|
298
|
my $sequence_data; |
143
|
181
|
50
|
|
|
|
254
|
@{ $self->{aux} } = [undef, 0] if (!(@{ $self->{aux} })); |
|
0
|
|
|
|
|
0
|
|
|
181
|
|
|
|
|
546
|
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
146
|
181
|
100
|
|
|
|
504
|
if ($self->{aux}->[1]) { |
147
|
15
|
|
|
|
|
38
|
$self->{return_0}++; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
181
|
100
|
|
|
|
493
|
if (!defined($self->{aux}->[0])) { |
151
|
117
|
|
|
|
|
12663
|
while ($self->{line} = readline($self->{fh})) { |
152
|
|
|
|
|
|
|
|
153
|
101
|
|
|
|
|
717
|
chomp($self->{line}); |
154
|
101
|
100
|
100
|
|
|
908
|
if (substr($self->{line}, 0, 1) eq '>' || substr($self->{line}, 0, 1) eq '@') { |
155
|
95
|
|
|
|
|
318
|
$self->{aux}->[0] = $self->{line}; |
156
|
95
|
|
|
|
|
232
|
last; |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
} |
159
|
117
|
100
|
|
|
|
375
|
if (!defined($self->{aux}->[0])) { |
160
|
22
|
|
|
|
|
122
|
$self->{aux}->[1] = 1; |
161
|
22
|
|
|
|
|
47
|
$self->{return_1}++; |
162
|
22
|
|
|
|
|
75
|
return; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Comments can have more spaces: |
168
|
159
|
50
|
|
|
|
384
|
return unless defined $self->{line}; |
169
|
|
|
|
|
|
|
my ($name, $comm) = $self->{line}=~/^.(\S+)(?:\s+)(.+)/ ? ($1, $2) : |
170
|
159
|
50
|
|
|
|
2223
|
$self->{line}=~/^.(\S+)/ ? ($1, '') : ('?', ''); |
|
|
100
|
|
|
|
|
|
171
|
159
|
|
|
|
|
383
|
my $seq = ''; |
172
|
159
|
|
|
|
|
246
|
my $c; |
173
|
159
|
|
|
|
|
307
|
$self->{aux}->[0] = undef; |
174
|
159
|
|
|
|
|
742
|
while ($self->{line} = readline($self->{fh})) { |
175
|
|
|
|
|
|
|
# PARSE SEQx |
176
|
361
|
|
|
|
|
666
|
chomp($self->{line}); |
177
|
361
|
|
|
|
|
745
|
$c = substr($self->{line}, 0, 1); |
178
|
361
|
100
|
66
|
|
|
1816
|
last if ($c eq '>' || $c eq '@' || $c eq '+'); |
|
|
|
100
|
|
|
|
|
179
|
220
|
|
|
|
|
1067
|
$seq .= $self->{line}; |
180
|
|
|
|
|
|
|
} |
181
|
159
|
|
|
|
|
360
|
$self->{aux}->[0] = $self->{line}; |
182
|
159
|
100
|
|
|
|
392
|
$self->{aux}->[1] = 1 if (!defined($self->{aux}->[0])); |
183
|
159
|
|
|
|
|
561
|
$sequence_data->{name} = $name; |
184
|
159
|
|
|
|
|
459
|
$sequence_data->{comment} = $comm; |
185
|
159
|
|
|
|
|
482
|
$sequence_data->{seq} = $seq; |
186
|
159
|
|
|
|
|
504
|
$self->{counter}++; |
187
|
|
|
|
|
|
|
# Return FASTA |
188
|
159
|
100
|
|
|
|
439
|
if ($c ne '+') { |
189
|
94
|
|
|
|
|
213
|
$self->{return_fasta1}++; |
190
|
94
|
|
|
|
|
285
|
return $sequence_data; |
191
|
|
|
|
|
|
|
} |
192
|
65
|
|
|
|
|
162
|
my $qual = ''; |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
65
|
|
|
|
|
283
|
while ($self->{line} = readline($self->{fh})) { |
196
|
|
|
|
|
|
|
# PARSE QUALITY |
197
|
65
|
|
|
|
|
137
|
chomp($self->{line}); |
198
|
65
|
|
|
|
|
168
|
$qual .= $self->{line}; |
199
|
65
|
50
|
|
|
|
250
|
if (length($qual) >= length($seq)) { |
200
|
65
|
|
|
|
|
130
|
$self->{aux}->[0] = undef; |
201
|
65
|
|
|
|
|
151
|
$sequence_data->{name} = $name; |
202
|
65
|
|
|
|
|
257
|
$sequence_data->{seq} = $seq; |
203
|
65
|
|
|
|
|
235
|
$sequence_data->{comment} = $comm; |
204
|
65
|
|
|
|
|
217
|
$sequence_data->{qual} = $qual; |
205
|
|
|
|
|
|
|
# return FASTQ |
206
|
65
|
|
|
|
|
204
|
$self->{return_fastq}++; |
207
|
65
|
|
|
|
|
281
|
return $sequence_data; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
# PROCH |
211
|
0
|
|
|
|
|
0
|
close $self->{fh}; |
212
|
|
|
|
|
|
|
|
213
|
0
|
|
|
|
|
0
|
$self->{aux}->[1] = 1; |
214
|
0
|
|
|
|
|
0
|
$sequence_data->{name} = $name; |
215
|
0
|
|
|
|
|
0
|
$sequence_data->{seq} = $seq; |
216
|
0
|
|
|
|
|
0
|
$sequence_data->{comment} = $comm; |
217
|
0
|
|
|
|
|
0
|
$self->{counter}++; |
218
|
|
|
|
|
|
|
# return FASTA |
219
|
0
|
|
|
|
|
0
|
$self->{return_fasta2}++; |
220
|
0
|
|
|
|
|
0
|
return $sequence_data; |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub next { |
226
|
47
|
|
|
47
|
1
|
11404
|
my $self = shift; |
227
|
47
|
|
|
|
|
181
|
my $scalar_read = $self->getRead(); |
228
|
47
|
100
|
|
|
|
121
|
return unless defined $scalar_read; |
229
|
|
|
|
|
|
|
return FASTX::Seq->new( $scalar_read->{seq} // '', |
230
|
|
|
|
|
|
|
$scalar_read->{name} // undef, |
231
|
|
|
|
|
|
|
$scalar_read->{comment} // undef, |
232
|
42
|
|
50
|
|
|
762
|
$scalar_read->{qual} // undef); |
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub getFastqRead { |
238
|
7
|
|
|
7
|
1
|
2782
|
my $self = shift; |
239
|
7
|
|
|
|
|
18
|
my $seq_object = undef; |
240
|
|
|
|
|
|
|
|
241
|
7
|
50
|
66
|
|
|
44
|
return if (defined $self->{status} and $self->{status} == 0); |
242
|
|
|
|
|
|
|
|
243
|
7
|
|
|
|
|
21
|
$self->{status} = 1; |
244
|
7
|
|
|
|
|
734
|
my $header = readline($self->{fh}); |
245
|
7
|
|
|
|
|
73
|
my $seq = readline($self->{fh}); |
246
|
7
|
|
|
|
|
23
|
my $check = readline($self->{fh}); |
247
|
7
|
|
|
|
|
30
|
my $qual = readline($self->{fh}); |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Check 4 lines were found (FASTQ) |
251
|
|
|
|
|
|
|
|
252
|
7
|
100
|
|
|
|
26
|
unless (defined $qual) { |
253
|
1
|
50
|
|
|
|
5
|
if (defined $header) { |
254
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: FASTQ truncated at " . $header . "?"; |
255
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
256
|
|
|
|
|
|
|
} |
257
|
1
|
|
|
|
|
4
|
return; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Fast format control: header and separator |
261
|
6
|
100
|
66
|
|
|
85
|
if ( (substr($header, 0, 1) eq '@') and (substr($check, 0, 1) eq '+') ) { |
262
|
5
|
|
|
|
|
26
|
chomp($header); |
263
|
5
|
|
|
|
|
22
|
chomp($seq); |
264
|
5
|
|
|
|
|
9
|
chomp($qual); |
265
|
|
|
|
|
|
|
# Also control sequence integrity |
266
|
5
|
100
|
66
|
|
|
79
|
if ($seq=~/^[ACGTNacgtn]+$/ and length($seq) == length($qual) ) { |
267
|
4
|
|
|
|
|
31
|
my ($name, $comments) = split /\s+/, substr($header, 1); |
268
|
4
|
|
|
|
|
17
|
$seq_object->{name} = $name; |
269
|
4
|
|
|
|
|
9
|
$seq_object->{comments} = $comments; |
270
|
4
|
|
|
|
|
14
|
$seq_object->{seq} = $seq; |
271
|
4
|
|
|
|
|
17
|
$seq_object->{qual} = $qual; |
272
|
4
|
|
|
|
|
17
|
$self->{counter}++; |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
} else { |
275
|
|
|
|
|
|
|
# Return error (corrupted FASTQ) |
276
|
1
|
|
|
|
|
3
|
$self->{message} = "Unknown format: expecting FASTQ (corrupted?)"; |
277
|
1
|
|
|
|
|
2
|
$self->{status} = 0; |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
} else { |
281
|
|
|
|
|
|
|
# Return error (not a FASTQ) |
282
|
1
|
|
|
|
|
2
|
$self->{message} = "Unknown format: expecting FASTQ but @ header not found"; |
283
|
|
|
|
|
|
|
|
284
|
1
|
50
|
|
|
|
4
|
if (substr($header, 0,1 ) eq '>' ) { |
285
|
|
|
|
|
|
|
# HINT: is FASTA? |
286
|
1
|
|
|
|
|
3
|
$self->{message} .= " (might be FASTA instead)"; |
287
|
|
|
|
|
|
|
} |
288
|
1
|
|
|
|
|
3
|
$self->{status} = 0; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
6
|
|
|
|
|
26
|
return $seq_object; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
sub getIlluminaRead { |
296
|
16
|
|
|
16
|
1
|
18385
|
my $self = shift; |
297
|
16
|
|
|
|
|
32
|
my $seq_object = undef; |
298
|
|
|
|
|
|
|
|
299
|
16
|
50
|
66
|
|
|
84
|
return if (defined $self->{status} and $self->{status} == 0); |
300
|
|
|
|
|
|
|
|
301
|
16
|
|
|
|
|
34
|
$self->{status} = 1; |
302
|
16
|
|
|
|
|
886
|
my $header = readline($self->{fh}); |
303
|
16
|
|
|
|
|
93
|
my $seq = readline($self->{fh}); |
304
|
16
|
|
|
|
|
49
|
my $check = readline($self->{fh}); |
305
|
16
|
|
|
|
|
52
|
my $qual = readline($self->{fh}); |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Check 4 lines were found (FASTQ) |
309
|
|
|
|
|
|
|
|
310
|
16
|
100
|
|
|
|
55
|
unless (defined $qual) { |
311
|
2
|
50
|
|
|
|
9
|
if (defined $header) { |
312
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: FASTQ truncated at " . $header . "?"; |
313
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
314
|
|
|
|
|
|
|
} |
315
|
2
|
|
|
|
|
9
|
return; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Fast format control: header and separator |
319
|
14
|
50
|
33
|
|
|
121
|
if ( (substr($header, 0, 1) eq '@') and (substr($check, 0, 1) eq '+') ) { |
320
|
14
|
|
|
|
|
33
|
chomp($header); |
321
|
14
|
|
|
|
|
22
|
chomp($seq); |
322
|
14
|
|
|
|
|
21
|
chomp($qual); |
323
|
|
|
|
|
|
|
# Also control sequence integrity |
324
|
14
|
50
|
33
|
|
|
139
|
if ($seq=~/^[ACGTNacgtn]+$/ and length($seq) == length($qual) ) { |
325
|
14
|
|
|
|
|
91
|
my ($name, $comments) = split /\s+/, substr($header, 1); |
326
|
|
|
|
|
|
|
#@::::::: ::: |
327
|
14
|
|
|
|
|
70
|
my ($instrument, $run, $flowcell, $lane, $tile, $x, $y, $umi) = split /:/, $name; |
328
|
14
|
|
|
|
|
32
|
my ($read, $filtered, $ctrl, $index1, $index2); |
329
|
|
|
|
|
|
|
|
330
|
14
|
50
|
|
|
|
31
|
if (not defined $y) { |
331
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: not Illumina naming: ::::::"; |
332
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
14
|
|
|
|
|
47
|
$seq_object->{name} = $name; |
336
|
14
|
|
|
|
|
51
|
$seq_object->{comments} = $comments; |
337
|
14
|
|
|
|
|
35
|
$seq_object->{seq} = $seq; |
338
|
14
|
|
|
|
|
32
|
$seq_object->{qual} = $qual; |
339
|
|
|
|
|
|
|
|
340
|
14
|
|
|
|
|
25
|
$seq_object->{instrument} = $instrument; |
341
|
14
|
|
|
|
|
32
|
$seq_object->{run} = $run; |
342
|
14
|
|
|
|
|
21
|
$seq_object->{flowcell} = $flowcell; |
343
|
14
|
|
|
|
|
32
|
$seq_object->{lane} = $lane; |
344
|
14
|
|
|
|
|
26
|
$seq_object->{tile} = $tile; |
345
|
14
|
|
|
|
|
36
|
$seq_object->{x} = $x; |
346
|
14
|
|
|
|
|
35
|
$seq_object->{y} = $y; |
347
|
14
|
|
|
|
|
25
|
$seq_object->{umi} = $umi; |
348
|
14
|
|
|
|
|
28
|
$seq_object->{instrument} = $instrument; |
349
|
14
|
|
|
|
|
17
|
$seq_object->{run} = $run; |
350
|
14
|
|
|
|
|
25
|
$seq_object->{flowcell} = $flowcell; |
351
|
|
|
|
|
|
|
|
352
|
14
|
100
|
|
|
|
30
|
if (defined $comments) { |
353
|
7
|
|
|
|
|
40
|
($read, $filtered, $ctrl, $index1, $index2) = split /[:+]/, $comments; |
354
|
7
|
50
|
|
|
|
21
|
if ( defined $ctrl ) { |
355
|
7
|
|
|
|
|
19
|
$seq_object->{read} = $read; |
356
|
7
|
50
|
|
|
|
19
|
$filtered eq 'N' ? $seq_object->{filtered} = 0 : $seq_object->{filtered} = 1; |
357
|
7
|
|
|
|
|
14
|
$seq_object->{control} = $ctrl; |
358
|
7
|
50
|
|
|
|
9
|
if ($read eq '1') { |
359
|
7
|
|
|
|
|
22
|
$seq_object->{index} = $index1; |
360
|
7
|
|
|
|
|
17
|
$seq_object->{paired_index} = $index2; |
361
|
|
|
|
|
|
|
} else { |
362
|
0
|
|
|
|
|
0
|
$seq_object->{paired_index} = $index1; |
363
|
0
|
|
|
|
|
0
|
$seq_object->{index} = $index2; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
} |
367
|
14
|
|
|
|
|
36
|
$self->{counter}++; |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
} else { |
370
|
|
|
|
|
|
|
# Return error (corrupted FASTQ) |
371
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: expecting FASTQ (corrupted?)"; |
372
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
} else { |
376
|
|
|
|
|
|
|
# Return error (not a FASTQ) |
377
|
0
|
|
|
|
|
0
|
$self->{message} = "Unknown format: expecting FASTQ but @ header not found"; |
378
|
|
|
|
|
|
|
|
379
|
0
|
0
|
|
|
|
0
|
if (substr($header, 0,1 ) eq '>' ) { |
380
|
|
|
|
|
|
|
# HINT: is FASTA? |
381
|
0
|
|
|
|
|
0
|
$self->{message} .= " (might be FASTA instead)"; |
382
|
|
|
|
|
|
|
} |
383
|
0
|
|
|
|
|
0
|
$self->{status} = 0; |
384
|
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
|
|
386
|
14
|
|
|
|
|
45
|
return $seq_object; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
sub getFileFormat { |
390
|
4
|
|
|
4
|
1
|
1313
|
my $self = shift; |
391
|
4
|
|
|
|
|
8
|
my ($filename) = shift; |
392
|
4
|
50
|
|
|
|
11
|
return 0 if (not defined $filename); |
393
|
|
|
|
|
|
|
|
394
|
4
|
50
|
|
|
|
171
|
open my $fh, '<', $filename or confess "Unable to read file ", $filename, "\n"; |
395
|
4
|
|
|
|
|
109
|
read( $fh, my $magic_byte, 4 ); |
396
|
4
|
|
|
|
|
88
|
close $fh; |
397
|
|
|
|
|
|
|
|
398
|
4
|
50
|
|
|
|
26
|
if (substr($magic_byte,0,3) eq GZIP_SIGNATURE) { |
399
|
|
|
|
|
|
|
# GZIPPED FILE |
400
|
0
|
0
|
|
|
|
0
|
if (! defined $self->{GZIP_BIN}) { |
401
|
0
|
|
|
|
|
0
|
require IO::Uncompress::Gunzip; |
402
|
0
|
|
|
|
|
0
|
$fh = IO::Uncompress::Gunzip->new($filename, MultiStream => 1); |
403
|
|
|
|
|
|
|
} else { |
404
|
0
|
0
|
|
|
|
0
|
open $fh, '-|', "$self->{GZIP_BIN} -dc $filename" or confess "Error opening gzip file ", $filename, ": $!\n"; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
} else { |
407
|
|
|
|
|
|
|
# NOT COMPRESSED |
408
|
4
|
|
33
|
|
|
144
|
open $fh, '<:encoding(utf8)', "$filename" || confess "Unable to read $filename\n$!\n"; |
409
|
|
|
|
|
|
|
} |
410
|
4
|
|
|
|
|
312
|
my $first = readline($fh); |
411
|
4
|
100
|
|
|
|
71
|
if (substr($first, 0,1) eq '>') { |
|
|
100
|
|
|
|
|
|
412
|
|
|
|
|
|
|
#should be FASTA |
413
|
1
|
|
|
|
|
19
|
return 'fasta'; |
414
|
|
|
|
|
|
|
} elsif (substr($first, 0, 1) eq '@') { |
415
|
|
|
|
|
|
|
#should be fastq |
416
|
2
|
|
|
|
|
6
|
readline($fh); |
417
|
2
|
|
|
|
|
5
|
my $sep = readline($fh); |
418
|
2
|
50
|
|
|
|
8
|
if ( substr($sep, 0, 1) eq '+' ) { |
419
|
|
|
|
|
|
|
#second check for fastq |
420
|
2
|
|
|
|
|
33
|
return 'fastq'; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
} else { |
423
|
|
|
|
|
|
|
#unknown format |
424
|
1
|
|
|
|
|
17
|
return; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
sub _load_seqs { |
428
|
6
|
|
|
6
|
|
26
|
my ($self) = @_; |
429
|
6
|
50
|
|
|
|
29
|
return 0 unless (defined $self->{loadseqs}); |
430
|
|
|
|
|
|
|
|
431
|
6
|
|
|
|
|
14
|
my $seqs = undef; |
432
|
6
|
|
|
|
|
23
|
while (my $s = $self->getRead() ) { |
433
|
22
|
|
|
|
|
52
|
my ($name, $seq) = ($s->{name}, $s->{seq}); |
434
|
22
|
100
|
|
|
|
58
|
if ($self->{loadseqs} eq 'name') { |
|
|
100
|
|
|
|
|
|
435
|
12
|
|
|
|
|
55
|
$seqs->{$name} = $seq; |
436
|
|
|
|
|
|
|
} elsif ($self->{loadseqs} eq 'seq') { |
437
|
3
|
|
|
|
|
14
|
$seqs->{$seq} = $name; |
438
|
|
|
|
|
|
|
} else { |
439
|
|
|
|
|
|
|
my $r = FASTX::Seq->new( |
440
|
|
|
|
|
|
|
-name => $name, |
441
|
|
|
|
|
|
|
-seq => $seq, |
442
|
|
|
|
|
|
|
-comment => $s->{comments}, |
443
|
|
|
|
|
|
|
-qual => $s->{qual}, |
444
|
7
|
|
|
|
|
81
|
); |
445
|
7
|
|
|
|
|
18
|
push(@{$seqs}, $r); |
|
7
|
|
|
|
|
26
|
|
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
} |
449
|
6
|
|
|
|
|
20
|
$self->{seqs} = $seqs; |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
sub _which { |
454
|
33
|
50
|
|
33
|
|
274
|
return if ($^O eq 'MSWin32'); |
455
|
33
|
|
|
|
|
87
|
my $has_which = eval { require File::Which; File::Which->import(); 1}; |
|
33
|
|
|
|
|
7910
|
|
|
33
|
|
|
|
|
19162
|
|
|
33
|
|
|
|
|
139
|
|
456
|
33
|
50
|
|
|
|
93
|
if ($has_which) { |
457
|
33
|
|
|
|
|
204
|
foreach my $cmd (@_) { |
458
|
66
|
100
|
|
|
|
8210
|
return which($cmd) if (which($cmd)); |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
} else { |
461
|
0
|
|
|
|
|
|
foreach my $cmd (@_) { |
462
|
0
|
|
|
|
|
|
my $exit; |
463
|
0
|
|
|
|
|
|
eval { |
464
|
0
|
|
|
|
|
|
`which $cmd 2> /dev/null`; |
465
|
0
|
|
|
|
|
|
$exit = $?; |
466
|
|
|
|
|
|
|
}; |
467
|
0
|
0
|
0
|
|
|
|
return $cmd if ($exit == 0 and not $@); |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
} |
470
|
0
|
|
|
|
|
|
return; |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
1; |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
__END__ |