line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: FASTAgrammar.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
package TIGR::FASTA::Grammar; |
4
|
|
|
|
|
|
|
{ |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
TIGR::FASTA::Grammar - module for validating FASTA format records |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use TIGR::FASTA::Grammar ':public'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
$header = FASTA header here... |
15
|
|
|
|
|
|
|
$data = FASTA data here... |
16
|
|
|
|
|
|
|
$return_value = isValidFASTARecord($header, $data); |
17
|
|
|
|
|
|
|
... |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 DESCRIPTION |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
This module provides functions for verifying compliance with TIGR's FASTA |
22
|
|
|
|
|
|
|
file and record definitions. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=cut |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
BEGIN { |
27
|
1
|
|
|
1
|
|
32
|
require 5.006_00; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
902
|
|
31
|
|
|
|
|
|
|
require Exporter; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
## internal variables and identifiers |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
36
|
|
|
|
|
|
|
our $REVISION = (qw$Revision: 1.1 $)[-1]; |
37
|
|
|
|
|
|
|
our $VERSION = '1.2'; |
38
|
|
|
|
|
|
|
our $VERSION_STRING = "$VERSION (Build $REVISION)"; |
39
|
|
|
|
|
|
|
our @DEPEND = (); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
## Export methods |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'public' => [ qw( isValidFASTArecord |
45
|
|
|
|
|
|
|
isValidFASTAheader |
46
|
|
|
|
|
|
|
isValidFASTAdata |
47
|
|
|
|
|
|
|
isValidFASTAlineLength |
48
|
|
|
|
|
|
|
setValidFASTAlineLength ) ] , |
49
|
|
|
|
|
|
|
'private' => [ qw( _headerToIdentifier |
50
|
|
|
|
|
|
|
_isNucleotideData |
51
|
|
|
|
|
|
|
_isPeptideData ) ] ); |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'public'} }, |
54
|
|
|
|
|
|
|
@{ $EXPORT_TAGS{'private'} } ); |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
## data structures |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# IUPAC extended codes acceptable for sequence data |
60
|
|
|
|
|
|
|
# see WU-BLAST format on http://tigrblast.tigr.org/html/fasta.html |
61
|
|
|
|
|
|
|
our $NA_IUPAC_CODES = 'ATUGCMRWSYKVHDBN\.\-'; |
62
|
|
|
|
|
|
|
our $AA_IUPAC_CODES = 'ARNDBCQEZGHILKMFPSTUWXYV\*\-'; |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# FASTA parameters |
65
|
|
|
|
|
|
|
our $FASTA_SEPARATOR = '^>'; |
66
|
|
|
|
|
|
|
our $UNBOUND_FASTA_SEPARATOR = '>'; |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# note: BLAST can accept 80 bases per line; this just emits a warning |
69
|
|
|
|
|
|
|
our $OUTPUT_LINE_LENGTH = 60; |
70
|
|
|
|
|
|
|
my $RECORD_LINE_LENGTH = 0; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# the TIGR FASTA header parse |
73
|
|
|
|
|
|
|
our $FASTA_HEADER_PARSE = |
74
|
|
|
|
|
|
|
'^>([[:graph:]]+)(?: +[ \ca[:graph:]]*$| *$)'; |
75
|
|
|
|
|
|
|
## prototypes |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub isValidFASTArecord(@); |
78
|
|
|
|
|
|
|
sub isValidFASTAheader($); |
79
|
|
|
|
|
|
|
sub isValidFASTAdata($); |
80
|
|
|
|
|
|
|
sub isValidFASTAlineLength($); |
81
|
|
|
|
|
|
|
sub setValidFASTAlineLength($); |
82
|
|
|
|
|
|
|
sub _isNucleotideData($); |
83
|
|
|
|
|
|
|
sub _isPeptideData($); |
84
|
|
|
|
|
|
|
sub _headerToIdentifier($); |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
## implementation |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=over |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item $result = isValidFASTArecord(@record_defn); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
This method determines if a FASTA record, C<@record_defn>, fits the TIGR |
93
|
|
|
|
|
|
|
definition for a FASTA record. C<@record_defn> is an array of lines over |
94
|
|
|
|
|
|
|
which the record is defined. The first line should be the FASTA header, and |
95
|
|
|
|
|
|
|
subsequent lines the data definition. This method checks line width, |
96
|
|
|
|
|
|
|
character composition, and header format. If the record parses correctly, |
97
|
|
|
|
|
|
|
this method returns 1. Otherwise, this method returns 0. |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=cut |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub isValidFASTArecord(@) { |
103
|
|
|
|
|
|
|
|
104
|
0
|
|
|
0
|
1
|
|
my $header = shift; |
105
|
0
|
|
|
|
|
|
my @data_lines = @_; |
106
|
0
|
|
|
|
|
|
my $valid_flag = 0; |
107
|
0
|
|
|
|
|
|
my $first_line_flag = 0; |
108
|
0
|
|
|
|
|
|
my $first_len = 0; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# check conformance of header |
111
|
0
|
|
|
|
|
|
$valid_flag = isValidFASTAheader($header); |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# check conformance of data |
114
|
0
|
0
|
0
|
|
|
|
if ( ( $valid_flag != 0 ) && |
115
|
|
|
|
|
|
|
( scalar(@data_lines) > 0 ) ) { |
116
|
0
|
|
|
|
|
|
my $data_scalar = join "", @data_lines; |
117
|
0
|
|
|
|
|
|
$data_scalar =~ s/\n//g; # extract new lines from scalar data |
118
|
0
|
|
|
|
|
|
$valid_flag = isValidFASTAdata($data_scalar); |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# check conformance of line length |
122
|
0
|
|
0
|
|
|
|
while ( ( $valid_flag != 0 ) && |
123
|
|
|
|
|
|
|
( defined ( my $line = shift @data_lines ) ) ) { |
124
|
0
|
|
|
|
|
|
chomp $line; |
125
|
|
|
|
|
|
|
|
126
|
0
|
0
|
|
|
|
|
if($first_line_flag == 0) { |
127
|
0
|
|
|
|
|
|
$first_len = setValidFASTAlineLength($line); |
128
|
0
|
|
|
|
|
|
$first_line_flag = 1; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
0
|
0
|
|
|
|
|
if(defined($first_len)) { |
132
|
0
|
|
|
|
|
|
my $line_len_flag = isValidFASTAlineLength($line); |
133
|
0
|
0
|
0
|
|
|
|
if ( ( $line_len_flag > 0 ) || |
|
|
|
0
|
|
|
|
|
134
|
|
|
|
|
|
|
( ( $line_len_flag < 0 ) && |
135
|
|
|
|
|
|
|
( $#data_lines != -1 ) ) ) { |
136
|
0
|
|
|
|
|
|
$valid_flag = 0; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
return $valid_flag; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item $result = isValidFASTAheader($header); |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
This method determines if the FASTA header description, C<$header>, is |
147
|
|
|
|
|
|
|
a valid description. It checks for a leading carot symbol and trailing non |
148
|
|
|
|
|
|
|
white space characters. Any number of white space characters |
149
|
|
|
|
|
|
|
may be interleaved throughout the text portion of the header, with the |
150
|
|
|
|
|
|
|
exception that there may be no space between the carot and the first word. |
151
|
|
|
|
|
|
|
If the header is valid, this method returns 1. Otherwise, this method |
152
|
|
|
|
|
|
|
returns 0. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=cut |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub isValidFASTAheader($) { |
158
|
0
|
|
|
0
|
1
|
|
my $header = shift; |
159
|
0
|
|
|
|
|
|
my $return_val = 0; |
160
|
0
|
|
|
|
|
|
my $identifier = undef; |
161
|
|
|
|
|
|
|
|
162
|
0
|
0
|
0
|
|
|
|
if ( ( defined ($header) ) && |
163
|
|
|
|
|
|
|
( ($identifier) = $header =~ /$FASTA_HEADER_PARSE/ ) ) { |
164
|
|
|
|
|
|
|
|
165
|
0
|
0
|
0
|
|
|
|
if((defined $identifier) && ($identifier !~ /\//)) { |
166
|
0
|
|
|
|
|
|
$return_val = 1; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
else { |
169
|
0
|
|
|
|
|
|
$return_val = 0; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
else { |
173
|
0
|
|
|
|
|
|
$return_val = 0; |
174
|
|
|
|
|
|
|
} |
175
|
0
|
|
|
|
|
|
return $return_val; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=item $result = isValidFASTAdata($data_def); |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
This method takes the scalar data definition of a FASTA record, C<$data_def>. |
181
|
|
|
|
|
|
|
It tests the data and returns 1 if the data conforms to nucleotide data or if |
182
|
|
|
|
|
|
|
it conforms to peptide data or both. If the data is not recognizable or is |
183
|
|
|
|
|
|
|
undefined, it returns 0. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub isValidFASTAdata($) { |
189
|
0
|
|
|
0
|
1
|
|
my $data_definition = shift; |
190
|
0
|
|
|
|
|
|
my $return_val = undef; |
191
|
0
|
0
|
0
|
|
|
|
if(($data_definition =~ /^[$NA_IUPAC_CODES]+$/i) || |
192
|
|
|
|
|
|
|
($data_definition =~ /^[$AA_IUPAC_CODES]+$/i)) { |
193
|
0
|
|
|
|
|
|
$return_val = 1; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
else { |
196
|
0
|
|
|
|
|
|
$return_val = 0; |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=item $result = isValidFASTAlineLength($line); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This method returns -1 if the data line, C<$line> is less than |
203
|
|
|
|
|
|
|
the TIGR definition requirement for line length, 0 if the data |
204
|
|
|
|
|
|
|
line meets the TIGR definition requirement for line length, and |
205
|
|
|
|
|
|
|
1 if the data line is greater than the TIGR definition requirement |
206
|
|
|
|
|
|
|
for line length. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=cut |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
sub isValidFASTAlineLength($) { |
212
|
0
|
|
|
0
|
1
|
|
my $line = shift; |
213
|
0
|
|
|
|
|
|
my $line_len = undef; |
214
|
0
|
|
|
|
|
|
my $return_val = undef; |
215
|
|
|
|
|
|
|
|
216
|
0
|
0
|
|
|
|
|
if ( defined ($line) ) { |
217
|
0
|
|
|
|
|
|
chomp $line; |
218
|
0
|
|
|
|
|
|
$line_len = length($line); |
219
|
0
|
0
|
|
|
|
|
if ( $line_len > $RECORD_LINE_LENGTH ) { |
|
|
0
|
|
|
|
|
|
220
|
0
|
|
|
|
|
|
$return_val = 1; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
elsif ( $line_len < $RECORD_LINE_LENGTH ) { |
223
|
0
|
|
|
|
|
|
$return_val = -1; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
else { |
226
|
0
|
|
|
|
|
|
$return_val = 0; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=item $result = setValidFASTAlineLength($); |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
This method takes in the first data line in the data portion of a FASTA record. |
234
|
|
|
|
|
|
|
The function returns the length of this line if it is positive. This length |
235
|
|
|
|
|
|
|
determines the line length for all the data lines following this first line. |
236
|
|
|
|
|
|
|
The function returns undefined if unsuccessful. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=cut |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
sub setValidFASTAlineLength($) { |
242
|
0
|
|
|
0
|
1
|
|
my $line = shift; |
243
|
0
|
|
|
|
|
|
my $line_len = undef; |
244
|
0
|
|
|
|
|
|
my $ret_len = undef; |
245
|
|
|
|
|
|
|
|
246
|
0
|
0
|
|
|
|
|
if(defined ($line)) { |
247
|
0
|
|
|
|
|
|
chomp $line; |
248
|
0
|
|
|
|
|
|
$line_len = length($line); |
249
|
|
|
|
|
|
|
|
250
|
0
|
0
|
|
|
|
|
if($line_len > 0) { |
251
|
0
|
|
|
|
|
|
$ret_len = $line_len ; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
} |
254
|
0
|
|
|
|
|
|
$RECORD_LINE_LENGTH = $ret_len; |
255
|
0
|
|
|
|
|
|
return $ret_len; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# $result = _isNucleotideData($data_def); |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
#This method takes the scalar data definition of a FASTA record, C<$data_def>. |
261
|
|
|
|
|
|
|
#It tests it for conformance to a nucleotide data type. If the data are |
262
|
|
|
|
|
|
|
#nucleotide IUPAC characters, this method returns 1. If not, this method |
263
|
|
|
|
|
|
|
#returns 0. This method returns 0 if C<$data_def> is undefined. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub _isNucleotideData($) { |
267
|
0
|
|
|
0
|
|
|
my $data_def = shift; |
268
|
0
|
|
|
|
|
|
my $return_val = undef; |
269
|
|
|
|
|
|
|
|
270
|
0
|
0
|
0
|
|
|
|
if ( ( defined ( $data_def ) ) && |
271
|
|
|
|
|
|
|
( $data_def =~ /^[$NA_IUPAC_CODES]+$/i ) ) { |
272
|
0
|
|
|
|
|
|
$return_val = 1; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
else { |
275
|
0
|
|
|
|
|
|
$return_val = 0; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
0
|
|
|
|
|
|
return $return_val; |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# $result = _isPeptideData($data_def); |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
#This method takes the scalar data definition of a FASTA record, C<$data_def>. |
285
|
|
|
|
|
|
|
#It tests it for conformance to a peptide data type. If the data are |
286
|
|
|
|
|
|
|
#peptide IUPAC characters, this method returns 1. If not, this method returns |
287
|
|
|
|
|
|
|
#zero. This method returns undefined if C<$data_def> is undefined. |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
sub _isPeptideData($) { |
291
|
0
|
|
|
0
|
|
|
my $data_def = shift; |
292
|
0
|
|
|
|
|
|
my $return_val = undef; |
293
|
|
|
|
|
|
|
|
294
|
0
|
0
|
0
|
|
|
|
if ( ( defined ( $data_def ) ) && |
295
|
|
|
|
|
|
|
( $data_def =~ /^[$AA_IUPAC_CODES]+$/i ) ) { |
296
|
0
|
|
|
|
|
|
$return_val = 1; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
else { |
299
|
0
|
|
|
|
|
|
$return_val = 0; |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
|
302
|
0
|
|
|
|
|
|
return $return_val; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# $identifier = _headerToIdentifier($header); |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
#This function takes a FASTA header as a parameter, returning a parsed |
308
|
|
|
|
|
|
|
#identifier for the record. If the supplied header is invalid or undefined, |
309
|
|
|
|
|
|
|
#this method returns undefined. |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
sub _headerToIdentifier($) { |
313
|
0
|
|
|
0
|
|
|
my $header = shift; |
314
|
0
|
|
|
|
|
|
my $identifier = undef; |
315
|
0
|
|
|
|
|
|
my $return_val = undef; |
316
|
|
|
|
|
|
|
|
317
|
0
|
0
|
0
|
|
|
|
if ( ( defined ($header) ) && |
|
|
|
0
|
|
|
|
|
318
|
|
|
|
|
|
|
( ($identifier) = $header =~ /$FASTA_HEADER_PARSE/ ) && |
319
|
|
|
|
|
|
|
( defined ($identifier) ) ) { |
320
|
0
|
0
|
|
|
|
|
if($identifier =~ /\//) { |
321
|
0
|
|
|
|
|
|
$return_val = undef; |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
else { |
324
|
0
|
|
|
|
|
|
$return_val = $identifier; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
else { |
328
|
0
|
|
|
|
|
|
$return_val = undef; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
0
|
|
|
|
|
|
return $return_val; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=back |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head1 USAGE |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
This module is not intended for developer use. Instead, use the front |
340
|
|
|
|
|
|
|
end modules C and C. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=cut |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
1; |