File Coverage

blib/lib/Bio/FASTASequence.pm
Criterion Covered Total %
statement 12 174 6.9
branch 0 58 0.0
condition 0 15 0.0
subroutine 4 18 22.2
pod 12 12 100.0
total 28 277 10.1


line stmt bran cond sub pod time code
1             package Bio::FASTASequence;
2              
3             # ABSTRACT: Parsing sequence informations in FASTA format.
4              
5 1     1   23254 use 5.006;
  1         3  
  1         35  
6 1     1   5 use strict;
  1         2  
  1         28  
7 1     1   5 use warnings;
  1         6  
  1         2605  
8              
9             our $VERSION = '0.07';
10              
11             #
12             # new creates a new instance of Bio::FASTASequence
13             #
14             sub new{
15 0     0 1   my ($class, $string_text) = @_;
16 0           my ($description,$accession_nr);
17 0           my %f_db_nr = ();
18 0 0         unless($string_text){
19 0           $string_text = "";
20             }
21 0           my $self = {};
22 0           bless($self,$class);
23 0           my $sequencer;
24             #print STDERR "Object doesn't contain fasta-sequence" if($string_text && !_is_fasta($string_text));
25 0 0 0       die "Object doesn't contain fasta-sequence" if($string_text && !_is_fasta($string_text));
26 0 0         if($string_text){
27 0           $string_text =~ s/^(\r?\n)+//;
28 0           $string_text =~ s/^>+/>/;
29 0           my ($description_line,$sequence) = split(/\n/,$string_text,2);
30 0           $description_line =~ s/\s+$//;
31 0           $description_line =~ s/\r?\n/\n/g;
32 0           $sequence =~ s/\r?\n//g;
33 0 0         unless($description_line =~ /^>/){
34 0           $self->{accession_nr} = "";
35             }
36             else{
37             # parsing the description line
38 0 0 0       if($description_line =~ /^>gi\|/){
    0          
    0          
    0          
    0          
39 0           my ($gi,$number,$db);
40 0           $description_line =~ s/^>//;
41 0           ($gi,$number,$db,$accession_nr,$description) = split(/\|/,$description_line);
42             }
43             elsif($description_line =~ /^>sp\|/ || $description_line =~ /^>sptrembl\|/){
44 0           $description_line =~ s/^>//;
45 0           my $desc;
46 0           ($desc,$description) = split(/\s/,$description_line,2);
47 0           $accession_nr = (split(/\|/,$desc))[1];
48             }
49             elsif($description_line =~ /^>tr\|/){
50 0           $description_line =~ s/^>//;
51 0           my $desc;
52 0           ($desc,$description) = split(/\s/,$description_line,2);
53 0           $accession_nr = (split(/\|/,$desc))[1];
54             }
55             elsif($description_line =~ /^>[XY\d+]/){
56 0           $description_line =~ s/>//;
57 0           chomp $description_line;
58 0           $description = (split(/\s/,$description_line,3))[-1];
59 0           $accession_nr = (split(/\s/,$description_line,3))[0];
60             }
61             elsif($description_line =~ /^>[0-9A-Za-z_]+\s?/){
62 0           $description_line =~ s/^>//;
63             #-------------------------------------------------#
64             # IPI-Sequences #
65             #-------------------------------------------------#
66 0 0         if($description_line =~ /^IPI:/){
    0          
    0          
67             # split only at first whitespace and take first element
68 0           my $foreign_numbers = (split(/\s/,$description_line,2))[0];
69 0           $description = (split(/\s/,$description_line,3))[2];
70 0           my @foreign_acs = split(/\|/,$foreign_numbers);
71             # cross-references to other databases
72 0           foreach my $f_ac(@foreign_acs){
73 0           my ($key, $value) = split(/:/,$f_ac);
74 0           $f_db_nr{$key} = $value;
75             }
76 0 0         unless($f_db_nr{'SWISS-PROT'}){
77 0           $f_db_nr{'SWISS-PROT'} = "NULL";
78             }
79 0 0         unless($f_db_nr{'ENSEMBL'}){
80 0           $f_db_nr{'ENSEMBL'} = "NULL";
81             }
82 0 0         unless($f_db_nr{'REFSEQ_XP'}){
83 0           $f_db_nr{'REFSEQ_XP'} = "NULL";
84             }
85 0 0         unless($f_db_nr{'TREMBL'}){
86 0           $f_db_nr{'TREMBL'} = "NULL";
87             }
88 0           $accession_nr = $f_db_nr{'IPI'};
89 0           delete $f_db_nr{IPI};
90             }
91             #-----------------------------------------#
92             # format begins with accession-nr #
93             #-----------------------------------------#
94             elsif($description_line =~ /^[A-Za-z][0-9][A-Z0-9a-z]{3}[0-9][\s\|]/){
95 0 0         if($description_line =~ /\|/){
96 0           ($accession_nr, $description) = split(/\|/,$description_line,2);
97             }
98             else{
99 0           ($accession_nr, $description) = split(/\s/,$description_line,2);
100             }
101             }
102             elsif($description_line =~ /^[A-Za-z0-9_]+\s*?$/){
103 0           chomp $description_line;
104 0           $accession_nr = $description_line;
105 0           $description_line = '';
106             }
107             else{
108 0           ($accession_nr,$description)= split(/\s/,$description_line,2);
109             }
110             }
111             }
112              
113 0           $accession_nr =~ s/^>//;
114 0           $accession_nr =~ s/[^\w\d]*?$//;
115 0           $accession_nr =~ s/\.\d$//;
116 0           $sequence =~ s/[^A-Z]//g;
117 0           $sequencer = $sequence;
118             }
119              
120 0           $self->{text} = $sequencer;
121 0           $self->{accession_nr} = $accession_nr;
122 0           $self->{description} = $description;
123 0           $self->{seq_length} = length($sequencer);
124 0           $self->{dbrefs} = \%f_db_nr;
125 0           $self->{crc64} = $self->_crc64();
126              
127 0           return $self;
128             }#end new
129              
130             #
131             # getSequence returns the sequence itself.
132             #
133             sub getSequence{
134 0     0 1   my ($class) = @_;
135 0           return $class->{text};
136             }# end getText
137              
138             #
139             # _is_fasta checks whether the given Sequence is in fasta-format or not
140             #
141             sub _is_fasta{
142 0     0     my ($sequence) = @_;
143 0           my @lines = split(/\r?\n/,$sequence);
144 0           my $desc = shift(@lines);
145 0           my $seq = join("",@lines);
146 0           $seq =~ s/\s+//g;
147 0 0 0       if($desc =~ /^>/ && $seq !~ /[^A-NP-Z\*\-]/i && length($seq) > 0){
      0        
148 0           return 1;
149             }
150 0           return 0;
151             }# end _is_fasta
152              
153             #
154             # getSequenceLength returns how man aminoacids the sequence contains
155             #
156             sub getSequenceLength{
157 0     0 1   my ($class) = @_;
158 0           return $class->{seq_length};
159             }# end getSequenceLength
160              
161             #
162             # getAccessionNr returns the parsed accession number
163             #
164             sub getAccessionNr{
165 0     0 1   my ($class) = @_;
166 0           return $class->{accession_nr};
167             }# end of getAccessionNr
168              
169             #
170             # getDescription returns the description
171             #
172             sub getDescription{
173 0     0 1   my ($class) = @_;
174 0           return $class->{description}
175             }# end getDescription
176              
177             #
178             # getCrc64 returns the crc64-checksum of the sequence
179             #
180             sub getCrc64{
181 0     0 1   my ($class) = @_;
182 0           return $class->{crc64};
183             }# end getCrc64
184              
185             #
186             # getDBRefs returns an anonymous hash containing all references to foreign databases
187             #
188             sub getDBRefs{
189 0     0 1   my ($class) = @_;
190 0           return $class->{dbrefs};
191             }# end getDBRefs
192              
193             #
194             # allIndexesOf returns a reference to an array containing all positions of the requested Substring
195             #
196             sub allIndexesOf{
197 0     0 1   my ($self,$search) = @_;
198 0           my $i = 1;
199 0           my $index = 0;
200 0           my @indices = ();
201 0           while($i != -1){
202 0           $index = index($self->{text},$search,$index);
203 0 0         push(@indices,$index) unless ($index == -1);
204 0           $i = $index;
205 0           $index++;
206             }
207 0           return \@indices;
208             }# end allIndicesOf
209              
210             #
211             # _crc64 calculates the crc64-checksum of the sequence. It's the crc64-checksum like at swiss-prot
212             # the code is mainly adapted from SWISS::CRC64
213             #
214             sub _crc64 {
215 0     0     my ($self) = @_;
216 0           my $text = $self->{text};
217 1     1   9 use constant EXP => 0xd8000000;
  1         2  
  1         911  
218 0           my @highCrcTable = 256;
219 0           my @lowCrcTable = 256;
220 0           my $initialized = ();
221 0           my $low = 0;
222 0           my $high = 0;
223              
224 0 0         unless($initialized) {
225 0           $initialized = 1;
226 0           for my $i(0..255) {
227 0           my $low_part = $i;
228 0           my $high_part = 0;
229 0           for my $j(0..7) {
230 0           my $flag = $low_part & 1; # rflag ist für alle ungeraden zahlen 1
231 0           $low_part >>= 1;# um ein bit nach rechts verschieben
232 0 0         $low_part |= (1 << 31) if $high_part & 1; # bitweises oder mit 2147483648 (), wenn $parth ungerade
233 0           $high_part >>= 1; # um ein bit nach rechtsverschieben
234 0 0         $high_part ^= EXP if $flag;
235             }
236 0           $highCrcTable[$i] = $high_part;
237 0           $lowCrcTable[$i] = $low_part;
238             }
239             }
240              
241 0           foreach (split '', $text) {
242 0           my $shr = ($high & 0xFF) << 24;
243 0           my $tmph = $high >> 8;
244 0           my $tmpl = ($low >> 8) | $shr;
245 0           my $index = ($low ^ (unpack "C", $_)) & 0xFF;
246 0           $high = $tmph ^ $highCrcTable[$index];
247 0           $low = $tmpl ^ $lowCrcTable[$index];
248             }
249 0           return sprintf("%08X%08X", $high, $low);
250             }# end crc64
251              
252             #
253             # seq2file prints the sequence into a file in fasta-file
254             #
255             sub seq2file{
256 0     0 1   my ($self,$file,$args_ref) = @_;
257             # open the file to write
258 0 0         open(W_SEQUENCE,">$file") or die "Can't open $file: $!\n";
259 0           print W_SEQUENCE ">",$self->{accession_nr};
260             # add the references
261 0           foreach my $dbkey(keys(%{$self->{dbrefs}})){
  0            
262 0 0         print W_SEQUENCE "|".$dbkey.":".$self->{dbrefs}->{$dbkey} if($self->{dbrefs}->{$dbkey} ne 'NULL');
263             }
264             # add description
265 0           print W_SEQUENCE " ",$self->{description},"\n";
266             # add the sequence
267 0           print W_SEQUENCE $self->{text},"\n";
268 0           close W_SEQUENCE;
269             }# end seq2file
270              
271             #
272             # getFASTA return a string in fasta-format
273             #
274             sub getFASTA{
275 0     0 1   my ($self) = @_;
276 0           my $fasta = ">".$self->{accession_nr};
277             # add the references to foreign databases
278 0           foreach my $dbkey(keys(%{$self->{dbrefs}})){
  0            
279 0 0         $fasta .= "|".$dbkey.":".$self->{dbrefs}->{$dbkey} if($self->{dbrefs}->{$dbkey} ne "NULL");
280             }
281             # add description
282 0 0         $fasta .= " ".$self->{description} if($self->{description});
283 0           $fasta .= "\n";
284             # add sequence
285 0           $fasta .= $self->{text}."\n";
286 0           return $fasta;
287             }# end getFASTA
288              
289             #
290             # addDBRef adds a reference to a foreign database to the anonymous hash
291             #
292             sub addDBRef{
293 0     0 1   my ($self,$db,$dbref) = @_;
294             # if a reference to the requested database already exists, append the new reference
295 0 0 0       if($self->{dbrefs}->{$db} && ($self->{dbrefs}->{$db} ne 'NULL')){
296 0           $self->{dbrefs}->{$db} .= ";".$dbref;
297             }
298             # if no reference exists, add the reference to the hash
299             else{
300 0           $self->{dbrefs}->{$db} = $dbref;
301             }
302             }# end addDBRef
303              
304             #
305             # seq2xml creates an xml-string containing all information about the given sequence.
306             #
307             sub seq2xml{
308 0     0 1   my ($self) = @_;
309             # create the tags representing the references to foreign databases, e.g. SWISS-Prot
310 0           my $dbrefs = " ";
311 0           foreach(keys(%{$self->{dbrefs}})){
  0            
312 0           my $key = uc($_);
313 0 0         $dbrefs .= "\n<".$key.'>'.${$self->{dbrefs}}->{$_}.'' if(${$self->{dbrefs}}->{$_} ne 'NULL');
  0            
  0            
314             }
315 0 0         $dbrefs = "\n$dbrefs" if($dbrefs ne " ");
316             # create the xml-string
317 0           my $xml = qq~
318            
319             $self->{description}$dbrefs
320             $self->{seq_length}
321             $self->{crc64}
322             $self->{text}
323             ~;
324 0           return $xml;
325             }# end seq2xml
326              
327             1;
328              
329             __END__