File Coverage

blib/lib/Bio/FASTASequence.pm

Criterion	Covered	Total	%
statement	12	174	6.9
branch	0	58	0.0
condition	0	15	0.0
subroutine	4	18	22.2
pod	12	12	100.0
total	28	277	10.1

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::FASTASequence;
2
3							# ABSTRACT: Parsing sequence informations in FASTA format.
4
5	1			1		23254	use 5.006;
	1					3
	1					35
6	1			1		5	use strict;
	1					2
	1					28
7	1			1		5	use warnings;
	1					6
	1					2605
8
9							our $VERSION = '0.07';
10
11							#
12							# new creates a new instance of Bio::FASTASequence
13							#
14							sub new{
15	0			0	1		my ($class, $string_text) = @_;
16	0						my ($description,$accession_nr);
17	0						my %f_db_nr = ();
18	0	0					unless($string_text){
19	0						$string_text = "";
20							}
21	0						my $self = {};
22	0						bless($self,$class);
23	0						my $sequencer;
24							#print STDERR "Object doesn't contain fasta-sequence" if($string_text && !_is_fasta($string_text));
25	0	0	0				die "Object doesn't contain fasta-sequence" if($string_text && !_is_fasta($string_text));
26	0	0					if($string_text){
27	0						$string_text =~ s/^(\r?\n)+//;
28	0						$string_text =~ s/^>+/>/;
29	0						my ($description_line,$sequence) = split(/\n/,$string_text,2);
30	0						$description_line =~ s/\s+$//;
31	0						$description_line =~ s/\r?\n/\n/g;
32	0						$sequence =~ s/\r?\n//g;
33	0	0					unless($description_line =~ /^>/){
34	0						$self->{accession_nr} = "";
35							}
36							else{
37							# parsing the description line
38	0	0	0				if($description_line =~ /^>gi\\|/){
		0
		0
		0
		0
39	0						my ($gi,$number,$db);
40	0						$description_line =~ s/^>//;
41	0						($gi,$number,$db,$accession_nr,$description) = split(/\\|/,$description_line);
42							}
43							elsif($description_line =~ /^>sp\\|/ \|\| $description_line =~ /^>sptrembl\\|/){
44	0						$description_line =~ s/^>//;
45	0						my $desc;
46	0						($desc,$description) = split(/\s/,$description_line,2);
47	0						$accession_nr = (split(/\\|/,$desc))[1];
48							}
49							elsif($description_line =~ /^>tr\\|/){
50	0						$description_line =~ s/^>//;
51	0						my $desc;
52	0						($desc,$description) = split(/\s/,$description_line,2);
53	0						$accession_nr = (split(/\\|/,$desc))[1];
54							}
55							elsif($description_line =~ /^>[XY\d+]/){
56	0						$description_line =~ s/>//;
57	0						chomp $description_line;
58	0						$description = (split(/\s/,$description_line,3))[-1];
59	0						$accession_nr = (split(/\s/,$description_line,3))[0];
60							}
61							elsif($description_line =~ /^>[0-9A-Za-z_]+\s?/){
62	0						$description_line =~ s/^>//;
63							#-------------------------------------------------#
64							# IPI-Sequences #
65							#-------------------------------------------------#
66	0	0					if($description_line =~ /^IPI:/){
		0
		0
67							# split only at first whitespace and take first element
68	0						my $foreign_numbers = (split(/\s/,$description_line,2))[0];
69	0						$description = (split(/\s/,$description_line,3))[2];
70	0						my @foreign_acs = split(/\\|/,$foreign_numbers);
71							# cross-references to other databases
72	0						foreach my $f_ac(@foreign_acs){
73	0						my ($key, $value) = split(/:/,$f_ac);
74	0						$f_db_nr{$key} = $value;
75							}
76	0	0					unless($f_db_nr{'SWISS-PROT'}){
77	0						$f_db_nr{'SWISS-PROT'} = "NULL";
78							}
79	0	0					unless($f_db_nr{'ENSEMBL'}){
80	0						$f_db_nr{'ENSEMBL'} = "NULL";
81							}
82	0	0					unless($f_db_nr{'REFSEQ_XP'}){
83	0						$f_db_nr{'REFSEQ_XP'} = "NULL";
84							}
85	0	0					unless($f_db_nr{'TREMBL'}){
86	0						$f_db_nr{'TREMBL'} = "NULL";
87							}
88	0						$accession_nr = $f_db_nr{'IPI'};
89	0						delete $f_db_nr{IPI};
90							}
91							#-----------------------------------------#
92							# format begins with accession-nr #
93							#-----------------------------------------#
94							elsif($description_line =~ /^[A-Za-z][0-9][A-Z0-9a-z]{3}[0-9][\s\\|]/){
95	0	0					if($description_line =~ /\\|/){
96	0						($accession_nr, $description) = split(/\\|/,$description_line,2);
97							}
98							else{
99	0						($accession_nr, $description) = split(/\s/,$description_line,2);
100							}
101							}
102							elsif($description_line =~ /^[A-Za-z0-9_]+\s*?$/){
103	0						chomp $description_line;
104	0						$accession_nr = $description_line;
105	0						$description_line = '';
106							}
107							else{
108	0						($accession_nr,$description)= split(/\s/,$description_line,2);
109							}
110							}
111							}
112
113	0						$accession_nr =~ s/^>//;
114	0						$accession_nr =~ s/[^\w\d]*?$//;
115	0						$accession_nr =~ s/\.\d$//;
116	0						$sequence =~ s/[^A-Z]//g;
117	0						$sequencer = $sequence;
118							}
119
120	0						$self->{text} = $sequencer;
121	0						$self->{accession_nr} = $accession_nr;
122	0						$self->{description} = $description;
123	0						$self->{seq_length} = length($sequencer);
124	0						$self->{dbrefs} = \%f_db_nr;
125	0						$self->{crc64} = $self->_crc64();
126
127	0						return $self;
128							}#end new
129
130							#
131							# getSequence returns the sequence itself.
132							#
133							sub getSequence{
134	0			0	1		my ($class) = @_;
135	0						return $class->{text};
136							}# end getText
137
138							#
139							# _is_fasta checks whether the given Sequence is in fasta-format or not
140							#
141							sub _is_fasta{
142	0			0			my ($sequence) = @_;
143	0						my @lines = split(/\r?\n/,$sequence);
144	0						my $desc = shift(@lines);
145	0						my $seq = join("",@lines);
146	0						$seq =~ s/\s+//g;
147	0	0	0				if($desc =~ /^>/ && $seq !~ /[^A-NP-Z\*\-]/i && length($seq) > 0){
			0
148	0						return 1;
149							}
150	0						return 0;
151							}# end _is_fasta
152
153							#
154							# getSequenceLength returns how man aminoacids the sequence contains
155							#
156							sub getSequenceLength{
157	0			0	1		my ($class) = @_;
158	0						return $class->{seq_length};
159							}# end getSequenceLength
160
161							#
162							# getAccessionNr returns the parsed accession number
163							#
164							sub getAccessionNr{
165	0			0	1		my ($class) = @_;
166	0						return $class->{accession_nr};
167							}# end of getAccessionNr
168
169							#
170							# getDescription returns the description
171							#
172							sub getDescription{
173	0			0	1		my ($class) = @_;
174	0						return $class->{description}
175							}# end getDescription
176
177							#
178							# getCrc64 returns the crc64-checksum of the sequence
179							#
180							sub getCrc64{
181	0			0	1		my ($class) = @_;
182	0						return $class->{crc64};
183							}# end getCrc64
184
185							#
186							# getDBRefs returns an anonymous hash containing all references to foreign databases
187							#
188							sub getDBRefs{
189	0			0	1		my ($class) = @_;
190	0						return $class->{dbrefs};
191							}# end getDBRefs
192
193							#
194							# allIndexesOf returns a reference to an array containing all positions of the requested Substring
195							#
196							sub allIndexesOf{
197	0			0	1		my ($self,$search) = @_;
198	0						my $i = 1;
199	0						my $index = 0;
200	0						my @indices = ();
201	0						while($i != -1){
202	0						$index = index($self->{text},$search,$index);
203	0	0					push(@indices,$index) unless ($index == -1);
204	0						$i = $index;
205	0						$index++;
206							}
207	0						return \@indices;
208							}# end allIndicesOf
209
210							#
211							# _crc64 calculates the crc64-checksum of the sequence. It's the crc64-checksum like at swiss-prot
212							# the code is mainly adapted from SWISS::CRC64
213							#
214							sub _crc64 {
215	0			0			my ($self) = @_;
216	0						my $text = $self->{text};
217	1			1		9	use constant EXP => 0xd8000000;
	1					2
	1					911
218	0						my @highCrcTable = 256;
219	0						my @lowCrcTable = 256;
220	0						my $initialized = ();
221	0						my $low = 0;
222	0						my $high = 0;
223
224	0	0					unless($initialized) {
225	0						$initialized = 1;
226	0						for my $i(0..255) {
227	0						my $low_part = $i;
228	0						my $high_part = 0;
229	0						for my $j(0..7) {
230	0						my $flag = $low_part & 1; # rflag ist f�r alle ungeraden zahlen 1
231	0						$low_part >>= 1;# um ein bit nach rechts verschieben
232	0	0					$low_part \|= (1 << 31) if $high_part & 1; # bitweises oder mit 2147483648 (), wenn $parth ungerade
233	0						$high_part >>= 1; # um ein bit nach rechtsverschieben
234	0	0					$high_part ^= EXP if $flag;
235							}
236	0						$highCrcTable[$i] = $high_part;
237	0						$lowCrcTable[$i] = $low_part;
238							}
239							}
240
241	0						foreach (split '', $text) {
242	0						my $shr = ($high & 0xFF) << 24;
243	0						my $tmph = $high >> 8;
244	0						my $tmpl = ($low >> 8) \| $shr;
245	0						my $index = ($low ^ (unpack "C", $_)) & 0xFF;
246	0						$high = $tmph ^ $highCrcTable[$index];
247	0						$low = $tmpl ^ $lowCrcTable[$index];
248							}
249	0						return sprintf("%08X%08X", $high, $low);
250							}# end crc64
251
252							#
253							# seq2file prints the sequence into a file in fasta-file
254							#
255							sub seq2file{
256	0			0	1		my ($self,$file,$args_ref) = @_;
257							# open the file to write
258	0	0					open(W_SEQUENCE,">$file") or die "Can't open $file: $!\n";
259	0						print W_SEQUENCE ">",$self->{accession_nr};
260							# add the references
261	0						foreach my $dbkey(keys(%{$self->{dbrefs}})){
	0
262	0	0					print W_SEQUENCE "\|".$dbkey.":".$self->{dbrefs}->{$dbkey} if($self->{dbrefs}->{$dbkey} ne 'NULL');
263							}
264							# add description
265	0						print W_SEQUENCE " ",$self->{description},"\n";
266							# add the sequence
267	0						print W_SEQUENCE $self->{text},"\n";
268	0						close W_SEQUENCE;
269							}# end seq2file
270
271							#
272							# getFASTA return a string in fasta-format
273							#
274							sub getFASTA{
275	0			0	1		my ($self) = @_;
276	0						my $fasta = ">".$self->{accession_nr};
277							# add the references to foreign databases
278	0						foreach my $dbkey(keys(%{$self->{dbrefs}})){
	0
279	0	0					$fasta .= "\|".$dbkey.":".$self->{dbrefs}->{$dbkey} if($self->{dbrefs}->{$dbkey} ne "NULL");
280							}
281							# add description
282	0	0					$fasta .= " ".$self->{description} if($self->{description});
283	0						$fasta .= "\n";
284							# add sequence
285	0						$fasta .= $self->{text}."\n";
286	0						return $fasta;
287							}# end getFASTA
288
289							#
290							# addDBRef adds a reference to a foreign database to the anonymous hash
291							#
292							sub addDBRef{
293	0			0	1		my ($self,$db,$dbref) = @_;
294							# if a reference to the requested database already exists, append the new reference
295	0	0	0				if($self->{dbrefs}->{$db} && ($self->{dbrefs}->{$db} ne 'NULL')){
296	0						$self->{dbrefs}->{$db} .= ";".$dbref;
297							}
298							# if no reference exists, add the reference to the hash
299							else{
300	0						$self->{dbrefs}->{$db} = $dbref;
301							}
302							}# end addDBRef
303
304							#
305							# seq2xml creates an xml-string containing all information about the given sequence.
306							#
307							sub seq2xml{
308	0			0	1		my ($self) = @_;
309							# create the tags representing the references to foreign databases, e.g. SWISS-Prot
310	0						my $dbrefs = " ";
311	0						foreach(keys(%{$self->{dbrefs}})){
	0
312	0						my $key = uc($_);
313	0	0					$dbrefs .= "\n<".$key.'>'.${$self->{dbrefs}}->{$_}.'' if(${$self->{dbrefs}}->{$_} ne 'NULL');
	0
	0
314							}
315	0	0					$dbrefs = "\n$dbrefs" if($dbrefs ne " ");
316							# create the xml-string
317	0						my $xml = qq~
318
319							$self->{description}$dbrefs
320							$self->{seq_length}
321							$self->{crc64}
322							$self->{text}
323							~;
324	0						return $xml;
325							}# end seq2xml
326
327							1;
328
329							__END__