File Coverage

Bio/SeqIO/embl.pm

Criterion	Covered	Total	%
statement	544	651	83.5
branch	267	434	61.5
condition	90	128	70.3
subroutine	30	33	90.9
pod	2	2	100.0
total	933	1248	74.7

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::SeqIO::EMBL
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Ewan Birney
7							#
8							# Copyright Ewan Birney
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqIO::embl - EMBL sequence input/output stream
17
18							=head1 SYNOPSIS
19
20							It is probably best not to use this object directly, but
21							rather go through the SeqIO handler system. Go:
22
23							$stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
24
25							while ( (my $seq = $stream->next_seq()) ) {
26							# do something with $seq
27							}
28
29							=head1 DESCRIPTION
30
31							This object can transform Bio::Seq objects to and from EMBL flat
32							file databases.
33
34							There is a lot of flexibility here about how to dump things which
35							should be documented more fully.
36
37							There should be a common object that this and Genbank share (probably
38							with Swissprot). Too much of the magic is identical.
39
40							=head2 Optional functions
41
42							=over 3
43
44							=item _show_dna()
45
46							(output only) shows the dna or not
47
48							=item _post_sort()
49
50							(output only) provides a sorting func which is applied to the FTHelpers
51							before printing
52
53							=item _id_generation_func()
54
55							This is function which is called as
56
57							print "ID ", $func($annseq), "\n";
58
59							To generate the ID line. If it is not there, it generates a sensible ID
60							line using a number of tools.
61
62							If you want to output annotations in EMBL format they need to be
63							stored in a Bio::Annotation::Collection object which is accessible
64							through the Bio::SeqI interface method L.
65
66							The following are the names of the keys which are polled from a
67							L object.
68
69							reference - Should contain Bio::Annotation::Reference objects
70							comment - Should contain Bio::Annotation::Comment objects
71							dblink - Should contain Bio::Annotation::DBLink objects
72
73							=back
74
75							=head1 FEEDBACK
76
77							=head2 Mailing Lists
78
79							User feedback is an integral part of the evolution of this and other
80							Bioperl modules. Send your comments and suggestions preferably to one
81							of the Bioperl mailing lists. Your participation is much appreciated.
82
83							bioperl-l@bioperl.org - General discussion
84							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85
86							=head2 Support
87
88							Please direct usage questions or support issues to the mailing list:
89
90							I
91
92							rather than to the module maintainer directly. Many experienced and
93							reponsive experts will be able look at the problem and quickly
94							address it. Please include a thorough description of the problem
95							with code and data examples if at all possible.
96
97							=head2 Reporting Bugs
98
99							Report bugs to the Bioperl bug tracking system to help us keep track
100							the bugs and their resolution. Bug reports can be submitted via
101							the web:
102
103							https://github.com/bioperl/bioperl-live/issues
104
105							=head1 AUTHOR - Ewan Birney
106
107							Email birney@ebi.ac.uk
108
109							=head1 APPENDIX
110
111							The rest of the documentation details each of the object
112							methods. Internal methods are usually preceded with a _
113
114							=cut
115
116
117							# Let the code begin...
118
119
120							package Bio::SeqIO::embl;
121	7			7		599	use vars qw(%FTQUAL_NO_QUOTE);
	7					13
	7					402
122	7			7		44	use strict;
	7					16
	7					162
123	7			7		1383	use Bio::SeqIO::FTHelper;
	7					81
	7					204
124	7			7		44	use Bio::SeqFeature::Generic;
	7					11
	7					145
125	7			7		1212	use Bio::Species;
	7					18
	7					190
126	7			7		46	use Bio::Seq::SeqFactory;
	7					13
	7					169
127	7			7		34	use Bio::Annotation::Collection;
	7					13
	7					168
128	7			7		1283	use Bio::Annotation::Comment;
	7					17
	7					189
129	7			7		1410	use Bio::Annotation::Reference;
	7					20
	7					219
130	7			7		46	use Bio::Annotation::DBLink;
	7					19
	7					189
131
132	7			7		41	use base qw(Bio::SeqIO);
	7					16
	7					41516
133
134							# Note that a qualifier that exceeds one line (i.e. a long label) will
135							# automatically be quoted regardless:
136							%FTQUAL_NO_QUOTE=(
137							'anticodon'=>1,
138							'citation'=>1,
139							'codon'=>1,
140							'codon_start'=>1,
141							'cons_splice'=>1,
142							'direction'=>1,
143							'evidence'=>1,
144							'label'=>1,
145							'mod_base'=> 1,
146							'number'=> 1,
147							'rpt_type'=> 1,
148							'rpt_unit'=> 1,
149							'transl_except'=> 1,
150							'transl_table'=> 1,
151							'usedin'=> 1,
152							);
153
154							sub _initialize {
155	31			31		99	my($self,@args) = @_;
156
157	31					160	$self->SUPER::_initialize(@args);
158							# hash for functions for decoding keys.
159	31					115	$self->{'_func_ftunit_hash'} = {};
160							# sets this to one by default. People can change it
161	31					143	$self->_show_dna(1);
162	31	50				160	if ( ! defined $self->sequence_factory ) {
163	31					122	$self->sequence_factory(Bio::Seq::SeqFactory->new
164							(-verbose => $self->verbose(),
165							-type => 'Bio::Seq::RichSeq'));
166							}
167							}
168
169							=head2 next_seq
170
171							Title : next_seq
172							Usage : $seq = $stream->next_seq()
173							Function: returns the next sequence in the stream
174							Returns : Bio::Seq object
175							Args :
176
177							=cut
178
179							sub next_seq {
180	25			25	1	132	my ($self,@args) = @_;
181	25					70	my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182							$date, $comment, @date_arr);
183
184	25					203	my ($annotation, %params, @features) =
185							Bio::Annotation::Collection->new();
186
187	25					124	$line = $self->_readline;
188							# This needs to be before the first eof() test
189
190	25	100				84	if ( !defined $line ) {
191	1					5	return; # no throws - end of file
192							}
193
194	24	100				161	if ( $line =~ /^\s+$/ ) {
195	1					8	while ( defined ($line = $self->_readline) ) {
196	1	50				7	$line =~/^\S/ && last;
197							}
198							# return without error if the whole next sequence was just a single
199							# blank line and then eof
200	1	50				4	return unless $line;
201							}
202
203							# no ID as 1st non-blank line, need short circuit and exit routine
204	24	50				126	$self->throw("EMBL stream with no ID. Not embl in my book")
205							unless $line =~ /^ID\s+\S+/;
206
207							# At this point we are sure that $line contains an ID header line
208	24					52	my $alphabet;
209	24	100				90	if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210
211							# New style header (EMBL Release >= 87, after June 2006)
212	10					32	my $topology;
213							my $sv;
214
215							# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216							# This regexp comes from the new2old.pl conversion script, from EBI
217	10	100				90	if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218	7					74	($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219							}
220	10	100				37	if (defined $sv) {
221	7					27	$params{'-seq_version'} = $sv;
222	7					19	$params{'-version'} = $sv;
223							}
224
225	10	50	66			46	if (defined $topology && $topology eq 'circular') {
226	0					0	$params{'-is_circular'} = 1;
227							}
228
229	10	100				35	if (defined $mol ) {
230	7	100				32	if ($mol =~ /DNA/) {
		50
		0
231	6					17	$alphabet = 'dna';
232							} elsif ($mol =~ /RNA/) {
233	1					3	$alphabet = 'rna';
234							} elsif ($mol =~ /AA/) {
235	0					0	$alphabet = 'protein';
236							}
237							}
238							} else {
239
240							# Old style header (EMBL Release < 87, before June 2006)
241	14	100				115	if ($line =~ /^ID\s+(\S+)[^;];\s+(\S+)[^;];\s+(\S+)[^;]*;/) {
242	12					63	($name, $mol, $div) = ($1, $2, $3);
243							}
244
245	14	100				34	if ($mol) {
246	12	50				38	if ( $mol =~ /circular/ ) {
247	0					0	$params{'-is_circular'} = 1;
248	0					0	$mol =~ s\|circular \|\|;
249							}
250	12	50				35	if (defined $mol ) {
251	12	100				49	if ($mol =~ /DNA/) {
		50
		0
252	10					25	$alphabet='dna';
253							} elsif ($mol =~ /RNA/) {
254	2					4	$alphabet='rna';
255							} elsif ($mol =~ /AA/) {
256	0					0	$alphabet='protein';
257							}
258							}
259							}
260							}
261
262	24	100	66			150	unless( defined $name && length($name) ) {
263	5					12	$name = "unknown_id";
264							}
265
266							# $self->warn("not parsing upper annotation in EMBL file yet!");
267	24					43	my $buffer = $line;
268	24					39	local $_;
269	24					37	BEFORE_FEATURE_TABLE :
270							my $ncbi_taxid;
271	24					88	until ( !defined $buffer ) {
272	499					741	$_ = $buffer;
273							# Exit at start of Feature table
274	499	100				1476	if ( /^(F[HT]\|SQ)/ ) {
275	24	100	100			196	$self->_pushback($_) if( $1 eq 'SQ' \|\| $1 eq 'FT');
276	24					75	last;
277							}
278							# Description line(s)
279	475	100				993	if (/^DE\s+(\S.*\S)/) {
280	22	100				110	$desc .= $desc ? " $1" : $1;
281							}
282
283							#accession number
284	475	100	100			1538	if ( /^AC\s+(.)?/ \|\| /^PA\s+(.)?/) {
285	24					164	my @accs = split(/[; ]+/, $1); # allow space in addition
286							$params{'-accession_number'} = shift @accs
287	24	100				107	unless defined $params{'-accession_number'};
288	24					45	push @{$params{'-secondary_accessions'}}, @accs;
	24					86
289							}
290
291							#version number
292	475	100				851	if ( /^SV\s+\S+\.(\d+);?/ ) {
293	6					17	my $sv = $1;
294							#$sv =~ s/\;//;
295	6					16	$params{'-seq_version'} = $sv;
296	6					14	$params{'-version'} = $sv;
297							}
298
299							#date (NOTE: takes last date line)
300	475	100				816	if ( /^DT\s+(.+)$/ ) {
301	26					74	my $line = $1;
302	26					97	my ($date, $version) = split(' ', $line, 2);
303	26					51	$date =~ tr/,//d; # remove comma if new version
304	26	100				62	if ($version) {
305	24	100				137	if ($version =~ /$Rel\. (\d+), Created$/ms ) {
		100
306	11					99	my $release = Bio::Annotation::SimpleValue->new(
307							-tagname => 'creation_release',
308							-value => $1
309							);
310	11					74	$annotation->add_Annotation($release);
311							} elsif ($version =~ /$Rel\. (\d+), Last updated, Version (\d+)$/ms ) {
312	11					55	my $release = Bio::Annotation::SimpleValue->new(
313							-tagname => 'update_release',
314							-value => $1
315							);
316	11					47	$annotation->add_Annotation($release);
317
318	11					43	my $update = Bio::Annotation::SimpleValue->new(
319							-tagname => 'update_version',
320							-value => $2
321							);
322	11					30	$annotation->add_Annotation($update);
323							}
324							}
325	26					40	push @{$params{'-dates'}}, $date;
	26					85
326							}
327
328							#keywords
329	475	100				1953	if ( /^KW (.)\S$/ ) {
		100
		100
		100
		100
		100
330	29					161	my @kw = split(/\s\;\s/,$1);
331	29					55	push @{$params{'-keywords'}}, @kw;
	29					114
332							}
333
334							# Organism name and phylogenetic information
335							elsif (/^O[SC]/) {
336							# pass the accession number so we can give an informative throw message if necessary
337	21					124	my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
338	21					78	$params{'-species'}= $species;
339							}
340
341							# NCBI TaxID Xref
342							elsif (/^OX/) {
343	3	50				34	if (/NCBI_TaxID=(\d+)/) {
344	3					19	$ncbi_taxid=$1;
345							}
346
347	3					17	my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
348	3					10	foreach my $dblink ( @links ) {
349	1					7	$annotation->add_Annotation('dblink',$dblink);
350							}
351							}
352
353							# References
354							elsif (/^R/) {
355	99					311	my @refs = $self->_read_EMBL_References(\$buffer);
356	99					168	foreach my $ref ( @refs ) {
357	99					327	$annotation->add_Annotation('reference',$ref);
358							}
359							}
360
361							# DB Xrefs
362							elsif (/^DR/) {
363	5					27	my @links = $self->_read_EMBL_DBLink(\$buffer);
364	5					13	foreach my $dblink ( @links ) {
365	25					51	$annotation->add_Annotation('dblink',$dblink);
366							}
367							}
368
369							# Comments
370							elsif (/^CC\s+(.*)/) {
371	15					63	$comment .= $1;
372	15					35	$comment .= " ";
373	15					47	while (defined ($_ = $self->_readline) ) {
374	257	100				724	if (/^CC\s+(.*)/) {
375	242					470	$comment .= $1;
376	242					397	$comment .= " ";
377							} else {
378	15					34	last;
379							}
380							}
381	15					146	my $commobj = Bio::Annotation::Comment->new();
382	15					71	$commobj->text($comment);
383	15					60	$annotation->add_Annotation('comment',$commobj);
384	15					30	$comment = "";
385							}
386
387							# Get next line.
388	475					1036	$buffer = $self->_readline;
389							}
390
391	24					69	while ( defined ($_ = $self->_readline) ) {
392	46	100				168	/^FT\s{3}\w/ && last;
393	24	100				63	/^SQ / && last;
394	22	50				68	/^CO / && last;
395							}
396	24					49	$buffer = $_;
397
398	24	100	66			186	if (defined($buffer) && $buffer =~ /^FT /) {
399	22					77	until ( !defined ($buffer) ) {
400	256					757	my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
401
402							# process ftunit
403	256					750	my $feat =
404							$ftunit->_generic_seqfeature($self->location_factory(), $name);
405
406							# add taxon_id from source if available
407							# Notice, this will override what is found in the OX line.
408							# this is by design as this seems to be the official way
409							# of specifying a TaxID
410	256	100	100			793	if ($params{'-species'} && ($feat->primary_tag eq 'source')
			100
			66
411							&& $feat->has_tag('db_xref')
412							&& (! $params{'-species'}->ncbi_taxid())) {
413	12					44	foreach my $tagval ($feat->get_tag_values('db_xref')) {
414	12	50				54	if (index($tagval,"taxon:") == 0) {
415	12					60	$params{'-species'}->ncbi_taxid(substr($tagval,6));
416	12					38	last;
417							}
418							}
419							}
420
421							# add feature to list of features
422	256					437	push(@features, $feat);
423
424	256	100				1460	if ( $buffer !~ /^FT/ ) {
425	22					88	last;
426							}
427							}
428							}
429							# Set taxid found in OX line
430	24	100	100			175	if ($params{'-species'} && defined $ncbi_taxid
			100
431							&& (! $params{'-species'}->ncbi_taxid())) {
432	2					9	$params{'-species'}->ncbi_taxid($ncbi_taxid);
433							}
434
435							# skip comments
436	24		66			200	while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437	20					72	$buffer = $self->_readline();
438							}
439
440	24	100				78	if ( $buffer =~ /^CO/ ) {
441							# bug#2982
442							# special : create contig as annotation
443	4					16	while ( defined ($buffer) ) {
444	4					18	$annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
445	4	50	66			46	if ( !$buffer \|\| $buffer !~ /^CO/ ) {
446	4					9	last;
447							}
448							}
449	4		100			14	$buffer \|\|= '';
450							}
451	24	100				94	if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
452	20	100				127	if ( $buffer !~ /^SQ/ ) {
453	1					9	while ( defined ($_ = $self->_readline) ) {
454	0	0				0	/^SQ/ && last;
455							}
456							}
457	20					39	$seqc = "";
458	20					61	while ( defined ($_ = $self->_readline) ) {
459	2924	100				5750	m{^//} && last;
460	2906					4154	$_ = uc($_);
461	2906					29191	s/[^A-Za-z]//g;
462	2906					7603	$seqc .= $_;
463							}
464							}
465	24					143	my $seq = $self->sequence_factory->create
466							(-verbose => $self->verbose(),
467							-division => $div,
468							-seq => $seqc,
469							-desc => $desc,
470							-display_id => $name,
471							-annotation => $annotation,
472							-molecule => $mol,
473							-alphabet => $alphabet,
474							-features => \@features,
475							%params);
476	24					385	return $seq;
477							}
478
479
480
481							=head2 _write_ID_line
482
483							Title : _write_ID_line
484							Usage : $self->_write_ID_line($seq);
485							Function: Writes the EMBL Release 87 format ID line to the stream, unless
486							: there is a user-supplied ID line generation function in which
487							: case that is used instead.
488							: ( See Bio::SeqIO::embl::_id_generation_function(). )
489							Returns : nothing
490							Args : Bio::Seq object
491
492							=cut
493
494							sub _write_ID_line {
495
496	11			11		30	my ($self, $seq) = @_;
497
498	11					25	my $id_line;
499							# If there is a user-supplied ID generation function, use it.
500	11	50				47	if ( $self->_id_generation_func ) {
501	0					0	$id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
	0					0
502							}
503							# Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
504							else {
505
506							# The sequence name is supposed to be the primary accession number,
507	11					51	my $name = $seq->accession_number();
508	11	100	66			90	if ( not(defined $name) \|\| $name eq 'unknown') {
509							# but if it is not present, use the sequence ID or the empty string
510	5		100			25	$name = $seq->id() \|\| '';
511							}
512
513	11	50				55	$self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
514
515							# Use the sequence version, or default to 1.
516	11		50			53	my $version = $seq->version() \|\| 1;
517
518	11					52	my $len = $seq->length();
519
520							# Taxonomic division.
521	11					18	my $div;
522	11	100	100			214	if ( $seq->can('division') && defined($seq->division) &&
			66
523							$self->_is_valid_division($seq->division) ) {
524	2					7	$div = $seq->division();
525							} else {
526	9		50			66	$div \|\|= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
527							}
528
529	11					24	my $mol;
530							# If the molecule type is a valid EMBL type, use it.
531	11	50	100			143	if ( $seq->can('molecule')
		100	66
			100
532							&& defined($seq->molecule)
533							&& $self->_is_valid_molecule_type($seq->molecule)
534							) {
535	0					0	$mol = $seq->molecule();
536							}
537							# Otherwise, choose unassigned DNA or RNA based on the alphabet.
538							elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539	6					16	my $alphabet =$seq->primary_seq->alphabet;
540	6	50				19	if ($alphabet eq 'dna') {
		0
		0
541	6					14	$mol ='unassigned DNA';
542							} elsif ($alphabet eq 'rna') {
543	0					0	$mol='unassigned RNA';
544							} elsif ($alphabet eq 'protein') {
545	0					0	$self->warn("Protein sequence found; EMBL is a nucleotide format.");
546	0					0	$mol='AA'; # AA is not a valid EMBL molecule type.
547							}
548							}
549
550	11					30	my $topology = 'linear';
551	11	50				51	if ($seq->is_circular) {
552	0					0	$topology = 'circular';
553							}
554
555	11		100			52	$mol \|\|= ''; # 'unassigned'; ?
556	11					76	$id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
557							}
558	11					292	$self->_print($id_line);
559							}
560
561							=head2 _is_valid_division
562
563							Title : _is_valid_division
564							Usage : $self->_is_valid_division($div)
565							Function: tests division code for validity
566							Returns : true if $div is a valid EMBL release 87 taxonomic division.
567							Args : taxonomic division code string
568
569							=cut
570
571							sub _is_valid_division {
572	2			2		11	my ($self, $division) = @_;
573
574	2					35	my %EMBL_divisions = (
575							"PHG" => 1, # Bacteriophage
576							"ENV" => 1, # Environmental Sample
577							"FUN" => 1, # Fungal
578							"HUM" => 1, # Human
579							"INV" => 1, # Invertebrate
580							"MAM" => 1, # Other Mammal
581							"VRT" => 1, # Other Vertebrate
582							"MUS" => 1, # Mus musculus
583							"PLN" => 1, # Plant
584							"PRO" => 1, # Prokaryote
585							"ROD" => 1, # Other Rodent
586							"SYN" => 1, # Synthetic
587							"UNC" => 1, # Unclassified
588							"VRL" => 1 # Viral
589							);
590
591	2					15	return exists($EMBL_divisions{$division});
592							}
593
594							=head2 _is_valid_molecule_type
595
596							Title : _is_valid_molecule_type
597							Usage : $self->_is_valid_molecule_type($mol)
598							Function: tests molecule type for validity
599							Returns : true if $mol is a valid EMBL release 87 molecule type.
600							Args : molecule type string
601
602							=cut
603
604							sub _is_valid_molecule_type {
605	2			2		8	my ($self, $moltype) = @_;
606
607	2					23	my %EMBL_molecule_types = (
608							"genomic DNA" => 1,
609							"genomic RNA" => 1,
610							"mRNA" => 1,
611							"tRNA" => 1,
612							"rRNA" => 1,
613							"snoRNA" => 1,
614							"snRNA" => 1,
615							"scRNA" => 1,
616							"pre-RNA" => 1,
617							"other RNA" => 1,
618							"other DNA" => 1,
619							"unassigned DNA" => 1,
620							"unassigned RNA" => 1
621							);
622
623	2					26	return exists($EMBL_molecule_types{$moltype});
624							}
625
626							=head2 write_seq
627
628							Title : write_seq
629							Usage : $stream->write_seq($seq)
630							Function: writes the $seq object (must be seq) to the stream
631							Returns : 1 for success and undef for error
632							Args : array of 1 to n Bio::SeqI objects
633
634
635							=cut
636
637							sub write_seq {
638	13			13	1	114	my ($self,@seqs) = @_;
639
640	13					40	foreach my $seq ( @seqs ) {
641	13	50				44	$self->throw("Attempting to write with no seq!") unless defined $seq;
642	13	100	100			129	unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643	4	50				22	$self->warn("$seq is not a SeqI compliant sequence object!")
644							if $self->verbose >= 0;
645	4	100	66			30	unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646	2					34	$self->throw("$seq is not a PrimarySeqI compliant sequence object!");
647							}
648							}
649	11		100			61	my $str = $seq->seq \|\| '';
650
651							# Write the ID line.
652	11					53	$self->_write_ID_line($seq);
653
654
655							# Write the accession line if present
656	11					21	my( $acc );
657							{
658	11	50	66			25	if ( my $func = $self->_ac_generation_func ) {
	11	100				38
		50
659	0					0	$acc = &{$func}($seq);
	0					0
660							} elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661							defined($seq->accession_number) ) {
662	4					12	$acc = $seq->accession_number;
663	4					22	$acc = join("; ", $acc, $seq->get_secondary_accessions);
664							} elsif ( $seq->can('accession_number') ) {
665	7					27	$acc = $seq->accession_number;
666							}
667
668	11	50				44	if (defined $acc) {
669	11	50				59	$self->_print("AC $acc;\n",
670							"XX\n") \|\| return;
671							}
672							}
673
674							# Date lines
675	11					32	my $switch=0;
676	11	100				73	if ( $seq->can('get_dates') ) {
677	4					18	my @dates = $seq->get_dates();
678	4					8	my $ct = 1;
679	4					8	my $date_flag = 0;
680	4					16	my ($cr) = $seq->annotation->get_Annotations("creation_release");
681	4					16	my ($ur) = $seq->annotation->get_Annotations("update_release");
682	4					13	my ($uv) = $seq->annotation->get_Annotations("update_version");
683
684	4	0	33			14	unless ($cr && $ur && $ur) {
			33
685	4					8	$date_flag = 1;
686							}
687
688	4					10	foreach my $dt (@dates) {
689	0	0				0	if (!$date_flag) {
690	0	0				0	$self->_write_line_EMBL_regex("DT ","DT ",
691							$dt." (Rel. ".($cr->value()).", Created)",
692							'\s+\|$',80) if $ct == 1;
693	0	0				0	$self->_write_line_EMBL_regex("DT ","DT ",
694							$dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695							'\s+\|$',80) if $ct == 2;
696							} else { # other formats?
697	0					0	$self->_write_line_EMBL_regex("DT ","DT ",
698							$dt,'\s+\|$',80);
699							}
700	0					0	$switch =1;
701	0					0	$ct++;
702							}
703	4	50				14	if ($switch == 1) {
704	0	0				0	$self->_print("XX\n") \|\| return;
705							}
706							}
707
708							# Description lines
709	11	50				60	$self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+\|$',80) \|\| return; #'
710	11	50				38	$self->_print( "XX\n") \|\| return;
711
712							# if there, write the kw line
713							{
714	11					21	my( $kw );
	11					22
715	11	50				61	if ( my $func = $self->_kw_generation_func ) {
		100
716	0					0	$kw = &{$func}($seq);
	0					0
717							} elsif ( $seq->can('keywords') ) {
718	4					16	$kw = $seq->keywords;
719							}
720	11	100				44	if (defined $kw) {
721	4	50				632	$self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+\|$', 80) \|\| return; #'
722	4	50				14	$self->_print( "XX\n") \|\| return;
723							}
724							}
725
726							# Organism lines
727
728	11	100	100			112	if ($seq->can('species') && (my $spec = $seq->species)) {
729	6					39	my @class = $spec->classification();
730	6					11	shift @class; # get rid of species name. Some embl files include
731							# the species name in the OC lines, but this seems
732							# more like an error than something we need to
733							# emulate
734	6					34	my $OS = $spec->scientific_name;
735	6	50				24	if ($spec->common_name) {
736	0					0	$OS .= ' ('.$spec->common_name.')';
737							}
738	6	50				290	$self->_print("OS $OS\n") \|\| return;
739	6					41	my $OC = join('; ', reverse(@class)) .'.';
740	6	50				22	$self->_write_line_EMBL_regex("OC ","OC ",$OC,'; \|$',80) \|\| return;
741	6	50				29	if ($spec->organelle) {
742	0	0				0	$self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; \|$',80) \|\| return;
743							}
744	6					21	my $ncbi_taxid = $spec->ncbi_taxid;
745	6	100				16	if ($ncbi_taxid) {
746	2	50				10	$self->_print("OX NCBI_TaxID=$ncbi_taxid\n") \|\| return;
747							}
748	6	50				21	$self->_print("XX\n") \|\| return;
749							}
750							# Reference lines
751	11					27	my $t = 1;
752	11	100	66			88	if ( $seq->can('annotation') && defined $seq->annotation ) {
753	9					24	foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754	6	50				19	$self->_print( "RN [$t]\n") \|\| return;
755
756							# Having no RP line is legal, but we need both
757							# start and end for a valid location.
758	6	50				21	if ($ref->comment) {
759	0	0				0	$self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+\|$', 80) \|\| return; #'
760							}
761	6					17	my $start = $ref->start;
762	6					18	my $end = $ref->end;
763	6	50	33			20	if ($start and $end) {
		0	0
764	6	50				16	$self->_print( "RP $start-$end\n") \|\| return;
765							} elsif ($start or $end) {
766	0					0	$self->throw("Both start and end are needed for a valid RP line.".
767							" Got: start='$start' end='$end'");
768							}
769
770	6	50				17	if (my $med = $ref->medline) {
771	0	0				0	$self->_print( "RX MEDLINE; $med.\n") \|\| return;
772							}
773	6	50				13	if (my $pm = $ref->pubmed) {
774	0	0				0	$self->_print( "RX PUBMED; $pm.\n") \|\| return;
775							}
776	6					15	my $authors = $ref->authors;
777	6					73	$authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
778
779	6	50				22	$self->_write_line_EMBL_regex("RA ", "RA ",
780							$authors . ";",
781							'\s+\|$', 80) \|\| return; #'
782
783							# If there is no title to the reference, it appears
784							# as a single semi-colon. All titles must end in
785							# a semi-colon.
786	6		100			17	my $ref_title = $ref->title \|\| '';
787	6					40	$ref_title =~ s/[\s;]*$/;/;
788	6	50				16	$self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+\|$', 80) \|\| return; #'
789	6	50				16	$self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+\|$', 80) \|\| return; #'
790	6	50				14	$self->_print("XX\n") \|\| return;
791	6					14	$t++;
792							}
793
794							# DB Xref lines
795	9	100				47	if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796	2					4	for my $dr (@db_xref) {
797	43					84	my $db_name = $dr->database;
798	43					74	my $prim = $dr->primary_id;
799
800	43		50			74	my $opt = $dr->optional_id \|\| '';
801	43	50				102	my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
802	43	50				68	$self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+\|$', 80) \|\| return; #'
803							}
804	2	50				9	$self->_print("XX\n") \|\| return;
805							}
806
807							# Comment lines
808	9					37	foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809	4	50				19	$self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+\|$', 80) \|\| return; #'
810	4	50				14	$self->_print("XX\n") \|\| return;
811							}
812							}
813							# "\\s\+\\|\$"
814
815							## FEATURE TABLE
816
817	11	50				60	$self->_print("FH Key Location/Qualifiers\n") \|\| return;
818	11	50				34	$self->_print("FH\n") \|\| return;
819
820	11	100				240	my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
821	11	100				42	if ($feats[0]) {
822	6	50				25	if ( defined $self->_post_sort ) {
823							# we need to read things into an array.
824							# Process. Sort them. Print 'em
825
826	0					0	my $post_sort_func = $self->_post_sort();
827	0					0	my @fth;
828
829	0					0	foreach my $sf ( @feats ) {
830	0					0	push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
831							}
832
833	0					0	@fth = sort { &$post_sort_func($a,$b) } @fth;
	0					0
834
835	0					0	foreach my $fth ( @fth ) {
836	0	0				0	$self->_print_EMBL_FTHelper($fth) \|\| return;
837							}
838							} else {
839							# not post sorted. And so we can print as we get them.
840							# lower memory load...
841
842	6					23	foreach my $sf ( @feats ) {
843	38					138	my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
844	38					73	foreach my $fth ( @fth ) {
845	38	50				99	if ( $fth->key eq 'CONTIG') {
846	0					0	$self->_show_dna(0);
847							}
848	38	50				138	$self->_print_EMBL_FTHelper($fth) \|\| return;
849							}
850							}
851							}
852							}
853
854	11	50				61	if ( $self->_show_dna() == 0 ) {
855	0	0				0	$self->_print( "//\n") \|\| return;
856	0					0	return;
857							}
858	11	50				42	$self->_print( "XX\n") \|\| return;
859
860							# finished printing features.
861
862							# print contig if present : bug#2982
863	11	100	66			127	if ( $seq->can('annotation') && defined $seq->annotation) {
864	9					42	foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
865	0	0				0	if ($ctg->value) {
866	0	0				0	$self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867							'[,]\|$', 80) \|\| return;
868							}
869							}
870							}
871							# print sequence lines only if sequence is present! bug#2982
872	11	100				46	if (length($str)) {
873	8					48	$str =~ tr/A-Z/a-z/;
874
875							# Count each nucleotide
876	8					30	my $alen = $str =~ tr/a/a/;
877	8					29	my $clen = $str =~ tr/c/c/;
878	8					27	my $glen = $str =~ tr/g/g/;
879	8					23	my $tlen = $str =~ tr/t/t/;
880
881	8					32	my $len = $seq->length();
882	8					24	my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
883	8	50				32	if ( $olen < 0 ) {
884	0					0	$self->warn("Weird. More atgc than bases. Problem!");
885							}
886
887	8	50				63	$self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") \|\| return;
888
889	8					19	my $nuc = 60; # Number of nucleotides per line
890	8					20	my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
891	8					14	my $out_pat = 'A11' x 6; # Pattern for packing a line
892	8					16	my $length = length($str);
893
894							# Calculate the number of nucleotides which fit on whole lines
895	8					45	my $whole = int($length / $nuc) * $nuc;
896
897							# Print the whole lines
898	8					12	my( $i );
899	8					33	for ($i = 0; $i < $whole; $i += $nuc) {
900	162					752	my $blocks = pack $out_pat,
901							unpack $whole_pat,
902							substr($str, $i, $nuc);
903	162	50				718	$self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) \|\| return;
904							}
905
906							# Print the last line
907	8	50				46	if (my $last = substr($str, $i)) {
908	8					19	my $last_len = length($last);
909	8					46	my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
910	8					83	my $blocks = pack $out_pat,
911							unpack($last_pat, $last);
912	8	50				67	$self->_print(sprintf(" $blocks%9d\n", $length)) \|\|
913							return; # Add the length to the end
914							}
915							}
916
917	11	50				49	$self->_print( "//\n") \|\| return;
918
919	11	50	33			76	$self->flush if $self->_flush_on_write && defined $self->_fh;
920							}
921	11					85	return 1;
922							}
923
924							=head2 _print_EMBL_FTHelper
925
926							Title : _print_EMBL_FTHelper
927							Usage :
928							Function: Internal function
929							Returns : 1 if writing succeeded, otherwise undef
930							Args :
931
932
933							=cut
934
935							sub _print_EMBL_FTHelper {
936	38			38		81	my ($self,$fth) = @_;
937
938	38	50	33			238	if ( ! ref $fth \|\| ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939	0					0	$fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
940							}
941
942
943							#$self->_print( "FH Key Location/Qualifiers\n");
944							#$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
945							# let
946	38	50				102	if ( $fth->key eq 'CONTIG' ) {
947	0	0				0	$self->_print("XX\n") \|\| return;
948	0	0				0	$self->_write_line_EMBL_regex("CO ",
949							"CO ",$fth->loc,
950							'\,\|$',80) \|\| return; #'
951	0					0	return 1;
952							}
953	38	50				110	$self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
954							"FT ",$fth->loc,
955							'\,\|$',80) \|\| return; #'
956	38					76	foreach my $tag (sort keys %{$fth->field} ) {
	38					105
957	108	50				249	if ( ! defined $fth->field->{$tag} ) {
958	0					0	next;
959							}
960	108					138	foreach my $value ( @{$fth->field->{$tag}} ) {
	108					190
961	139					243	$value =~ s/\"/\"\"/g;
962	139	50	66			412	if ($value eq "_no_value") {
		50
963	0	0				0	$self->_write_line_EMBL_regex("FT ",
964							"FT ",
965							"/$tag",'.\|$',80) \|\| return; #'
966							}
967							# there are almost 3x more quoted qualifier values and they
968							# are more common too so we take quoted ones first
969							#
970							# Long qualifiers, that will be line wrapped, are always quoted
971							elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972	139	100				413	my $pat = $value =~ /\s+/ ? '\s+\|\-\|$' : '.\|\-\|$';
973	139	50				414	$self->_write_line_EMBL_regex("FT ",
974							"FT ",
975							"/$tag=\"$value\"",$pat,80) \|\| return;
976							} else {
977	0	0				0	$self->_write_line_EMBL_regex("FT ",
978							"FT ",
979							"/$tag=$value",'.\|$',80) \|\| return; #'
980							}
981							}
982							}
983
984	38					273	return 1;
985							}
986
987
988
989							=head2 _read_EMBL_Contig()
990
991							Title : _read_EMBL_Contig
992							Usage :
993							Function: convert CO lines into annotations
994							Returns :
995							Args :
996
997							=cut
998
999							sub _read_EMBL_Contig {
1000	4			4		11	my ($self, $buffer) = @_;
1001	4					6	my @ret;
1002	4	50				18	if ( $$buffer !~ /^CO/ ) {
1003	0					0	warn("Not parsing line '$$buffer' which maybe important");
1004							}
1005	4					14	$self->_pushback($$buffer);
1006	4					12	while ( defined ($_ = $self->_readline) ) {
1007	356	100				906	/^C/ \|\| last;
1008	353	50				898	/^CO\s+(.*)/ && do {
1009	353					907	push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010							-value => $1);
1011							};
1012							}
1013	4					12	$$buffer = $_;
1014	4					59	return @ret;
1015
1016							}
1017							#'
1018							=head2 _read_EMBL_References
1019
1020							Title : _read_EMBL_References
1021							Usage :
1022							Function: Reads references from EMBL format. Internal function really
1023							Example :
1024							Returns :
1025							Args :
1026
1027
1028							=cut
1029
1030							sub _read_EMBL_References {
1031	99			99		184	my ($self,$buffer) = @_;
1032	99					130	my (@refs);
1033
1034							# assume things are starting with RN
1035
1036	99	50				289	if ( $$buffer !~ /^RN/ ) {
1037	0					0	warn("Not parsing line '$$buffer' which maybe important");
1038							}
1039	99					519	my $b1;
1040							my $b2;
1041	99					0	my $title;
1042	99					0	my $loc;
1043	99					0	my $au;
1044	99					0	my $med;
1045	99					0	my $pm;
1046	99					0	my $com;
1047
1048	99					185	while ( defined ($_ = $self->_readline) ) {
1049	727	100				1820	/^R/ \|\| last;
1050	628	100				1147	/^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
	40					153
	40					78
1051	628	100				977	/^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
	55					134
1052	628	100				935	/^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
	10					25
1053	628	100				1148	/^RA (.*)/ && do {
1054	193					529	$au = $self->_concatenate_lines($au,$1); next;
	193					384
1055							};
1056	435	100				759	/^RT (.*)/ && do {
1057	164					323	$title = $self->_concatenate_lines($title,$1); next;
	164					326
1058							};
1059	271	100				621	/^RL (.*)/ && do {
1060	156					323	$loc = $self->_concatenate_lines($loc,$1); next;
	156					351
1061							};
1062	115	100				272	/^RC (.*)/ && do {
1063	3					12	$com = $self->_concatenate_lines($com,$1); next;
	3					7
1064							};
1065							}
1066
1067	99					426	my $ref = Bio::Annotation::Reference->new();
1068	99					540	$au =~ s/;\s*$//g;
1069	99					325	$title =~ s/;\s*$//g;
1070
1071	99					295	$ref->start($b1);
1072	99					228	$ref->end($b2);
1073	99					253	$ref->authors($au);
1074	99					224	$ref->title($title);
1075	99					217	$ref->location($loc);
1076	99					278	$ref->medline($med);
1077	99					286	$ref->comment($com);
1078	99					226	$ref->pubmed($pm);
1079
1080	99					133	push(@refs,$ref);
1081	99					170	$$buffer = $_;
1082
1083	99					258	return @refs;
1084							}
1085
1086							=head2 _read_EMBL_Species
1087
1088							Title : _read_EMBL_Species
1089							Usage :
1090							Function: Reads the EMBL Organism species and classification
1091							lines.
1092							Example :
1093							Returns : A Bio::Species object
1094							Args : a reference to the current line buffer, accession number
1095
1096							=cut
1097
1098							sub _read_EMBL_Species {
1099	21			21		74	my( $self, $buffer, $acc ) = @_;
1100	21					39	my $org;
1101
1102	21					50	$_ = $$buffer;
1103	21					49	my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104	21		66			96	while (defined( $_ \|\|= $self->_readline )) {
1105	92	100				446	if (/^OS\s+(.+)/) {
		100
		50
1106	21	50				87	$sci_name .= ($sci_name) ? ' '.$1 : $1;
1107							} elsif (s/^OC\s+(.+)$//) {
1108	50					181	$class_lines .= $1;
1109							} elsif (/^OG\s+(.*)/) {
1110	0					0	$org = $1;
1111							} else {
1112	21					59	last;
1113							}
1114
1115	71					250	$_ = undef; # Empty $_ to trigger read of next line
1116							}
1117
1118							# $$buffer = $_;
1119	21					96	$self->_pushback($_);
1120
1121	21					40	$sci_name =~ s{\.$}{};
1122	21	50				60	$sci_name \|\| return;
1123
1124							# Convert data in classification lines into classification array.
1125							# only split on ';' or '.' so that classification that is 2 or more words
1126							# will still get matched, use map() to remove trailing/leading/intervening
1127							# spaces
1128	21					242	my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?
	224					401
	224					326
	224					244
	224					333
1129
1130							# do we have a genus?
1131	21					64	my $possible_genus = $class[-1];
1132	21	50				80	$possible_genus .= "\|$class[-2]" if $class[-2];
1133	21	50				686	if ($sci_name =~ /^($possible_genus)/) {
1134	21					74	$genus = $1;
1135	21					263	($species) = $sci_name =~ /^$genus\s+(.+)/;
1136							} else {
1137	0					0	$species = $sci_name;
1138							}
1139
1140							# Don't make a species object if it is "Unknown" or "None"
1141	21	50				86	if ($genus) {
1142	21	50				111	return if $genus =~ /^(Unknown\|None)$/i;
1143							}
1144
1145							# is this organism of rank species or is it lower?
1146							# (doesn't catch everything, but at least the guess isn't dangerous)
1147	21	50				120	if ($species =~ /subsp\.\|var\./) {
1148	0					0	($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.\|var\.).+)/;
1149							}
1150
1151							# sometimes things have common name in brackets, like
1152							# Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153							# name bit. Probably dangerous if real scientific species name ends in
1154							# bracketed bit.
1155	21	50				67	unless ($class[-1] eq 'Viruses') {
1156	21					110	($species, $common) = $species =~ /^(.+)\s+$(.+)$$/;
1157	21	100				110	$sci_name =~ s/\s+$.+$$// if $common;
1158							}
1159
1160							# Bio::Species array needs array in Species -> Kingdom direction
1161	21	50				65	unless ($class[-1] eq $sci_name) {
1162	21					59	push(@class, $sci_name);
1163							}
1164	21					44	@class = reverse @class;
1165
1166							# do minimal sanity checks before we hand off to Bio::Species which won't
1167							# be able to give informative throw messages if it has to throw because
1168							# of problems here
1169	21	50				67	$self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170	21					35	my %names;
1171	21					90	foreach my $i (0..$#class) {
1172	245					270	my $name = $class[$i];
1173	245					406	$names{$name}++;
1174							# this code breaks examples like: Xenopus (Silurana) tropicalis
1175							# commenting out, see bug 3158
1176
1177							#if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178							# $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179							#}
1180							}
1181	21					170	my $make = Bio::Species->new();
1182	21					112	$make->scientific_name($sci_name);
1183	21					93	$make->classification(@class);
1184	21	50				97	unless ($class[-1] eq 'Viruses') {
1185	21	50				136	$make->genus($genus) if $genus;
1186	21	100				96	$make->species($species) if $species;
1187	21	50				66	$make->sub_species($sub_species) if $sub_species;
1188	21	100				89	$make->common_name($common) if $common;
1189							}
1190	21	50				56	$make->organelle($org) if $org;
1191	21					208	return $make;
1192							}
1193
1194							=head2 _read_EMBL_DBLink
1195
1196							Title : _read_EMBL_DBLink
1197							Usage :
1198							Function: Reads the EMBL database cross reference ("DR") lines
1199							Example :
1200							Returns : A list of Bio::Annotation::DBLink objects
1201							Args :
1202
1203							=cut
1204
1205							sub _read_EMBL_DBLink {
1206	5			5		11	my( $self,$buffer ) = @_;
1207	5					7	my( @db_link );
1208
1209	5					15	$_ = $$buffer;
1210	5		66			23	while (defined( $_ \|\|= $self->_readline )) {
1211	30	100				145	if ( /^DR ([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?\.$/) {
1212	25					72	my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213	25					77	my $link = Bio::Annotation::DBLink->new(-database => $databse,
1214							-primary_id => $prim_id,
1215							-optional_id => $sec_id);
1216
1217	25					45	push(@db_link, $link);
1218							} else {
1219	5					8	last;
1220							}
1221	25					68	$_ = undef; # Empty $_ to trigger read of next line
1222							}
1223
1224	5					11	$$buffer = $_;
1225	5					12	return @db_link;
1226							}
1227
1228							=head2 _read_EMBL_TaxID_DBLink
1229
1230							Title : _read_EMBL_TaxID_DBLink
1231							Usage :
1232							Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233							Example :
1234							Returns : A list of Bio::Annotation::DBLink objects
1235							Args :
1236
1237							=cut
1238
1239							sub _read_EMBL_TaxID_DBLink {
1240	3			3		9	my( $self,$buffer ) = @_;
1241	3					5	my( @db_link );
1242
1243	3					6	$_ = $$buffer;
1244	3		66			16	while (defined( $_ \|\|= $self->_readline )) {
1245	4	100				68	if ( /^OX (\S+)=(\d+);$/ ) {
1246	1					10	my ($databse, $prim_id) = ($1,$2);
1247	1					20	my $link = Bio::Annotation::DBLink->new(-database => $databse,
1248							-primary_id => $prim_id,);
1249	1					6	push(@db_link, $link);
1250							} else {
1251	3					8	last;
1252							}
1253	1					8	$_ = undef; # Empty $_ to trigger read of next line
1254							}
1255
1256	3					6	$$buffer = $_;
1257	3					8	return @db_link;
1258							}
1259
1260							=head2 _filehandle
1261
1262							Title : _filehandle
1263							Usage : $obj->_filehandle($newval)
1264							Function:
1265							Example :
1266							Returns : value of _filehandle
1267							Args : newvalue (optional)
1268
1269
1270							=cut
1271
1272							sub _filehandle{
1273	0			0		0	my ($obj,$value) = @_;
1274	0	0				0	if ( defined $value) {
1275	0					0	$obj->{'_filehandle'} = $value;
1276							}
1277	0					0	return $obj->{'_filehandle'};
1278
1279							}
1280
1281							=head2 _read_FTHelper_EMBL
1282
1283							Title : _read_FTHelper_EMBL
1284							Usage : _read_FTHelper_EMBL($buffer)
1285							Function: reads the next FT key line
1286							Example :
1287							Returns : Bio::SeqIO::FTHelper object
1288							Args : filehandle and reference to a scalar
1289
1290
1291							=cut
1292
1293							sub _read_FTHelper_EMBL {
1294	256			256		380	my ($self,$buffer) = @_;
1295
1296	256					344	my ($key, # The key of the feature
1297							$loc, # The location line from the feature
1298							@qual, # An arrray of lines making up the qualifiers
1299							);
1300
1301	256	50				1017	if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
		0
1302	256					600	$key = $1;
1303	256					437	$loc = $2;
1304							# Read all the lines up to the next feature
1305	256					587	while ( defined($_ = $self->_readline) ) {
1306	2038	100				11902	if (/^FT(\s+)(.+?)\s*$/) {
1307							# Lines inside features are preceeded by 19 spaces
1308							# A new feature is preceeded by 3 spaces
1309	2016	100				3895	if (length($1) > 4) {
1310							# Add to qualifiers if we're in the qualifiers
1311	1782	100				2838	if (@qual) {
		100
1312	1523					3891	push(@qual, $2);
1313							}
1314							# Start the qualifier list if it's the first qualifier
1315							elsif (substr($2, 0, 1) eq '/') {
1316	254					752	@qual = ($2);
1317							}
1318							# We're still in the location line, so append to location
1319							else {
1320	5					23	$loc .= $2;
1321							}
1322							} else {
1323							# We've reached the start of the next feature
1324	234					328	last;
1325							}
1326							} else {
1327							# We're at the end of the feature table
1328	22					44	last;
1329							}
1330							}
1331							} elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332	0					0	$key = 'CONTIG';
1333	0					0	$loc = $1;
1334							# Read all the lines up to the next feature
1335	0					0	while ( defined($_ = $self->_readline) ) {
1336	0	0				0	if (/^CO\s+(\S+)\s*$/) {
1337	0					0	$loc .= $1;
1338							} else {
1339							# We've reached the start of the next feature
1340	0					0	last;
1341							}
1342							}
1343							} else {
1344							# No feature key
1345	0					0	return;
1346							}
1347
1348							# Put the first line of the next feature into the buffer
1349	256					371	$$buffer = $_;
1350
1351							# Make the new FTHelper object
1352	256					797	my $out = Bio::SeqIO::FTHelper->new();
1353	256					631	$out->verbose($self->verbose());
1354	256					604	$out->key($key);
1355	256					551	$out->loc($loc);
1356
1357							# Now parse and add any qualifiers. (@qual is kept
1358							# intact to provide informative error messages.)
1359	256					272	my $last_unquoted_qualifier;
1360							QUAL:
1361	256					518	for (my $i = 0; $i < @qual; $i++) {
1362	1091					1481	my $data = $qual[$i];
1363	1091					1127	my ( $qualifier, $value );
1364
1365	1091	50				4997	unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s(.))?})) {
1366	0	0				0	if ( defined $last_unquoted_qualifier ) {
1367							# handle case of unquoted multiline - read up everything until the next qualifier
1368	0		0			0	do {
1369							# Protein sequence translations need to be joined without spaces,
1370							# other qualifiers need those.
1371	0	0				0	$value .= ' ' if $qualifier ne "translation";
1372	0					0	$value .= $data;
1373							} while defined($data = $qual[++$i]) && $data !~ m[^/];
1374	0					0	$i--;
1375	0					0	$out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376	0					0	$last_unquoted_qualifier = undef;
1377	0					0	next QUAL;
1378							} else {
1379	0					0	$self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380							. join('', map "$_\n", @qual));
1381							}
1382							}
1383	1091	50				1979	$qualifier = '' if not defined $qualifier;
1384
1385	1091					1420	$last_unquoted_qualifier = undef;
1386	1091	50				1447	if (defined $value) {
1387							# Do we have a quoted value?
1388	1091	100				1672	if (substr($value, 0, 1) eq '"') {
1389							# Keep adding to value until we find the trailing quote
1390							# and the quotes are balanced
1391							QUOTES:
1392	971		66			3663	while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393	686					729	$i++;
1394	686					770	my $next = $qual[$i];
1395	686	50				971	if (!defined($next)) {
1396	0					0	$self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397							"\nAdding quote to close...".
1398							"Check sequence quality!");
1399	0					0	$value .= '"';
1400	0					0	last QUOTES;
1401							}
1402
1403							# Protein sequence translations need to be joined without spaces,
1404							# other qualifiers need those.
1405	686	100				880	if ($qualifier eq "translation") {
1406	569					1600	$value .= $next;
1407							} else {
1408	117					489	$value .= " $next";
1409							}
1410							}
1411							# Trim leading and trailing quotes
1412	971					8297	$value =~ s/^"\|"$//g;
1413							# Undouble internal quotes
1414	971					1680	$value =~ s/""/"/g; #"
1415							} else {
1416	120					169	$last_unquoted_qualifier = $qualifier;
1417							}
1418							} else {
1419	0					0	$value = '_no_value';
1420							}
1421
1422							# Store the qualifier
1423	1091		100			2178	$out->field->{$qualifier} \|\|= [];
1424	1091					1173	push(@{$out->field->{$qualifier}},$value);
	1091					1684
1425							}
1426
1427	256					657	return $out;
1428							}
1429
1430							=head2 _write_line_EMBL
1431
1432							Title : _write_line_EMBL
1433							Usage :
1434							Function: internal function
1435							Example :
1436							Returns : 1 if writing succeeded, else undef
1437							Args :
1438
1439
1440							=cut
1441
1442							sub _write_line_EMBL {
1443	0			0		0	my ($self,$pre1,$pre2,$line,$length) = @_;
1444
1445	0	0				0	$length \|\| $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446	0					0	my $subl = $length - length $pre2;
1447	0					0	my $linel = length $line;
1448	0					0	my $i;
1449
1450	0					0	my $sub = substr($line,0,$length - length $pre1);
1451
1452	0	0				0	$self->_print( "$pre1$sub\n") \|\| return;
1453
1454	0					0	for ($i= ($length - length $pre1);$i < $linel;) {
1455	0					0	$sub = substr($line,$i,($subl));
1456	0	0				0	$self->_print( "$pre2$sub\n") \|\| return;
1457	0					0	$i += $subl;
1458							}
1459
1460	0					0	return 1;
1461							}
1462
1463							=head2 _write_line_EMBL_regex
1464
1465							Title : _write_line_EMBL_regex
1466							Usage :
1467							Function: internal function for writing lines of specified
1468							length, with different first and the next line
1469							left hand headers and split at specific points in the
1470							text
1471							Example :
1472							Returns : nothing
1473							Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1474
1475
1476							=cut
1477
1478							sub _write_line_EMBL_regex {
1479	263			263		549	my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480
1481							#print STDOUT "Going to print with $line!\n";
1482
1483	263	50				456	$length \|\| $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484
1485	263					378	my $subl = $length - (length $pre1) -1 ;
1486	263					257	my( @lines );
1487
1488	263					410	CHUNK: while($line) {
1489	401					633	foreach my $pat ($regex, '[,;\.\/-]\s\|'.$regex, '[,;\.\/-]\|'.$regex) {
1490	401	50				4174	if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491	401					1077	my $l = $1.$2;
1492	401	100				682	$l =~ s/#/ /g # remove word wrap protection char '#'
1493							if $pre1 eq "RA ";
1494	401					566	my $newl = $3;
1495	401					681	$line = substr($line,length($l));
1496							# be strict about not padding spaces according to
1497							# genbank format
1498	401					1135	$l =~ s/\s+$//;
1499	401	50				654	next CHUNK if ($l eq '');
1500	401					528	push(@lines, $l);
1501	401					895	next CHUNK;
1502							}
1503							}
1504							# if we get here none of the patterns matched $subl or less chars
1505	0					0	$self->warn("trouble dissecting \"$line\"\n into chunks ".
1506							"of $subl chars or less - this tag won't print right");
1507							# insert a space char to prevent infinite loops
1508	0					0	$line = substr($line,0,$subl) . " " . substr($line,$subl);
1509							}
1510	263					397	my $s = shift @lines;
1511	263	100	50			1072	($self->_print("$pre1$s\n") \|\| return) if $s;
1512	263					462	foreach my $s ( @lines ) {
1513	145	50				300	$self->_print("$pre2$s\n") \|\| return;
1514							}
1515
1516	263					707	return 1;
1517							}
1518
1519							=head2 _post_sort
1520
1521							Title : _post_sort
1522							Usage : $obj->_post_sort($newval)
1523							Function:
1524							Returns : value of _post_sort
1525							Args : newvalue (optional)
1526
1527
1528							=cut
1529
1530							sub _post_sort{
1531	6			6		11	my $obj = shift;
1532	6	50				21	if ( @_ ) {
1533	0					0	my $value = shift;
1534	0					0	$obj->{'_post_sort'} = $value;
1535							}
1536	6					24	return $obj->{'_post_sort'};
1537
1538							}
1539
1540							=head2 _show_dna
1541
1542							Title : _show_dna
1543							Usage : $obj->_show_dna($newval)
1544							Function:
1545							Returns : value of _show_dna
1546							Args : newvalue (optional)
1547
1548
1549							=cut
1550
1551							sub _show_dna{
1552	42			42		77	my $obj = shift;
1553	42	100				119	if ( @_ ) {
1554	31					60	my $value = shift;
1555	31					79	$obj->{'_show_dna'} = $value;
1556							}
1557	42					121	return $obj->{'_show_dna'};
1558
1559							}
1560
1561							=head2 _id_generation_func
1562
1563							Title : _id_generation_func
1564							Usage : $obj->_id_generation_func($newval)
1565							Function:
1566							Returns : value of _id_generation_func
1567							Args : newvalue (optional)
1568
1569
1570							=cut
1571
1572							sub _id_generation_func{
1573	11			11		24	my $obj = shift;
1574	11	50				34	if ( @_ ) {
1575	0					0	my $value = shift;
1576	0					0	$obj->{'_id_generation_func'} = $value;
1577							}
1578	11					39	return $obj->{'_id_generation_func'};
1579
1580							}
1581
1582							=head2 _ac_generation_func
1583
1584							Title : _ac_generation_func
1585							Usage : $obj->_ac_generation_func($newval)
1586							Function:
1587							Returns : value of _ac_generation_func
1588							Args : newvalue (optional)
1589
1590
1591							=cut
1592
1593							sub _ac_generation_func{
1594	11			11		28	my $obj = shift;
1595	11	50				32	if ( @_ ) {
1596	0					0	my $value = shift;
1597	0					0	$obj->{'_ac_generation_func'} = $value;
1598							}
1599	11					190	return $obj->{'_ac_generation_func'};
1600
1601							}
1602
1603							=head2 _sv_generation_func
1604
1605							Title : _sv_generation_func
1606							Usage : $obj->_sv_generation_func($newval)
1607							Function:
1608							Returns : value of _sv_generation_func
1609							Args : newvalue (optional)
1610
1611
1612							=cut
1613
1614							sub _sv_generation_func{
1615	0			0		0	my $obj = shift;
1616	0	0				0	if ( @_ ) {
1617	0					0	my $value = shift;
1618	0					0	$obj->{'_sv_generation_func'} = $value;
1619							}
1620	0					0	return $obj->{'_sv_generation_func'};
1621
1622							}
1623
1624							=head2 _kw_generation_func
1625
1626							Title : _kw_generation_func
1627							Usage : $obj->_kw_generation_func($newval)
1628							Function:
1629							Returns : value of _kw_generation_func
1630							Args : newvalue (optional)
1631
1632
1633							=cut
1634
1635							sub _kw_generation_func{
1636	11			11		24	my $obj = shift;
1637	11	50				30	if ( @_ ) {
1638	0					0	my $value = shift;
1639	0					0	$obj->{'_kw_generation_func'} = $value;
1640							}
1641	11					103	return $obj->{'_kw_generation_func'};
1642
1643							}
1644
1645							1;