File Coverage

Bio/SeqIO/embl.pm

Criterion	Covered	Total	%
statement	538	651	82.6
branch	267	434	61.5
condition	90	128	70.3
subroutine	30	33	90.9
pod	2	2	100.0
total	927	1248	74.2

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::SeqIO::EMBL
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Ewan Birney
7							#
8							# Copyright Ewan Birney
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqIO::embl - EMBL sequence input/output stream
17
18							=head1 SYNOPSIS
19
20							It is probably best not to use this object directly, but
21							rather go through the SeqIO handler system. Go:
22
23							$stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
24
25							while ( (my $seq = $stream->next_seq()) ) {
26							# do something with $seq
27							}
28
29							=head1 DESCRIPTION
30
31							This object can transform Bio::Seq objects to and from EMBL flat
32							file databases.
33
34							There is a lot of flexibility here about how to dump things which
35							should be documented more fully.
36
37							There should be a common object that this and Genbank share (probably
38							with Swissprot). Too much of the magic is identical.
39
40							=head2 Optional functions
41
42							=over 3
43
44							=item _show_dna()
45
46							(output only) shows the dna or not
47
48							=item _post_sort()
49
50							(output only) provides a sorting func which is applied to the FTHelpers
51							before printing
52
53							=item _id_generation_func()
54
55							This is function which is called as
56
57							print "ID ", $func($annseq), "\n";
58
59							To generate the ID line. If it is not there, it generates a sensible ID
60							line using a number of tools.
61
62							If you want to output annotations in EMBL format they need to be
63							stored in a Bio::Annotation::Collection object which is accessible
64							through the Bio::SeqI interface method L.
65
66							The following are the names of the keys which are polled from a
67							L object.
68
69							reference - Should contain Bio::Annotation::Reference objects
70							comment - Should contain Bio::Annotation::Comment objects
71							dblink - Should contain Bio::Annotation::DBLink objects
72
73							=back
74
75							=head1 FEEDBACK
76
77							=head2 Mailing Lists
78
79							User feedback is an integral part of the evolution of this and other
80							Bioperl modules. Send your comments and suggestions preferably to one
81							of the Bioperl mailing lists. Your participation is much appreciated.
82
83							bioperl-l@bioperl.org - General discussion
84							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85
86							=head2 Support
87
88							Please direct usage questions or support issues to the mailing list:
89
90							I
91
92							rather than to the module maintainer directly. Many experienced and
93							reponsive experts will be able look at the problem and quickly
94							address it. Please include a thorough description of the problem
95							with code and data examples if at all possible.
96
97							=head2 Reporting Bugs
98
99							Report bugs to the Bioperl bug tracking system to help us keep track
100							the bugs and their resolution. Bug reports can be submitted via
101							the web:
102
103							https://github.com/bioperl/bioperl-live/issues
104
105							=head1 AUTHOR - Ewan Birney
106
107							Email birney@ebi.ac.uk
108
109							=head1 APPENDIX
110
111							The rest of the documentation details each of the object
112							methods. Internal methods are usually preceded with a _
113
114							=cut
115
116
117							# Let the code begin...
118
119
120							package Bio::SeqIO::embl;
121	7			7		514	use vars qw(%FTQUAL_NO_QUOTE);
	7					9
	7					291
122	7			7		24	use strict;
	7					7
	7					111
123	7			7		1372	use Bio::SeqIO::FTHelper;
	7					10
	7					127
124	7			7		28	use Bio::SeqFeature::Generic;
	7					8
	7					98
125	7			7		1401	use Bio::Species;
	7					11
	7					139
126	7			7		26	use Bio::Seq::SeqFactory;
	7					7
	7					113
127	7			7		21	use Bio::Annotation::Collection;
	7					7
	7					111
128	7			7		1114	use Bio::Annotation::Comment;
	7					10
	7					122
129	7			7		1500	use Bio::Annotation::Reference;
	7					10
	7					143
130	7			7		26	use Bio::Annotation::DBLink;
	7					8
	7					110
131
132	7			7		20	use base qw(Bio::SeqIO);
	7					8
	7					33181
133
134							# Note that a qualifier that exceeds one line (i.e. a long label) will
135							# automatically be quoted regardless:
136							%FTQUAL_NO_QUOTE=(
137							'anticodon'=>1,
138							'citation'=>1,
139							'codon'=>1,
140							'codon_start'=>1,
141							'cons_splice'=>1,
142							'direction'=>1,
143							'evidence'=>1,
144							'label'=>1,
145							'mod_base'=> 1,
146							'number'=> 1,
147							'rpt_type'=> 1,
148							'rpt_unit'=> 1,
149							'transl_except'=> 1,
150							'transl_table'=> 1,
151							'usedin'=> 1,
152							);
153
154							sub _initialize {
155	31			31		49	my($self,@args) = @_;
156
157	31					110	$self->SUPER::_initialize(@args);
158							# hash for functions for decoding keys.
159	31					83	$self->{'_func_ftunit_hash'} = {};
160							# sets this to one by default. People can change it
161	31					83	$self->_show_dna(1);
162	31	50				84	if ( ! defined $self->sequence_factory ) {
163	31					87	$self->sequence_factory(Bio::Seq::SeqFactory->new
164							(-verbose => $self->verbose(),
165							-type => 'Bio::Seq::RichSeq'));
166							}
167							}
168
169							=head2 next_seq
170
171							Title : next_seq
172							Usage : $seq = $stream->next_seq()
173							Function: returns the next sequence in the stream
174							Returns : Bio::Seq object
175							Args :
176
177							=cut
178
179							sub next_seq {
180	25			25	1	66	my ($self,@args) = @_;
181	25					27	my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182							$date, $comment, @date_arr);
183
184	25					139	my ($annotation, %params, @features) =
185							Bio::Annotation::Collection->new();
186
187	25					88	$line = $self->_readline;
188							# This needs to be before the first eof() test
189
190	25	100				60	if ( !defined $line ) {
191	1					4	return; # no throws - end of file
192							}
193
194	24	100				109	if ( $line =~ /^\s+$/ ) {
195	1					4	while ( defined ($line = $self->_readline) ) {
196	1	50				4	$line =~/^\S/ && last;
197							}
198							# return without error if the whole next sequence was just a single
199							# blank line and then eof
200	1	50				4	return unless $line;
201							}
202
203							# no ID as 1st non-blank line, need short circuit and exit routine
204	24	50				98	$self->throw("EMBL stream with no ID. Not embl in my book")
205							unless $line =~ /^ID\s+\S+/;
206
207							# At this point we are sure that $line contains an ID header line
208	24					26	my $alphabet;
209	24	100				62	if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210
211							# New style header (EMBL Release >= 87, after June 2006)
212	10					13	my $topology;
213							my $sv;
214
215							# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216							# This regexp comes from the new2old.pl conversion script, from EBI
217	10	100				46	if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218	7					38	($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219							}
220	10	100				19	if (defined $sv) {
221	7					13	$params{'-seq_version'} = $sv;
222	7					12	$params{'-version'} = $sv;
223							}
224
225	10	50	66			38	if (defined $topology && $topology eq 'circular') {
226	0					0	$params{'-is_circular'} = 1;
227							}
228
229	10	100				20	if (defined $mol ) {
230	7	100				20	if ($mol =~ /DNA/) {
		50
		0
231	6					12	$alphabet = 'dna';
232							} elsif ($mol =~ /RNA/) {
233	1					2	$alphabet = 'rna';
234							} elsif ($mol =~ /AA/) {
235	0					0	$alphabet = 'protein';
236							}
237							}
238							} else {
239
240							# Old style header (EMBL Release < 87, before June 2006)
241	14	100				92	if ($line =~ /^ID\s+(\S+)[^;];\s+(\S+)[^;];\s+(\S+)[^;]*;/) {
242	12					42	($name, $mol, $div) = ($1, $2, $3);
243							}
244
245	14	100				30	if ($mol) {
246	12	50				24	if ( $mol =~ /circular/ ) {
247	0					0	$params{'-is_circular'} = 1;
248	0					0	$mol =~ s\|circular \|\|;
249							}
250	12	50				26	if (defined $mol ) {
251	12	100				36	if ($mol =~ /DNA/) {
		50
		0
252	10					12	$alphabet='dna';
253							} elsif ($mol =~ /RNA/) {
254	2					3	$alphabet='rna';
255							} elsif ($mol =~ /AA/) {
256	0					0	$alphabet='protein';
257							}
258							}
259							}
260							}
261
262	24	100	66			96	unless( defined $name && length($name) ) {
263	5					9	$name = "unknown_id";
264							}
265
266							# $self->warn("not parsing upper annotation in EMBL file yet!");
267	24					28	my $buffer = $line;
268	24					24	local $_;
269	24					26	BEFORE_FEATURE_TABLE :
270							my $ncbi_taxid;
271	24					52	until ( !defined $buffer ) {
272	499					399	$_ = $buffer;
273							# Exit at start of Feature table
274	499	100				957	if ( /^(F[HT]\|SQ)/ ) {
275	24	100	100			134	$self->_pushback($_) if( $1 eq 'SQ' \|\| $1 eq 'FT');
276	24					26	last;
277							}
278							# Description line(s)
279	475	100				617	if (/^DE\s+(\S.*\S)/) {
280	22	100				95	$desc .= $desc ? " $1" : $1;
281							}
282
283							#accession number
284	475	100	100			1303	if ( /^AC\s+(.)?/ \|\| /^PA\s+(.)?/) {
285	24					102	my @accs = split(/[; ]+/, $1); # allow space in addition
286							$params{'-accession_number'} = shift @accs
287	24	100				76	unless defined $params{'-accession_number'};
288	24					25	push @{$params{'-secondary_accessions'}}, @accs;
	24					61
289							}
290
291							#version number
292	475	100				611	if ( /^SV\s+\S+\.(\d+);?/ ) {
293	6					9	my $sv = $1;
294							#$sv =~ s/\;//;
295	6					13	$params{'-seq_version'} = $sv;
296	6					9	$params{'-version'} = $sv;
297							}
298
299							#date (NOTE: takes last date line)
300	475	100				633	if ( /^DT\s+(.+)$/ ) {
301	26					44	my $line = $1;
302	26					56	my ($date, $version) = split(' ', $line, 2);
303	26					32	$date =~ tr/,//d; # remove comma if new version
304	26	100				42	if ($version) {
305	24	100				106	if ($version =~ /$Rel\. (\d+), Created$/ms ) {
		100
306	11					73	my $release = Bio::Annotation::SimpleValue->new(
307							-tagname => 'creation_release',
308							-value => $1
309							);
310	11					31	$annotation->add_Annotation($release);
311							} elsif ($version =~ /$Rel\. (\d+), Last updated, Version (\d+)$/ms ) {
312	11					44	my $release = Bio::Annotation::SimpleValue->new(
313							-tagname => 'update_release',
314							-value => $1
315							);
316	11					27	$annotation->add_Annotation($release);
317
318	11					39	my $update = Bio::Annotation::SimpleValue->new(
319							-tagname => 'update_version',
320							-value => $2
321							);
322	11					25	$annotation->add_Annotation($update);
323							}
324							}
325	26					22	push @{$params{'-dates'}}, $date;
	26					49
326							}
327
328							#keywords
329	475	100				1633	if ( /^KW (.)\S$/ ) {
		100
		100
		100
		100
		100
330	29					138	my @kw = split(/\s\;\s/,$1);
331	29					35	push @{$params{'-keywords'}}, @kw;
	29					91
332							}
333
334							# Organism name and phylogenetic information
335							elsif (/^O[SC]/) {
336							# pass the accession number so we can give an informative throw message if necessary
337	21					76	my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
338	21					46	$params{'-species'}= $species;
339							}
340
341							# NCBI TaxID Xref
342							elsif (/^OX/) {
343	3	50				19	if (/NCBI_TaxID=(\d+)/) {
344	3					14	$ncbi_taxid=$1;
345							}
346
347	3					12	my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
348	3					6	foreach my $dblink ( @links ) {
349	1					6	$annotation->add_Annotation('dblink',$dblink);
350							}
351							}
352
353							# References
354							elsif (/^R/) {
355	99					187	my @refs = $self->_read_EMBL_References(\$buffer);
356	99					122	foreach my $ref ( @refs ) {
357	99					211	$annotation->add_Annotation('reference',$ref);
358							}
359							}
360
361							# DB Xrefs
362							elsif (/^DR/) {
363	5					32	my @links = $self->_read_EMBL_DBLink(\$buffer);
364	5					10	foreach my $dblink ( @links ) {
365	25					39	$annotation->add_Annotation('dblink',$dblink);
366							}
367							}
368
369							# Comments
370							elsif (/^CC\s+(.*)/) {
371	15					40	$comment .= $1;
372	15					18	$comment .= " ";
373	15					37	while (defined ($_ = $self->_readline) ) {
374	257	100				519	if (/^CC\s+(.*)/) {
375	242					288	$comment .= $1;
376	242					325	$comment .= " ";
377							} else {
378	15					21	last;
379							}
380							}
381	15					96	my $commobj = Bio::Annotation::Comment->new();
382	15					45	$commobj->text($comment);
383	15					33	$annotation->add_Annotation('comment',$commobj);
384	15					31	$comment = "";
385							}
386
387							# Get next line.
388	475					744	$buffer = $self->_readline;
389							}
390
391	24					50	while ( defined ($_ = $self->_readline) ) {
392	46	100				114	/^FT\s{3}\w/ && last;
393	24	100				56	/^SQ / && last;
394	22	50				55	/^CO / && last;
395							}
396	24					27	$buffer = $_;
397
398	24	100	66			116	if (defined($buffer) && $buffer =~ /^FT /) {
399	22					46	until ( !defined ($buffer) ) {
400	256					468	my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
401
402							# process ftunit
403	256					530	my $feat =
404							$ftunit->_generic_seqfeature($self->location_factory(), $name);
405
406							# add taxon_id from source if available
407							# Notice, this will override what is found in the OX line.
408							# this is by design as this seems to be the official way
409							# of specifying a TaxID
410	256	100	100			673	if ($params{'-species'} && ($feat->primary_tag eq 'source')
			100
			66
411							&& $feat->has_tag('db_xref')
412							&& (! $params{'-species'}->ncbi_taxid())) {
413	12					32	foreach my $tagval ($feat->get_tag_values('db_xref')) {
414	12	50				46	if (index($tagval,"taxon:") == 0) {
415	12					38	$params{'-species'}->ncbi_taxid(substr($tagval,6));
416	12					24	last;
417							}
418							}
419							}
420
421							# add feature to list of features
422	256					305	push(@features, $feat);
423
424	256	100				1126	if ( $buffer !~ /^FT/ ) {
425	22					69	last;
426							}
427							}
428							}
429							# Set taxid found in OX line
430	24	100	100			129	if ($params{'-species'} && defined $ncbi_taxid
			100
431							&& (! $params{'-species'}->ncbi_taxid())) {
432	2					7	$params{'-species'}->ncbi_taxid($ncbi_taxid);
433							}
434
435							# skip comments
436	24		66			147	while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437	20					49	$buffer = $self->_readline();
438							}
439
440	24	100				82	if ( $buffer =~ /^CO/ ) {
441							# bug#2982
442							# special : create contig as annotation
443	4					11	while ( defined ($buffer) ) {
444	4					16	$annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
445	4	50	66			23	if ( !$buffer \|\| $buffer !~ /^CO/ ) {
446	4					5	last;
447							}
448							}
449	4		100			11	$buffer \|\|= '';
450							}
451	24	100				66	if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
452	20	100				63	if ( $buffer !~ /^SQ/ ) {
453	1					3	while ( defined ($_ = $self->_readline) ) {
454	0	0				0	/^SQ/ && last;
455							}
456							}
457	20					24	$seqc = "";
458	20					48	while ( defined ($_ = $self->_readline) ) {
459	2924	100				3631	m{^//} && last;
460	2906					2431	$_ = uc($_);
461	2906					20861	s/[^A-Za-z]//g;
462	2906					4620	$seqc .= $_;
463							}
464							}
465	24					84	my $seq = $self->sequence_factory->create
466							(-verbose => $self->verbose(),
467							-division => $div,
468							-seq => $seqc,
469							-desc => $desc,
470							-display_id => $name,
471							-annotation => $annotation,
472							-molecule => $mol,
473							-alphabet => $alphabet,
474							-features => \@features,
475							%params);
476	24					197	return $seq;
477							}
478
479
480
481							=head2 _write_ID_line
482
483							Title : _write_ID_line
484							Usage : $self->_write_ID_line($seq);
485							Function: Writes the EMBL Release 87 format ID line to the stream, unless
486							: there is a user-supplied ID line generation function in which
487							: case that is used instead.
488							: ( See Bio::SeqIO::embl::_id_generation_function(). )
489							Returns : nothing
490							Args : Bio::Seq object
491
492							=cut
493
494							sub _write_ID_line {
495
496	11			11		17	my ($self, $seq) = @_;
497
498	11					14	my $id_line;
499							# If there is a user-supplied ID generation function, use it.
500	11	50				27	if ( $self->_id_generation_func ) {
501	0					0	$id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
	0					0
502							}
503							# Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
504							else {
505
506							# The sequence name is supposed to be the primary accession number,
507	11					36	my $name = $seq->accession_number();
508	11	100	66			57	if ( not(defined $name) \|\| $name eq 'unknown') {
509							# but if it is not present, use the sequence ID or the empty string
510	5		100			14	$name = $seq->id() \|\| '';
511							}
512
513	11	50				35	$self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
514
515							# Use the sequence version, or default to 1.
516	11		50			32	my $version = $seq->version() \|\| 1;
517
518	11					32	my $len = $seq->length();
519
520							# Taxonomic division.
521	11					16	my $div;
522	11	100	100			122	if ( $seq->can('division') && defined($seq->division) &&
			66
523							$self->_is_valid_division($seq->division) ) {
524	2					5	$div = $seq->division();
525							} else {
526	9		50			39	$div \|\|= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
527							}
528
529	11					16	my $mol;
530							# If the molecule type is a valid EMBL type, use it.
531	11	50	100			86	if ( $seq->can('molecule')
		100	66
			100
532							&& defined($seq->molecule)
533							&& $self->_is_valid_molecule_type($seq->molecule)
534							) {
535	0					0	$mol = $seq->molecule();
536							}
537							# Otherwise, choose unassigned DNA or RNA based on the alphabet.
538							elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539	6					10	my $alphabet =$seq->primary_seq->alphabet;
540	6	50				12	if ($alphabet eq 'dna') {
		0
		0
541	6					8	$mol ='unassigned DNA';
542							} elsif ($alphabet eq 'rna') {
543	0					0	$mol='unassigned RNA';
544							} elsif ($alphabet eq 'protein') {
545	0					0	$self->warn("Protein sequence found; EMBL is a nucleotide format.");
546	0					0	$mol='AA'; # AA is not a valid EMBL molecule type.
547							}
548							}
549
550	11					18	my $topology = 'linear';
551	11	50				31	if ($seq->is_circular) {
552	0					0	$topology = 'circular';
553							}
554
555	11		100			34	$mol \|\|= ''; # 'unassigned'; ?
556	11					46	$id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
557	11					48	$self->_print($id_line);
558							}
559							}
560
561							=head2 _is_valid_division
562
563							Title : _is_valid_division
564							Usage : $self->_is_valid_division($div)
565							Function: tests division code for validity
566							Returns : true if $div is a valid EMBL release 87 taxonomic division.
567							Args : taxonomic division code string
568
569							=cut
570
571							sub _is_valid_division {
572	2			2		4	my ($self, $division) = @_;
573
574	2					25	my %EMBL_divisions = (
575							"PHG" => 1, # Bacteriophage
576							"ENV" => 1, # Environmental Sample
577							"FUN" => 1, # Fungal
578							"HUM" => 1, # Human
579							"INV" => 1, # Invertebrate
580							"MAM" => 1, # Other Mammal
581							"VRT" => 1, # Other Vertebrate
582							"MUS" => 1, # Mus musculus
583							"PLN" => 1, # Plant
584							"PRO" => 1, # Prokaryote
585							"ROD" => 1, # Other Rodent
586							"SYN" => 1, # Synthetic
587							"UNC" => 1, # Unclassified
588							"VRL" => 1 # Viral
589							);
590
591	2					10	return exists($EMBL_divisions{$division});
592							}
593
594							=head2 _is_valid_molecule_type
595
596							Title : _is_valid_molecule_type
597							Usage : $self->_is_valid_molecule_type($mol)
598							Function: tests molecule type for validity
599							Returns : true if $mol is a valid EMBL release 87 molecule type.
600							Args : molecule type string
601
602							=cut
603
604							sub _is_valid_molecule_type {
605	2			2		3	my ($self, $moltype) = @_;
606
607	2					15	my %EMBL_molecule_types = (
608							"genomic DNA" => 1,
609							"genomic RNA" => 1,
610							"mRNA" => 1,
611							"tRNA" => 1,
612							"rRNA" => 1,
613							"snoRNA" => 1,
614							"snRNA" => 1,
615							"scRNA" => 1,
616							"pre-RNA" => 1,
617							"other RNA" => 1,
618							"other DNA" => 1,
619							"unassigned DNA" => 1,
620							"unassigned RNA" => 1
621							);
622
623	2					29	return exists($EMBL_molecule_types{$moltype});
624							}
625
626							=head2 write_seq
627
628							Title : write_seq
629							Usage : $stream->write_seq($seq)
630							Function: writes the $seq object (must be seq) to the stream
631							Returns : 1 for success and undef for error
632							Args : array of 1 to n Bio::SeqI objects
633
634
635							=cut
636
637							sub write_seq {
638	13			13	1	55	my ($self,@seqs) = @_;
639
640	13					27	foreach my $seq ( @seqs ) {
641	13	50				32	$self->throw("Attempting to write with no seq!") unless defined $seq;
642	13	100	100			87	unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643	4	50				12	$self->warn("$seq is not a SeqI compliant sequence object!")
644							if $self->verbose >= 0;
645	4	100	66			22	unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646	2					27	$self->throw("$seq is not a PrimarySeqI compliant sequence object!");
647							}
648							}
649	11		100			38	my $str = $seq->seq \|\| '';
650
651							# Write the ID line.
652	11					30	$self->_write_ID_line($seq);
653
654
655							# Write the accession line if present
656	11					13	my( $acc );
657							{
658	11	50	66			10	if ( my $func = $self->_ac_generation_func ) {
	11	100				26
		50
659	0					0	$acc = &{$func}($seq);
	0					0
660							} elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661							defined($seq->accession_number) ) {
662	4					7	$acc = $seq->accession_number;
663	4					10	$acc = join("; ", $acc, $seq->get_secondary_accessions);
664							} elsif ( $seq->can('accession_number') ) {
665	7					15	$acc = $seq->accession_number;
666							}
667
668	11	50				22	if (defined $acc) {
669	11	50				37	$self->_print("AC $acc;\n",
670							"XX\n") \|\| return;
671							}
672							}
673
674							# Date lines
675	11					12	my $switch=0;
676	11	100				58	if ( $seq->can('get_dates') ) {
677	4					8	my @dates = $seq->get_dates();
678	4					4	my $ct = 1;
679	4					3	my $date_flag = 0;
680	4					9	my ($cr) = $seq->annotation->get_Annotations("creation_release");
681	4					12	my ($ur) = $seq->annotation->get_Annotations("update_release");
682	4					8	my ($uv) = $seq->annotation->get_Annotations("update_version");
683
684	4	0	33			11	unless ($cr && $ur && $ur) {
			33
685	4					5	$date_flag = 1;
686							}
687
688	4					7	foreach my $dt (@dates) {
689	0	0				0	if (!$date_flag) {
690	0	0				0	$self->_write_line_EMBL_regex("DT ","DT ",
691							$dt." (Rel. ".($cr->value()).", Created)",
692							'\s+\|$',80) if $ct == 1;
693	0	0				0	$self->_write_line_EMBL_regex("DT ","DT ",
694							$dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695							'\s+\|$',80) if $ct == 2;
696							} else { # other formats?
697	0					0	$self->_write_line_EMBL_regex("DT ","DT ",
698							$dt,'\s+\|$',80);
699							}
700	0					0	$switch =1;
701	0					0	$ct++;
702							}
703	4	50				8	if ($switch == 1) {
704	0	0				0	$self->_print("XX\n") \|\| return;
705							}
706							}
707
708							# Description lines
709	11	50				31	$self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+\|$',80) \|\| return; #'
710	11	50				28	$self->_print( "XX\n") \|\| return;
711
712							# if there, write the kw line
713							{
714	11					17	my( $kw );
	11					563
715	11	50				28	if ( my $func = $self->_kw_generation_func ) {
		100
716	0					0	$kw = &{$func}($seq);
	0					0
717							} elsif ( $seq->can('keywords') ) {
718	4					9	$kw = $seq->keywords;
719							}
720	11	100				25	if (defined $kw) {
721	4	50				7	$self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+\|$', 80) \|\| return; #'
722	4	50				7	$self->_print( "XX\n") \|\| return;
723							}
724							}
725
726							# Organism lines
727
728	11	100	100			53	if ($seq->can('species') && (my $spec = $seq->species)) {
729	6					21	my @class = $spec->classification();
730	6					9	shift @class; # get rid of species name. Some embl files include
731							# the species name in the OC lines, but this seems
732							# more like an error than something we need to
733							# emulate
734	6					24	my $OS = $spec->scientific_name;
735	6	50				15	if ($spec->common_name) {
736	0					0	$OS .= ' ('.$spec->common_name.')';
737							}
738	6	50				20	$self->_print("OS $OS\n") \|\| return;
739	6					19	my $OC = join('; ', reverse(@class)) .'.';
740	6	50				13	$self->_write_line_EMBL_regex("OC ","OC ",$OC,'; \|$',80) \|\| return;
741	6	50				20	if ($spec->organelle) {
742	0	0				0	$self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; \|$',80) \|\| return;
743							}
744	6					15	my $ncbi_taxid = $spec->ncbi_taxid;
745	6	100				12	if ($ncbi_taxid) {
746	2	50				6	$self->_print("OX NCBI_TaxID=$ncbi_taxid\n") \|\| return;
747							}
748	6	50				12	$self->_print("XX\n") \|\| return;
749							}
750							# Reference lines
751	11					12	my $t = 1;
752	11	100	66			63	if ( $seq->can('annotation') && defined $seq->annotation ) {
753	9					18	foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754	6	50				15	$self->_print( "RN [$t]\n") \|\| return;
755
756							# Having no RP line is legal, but we need both
757							# start and end for a valid location.
758	6	50				14	if ($ref->comment) {
759	0	0				0	$self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+\|$', 80) \|\| return; #'
760							}
761	6					15	my $start = $ref->start;
762	6					15	my $end = $ref->end;
763	6	50	33			21	if ($start and $end) {
		0	0
764	6	50				16	$self->_print( "RP $start-$end\n") \|\| return;
765							} elsif ($start or $end) {
766	0					0	$self->throw("Both start and end are needed for a valid RP line.".
767							" Got: start='$start' end='$end'");
768							}
769
770	6	50				11	if (my $med = $ref->medline) {
771	0	0				0	$self->_print( "RX MEDLINE; $med.\n") \|\| return;
772							}
773	6	50				9	if (my $pm = $ref->pubmed) {
774	0	0				0	$self->_print( "RX PUBMED; $pm.\n") \|\| return;
775							}
776	6					19	my $authors = $ref->authors;
777	6					59	$authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
778
779	6	50				16	$self->_write_line_EMBL_regex("RA ", "RA ",
780							$authors . ";",
781							'\s+\|$', 80) \|\| return; #'
782
783							# If there is no title to the reference, it appears
784							# as a single semi-colon. All titles must end in
785							# a semi-colon.
786	6		100			13	my $ref_title = $ref->title \|\| '';
787	6					37	$ref_title =~ s/[\s;]*$/;/;
788	6	50				12	$self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+\|$', 80) \|\| return; #'
789	6	50				12	$self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+\|$', 80) \|\| return; #'
790	6	50				9	$self->_print("XX\n") \|\| return;
791	6					10	$t++;
792							}
793
794							# DB Xref lines
795	9	100				23	if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796	2					2	for my $dr (@db_xref) {
797	43					59	my $db_name = $dr->database;
798	43					49	my $prim = $dr->primary_id;
799
800	43		50			47	my $opt = $dr->optional_id \|\| '';
801	43	50				69	my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
802	43	50				47	$self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+\|$', 80) \|\| return; #'
803							}
804	2	50				4	$self->_print("XX\n") \|\| return;
805							}
806
807							# Comment lines
808	9					20	foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809	4	50				10	$self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+\|$', 80) \|\| return; #'
810	4	50				8	$self->_print("XX\n") \|\| return;
811							}
812							}
813							# "\\s\+\\|\$"
814
815							## FEATURE TABLE
816
817	11	50				26	$self->_print("FH Key Location/Qualifiers\n") \|\| return;
818	11	50				23	$self->_print("FH\n") \|\| return;
819
820	11	100				65	my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
821	11	100				24	if ($feats[0]) {
822	6	50				14	if ( defined $self->_post_sort ) {
823							# we need to read things into an array.
824							# Process. Sort them. Print 'em
825
826	0					0	my $post_sort_func = $self->_post_sort();
827	0					0	my @fth;
828
829	0					0	foreach my $sf ( @feats ) {
830	0					0	push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
831							}
832
833	0					0	@fth = sort { &$post_sort_func($a,$b) } @fth;
	0					0
834
835	0					0	foreach my $fth ( @fth ) {
836	0	0				0	$self->_print_EMBL_FTHelper($fth) \|\| return;
837							}
838							} else {
839							# not post sorted. And so we can print as we get them.
840							# lower memory load...
841
842	6					10	foreach my $sf ( @feats ) {
843	38					65	my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
844	38					34	foreach my $fth ( @fth ) {
845	38	50				52	if ( $fth->key eq 'CONTIG') {
846	0					0	$self->_show_dna(0);
847							}
848	38	50				66	$self->_print_EMBL_FTHelper($fth) \|\| return;
849							}
850							}
851							}
852							}
853
854	11	50				25	if ( $self->_show_dna() == 0 ) {
855	0	0				0	$self->_print( "//\n") \|\| return;
856	0					0	return;
857							}
858	11	50				24	$self->_print( "XX\n") \|\| return;
859
860							# finished printing features.
861
862							# print contig if present : bug#2982
863	11	100	66			62	if ( $seq->can('annotation') && defined $seq->annotation) {
864	9					19	foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
865	0	0				0	if ($ctg->value) {
866	0	0				0	$self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867							'[,]\|$', 80) \|\| return;
868							}
869							}
870							}
871							# print sequence lines only if sequence is present! bug#2982
872	11	100				26	if (length($str)) {
873	8					24	$str =~ tr/A-Z/a-z/;
874
875							# Count each nucleotide
876	8					19	my $alen = $str =~ tr/a/a/;
877	8					15	my $clen = $str =~ tr/c/c/;
878	8					15	my $glen = $str =~ tr/g/g/;
879	8					16	my $tlen = $str =~ tr/t/t/;
880
881	8					15	my $len = $seq->length();
882	8					13	my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
883	8	50				19	if ( $olen < 0 ) {
884	0					0	$self->warn("Weird. More atgc than bases. Problem!");
885							}
886
887	8	50				37	$self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") \|\| return;
888
889	8					12	my $nuc = 60; # Number of nucleotides per line
890	8					14	my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
891	8					9	my $out_pat = 'A11' x 6; # Pattern for packing a line
892	8					7	my $length = length($str);
893
894							# Calculate the number of nucleotides which fit on whole lines
895	8					25	my $whole = int($length / $nuc) * $nuc;
896
897							# Print the whole lines
898	8					6	my( $i );
899	8					19	for ($i = 0; $i < $whole; $i += $nuc) {
900	162					366	my $blocks = pack $out_pat,
901							unpack $whole_pat,
902							substr($str, $i, $nuc);
903	162	50				347	$self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) \|\| return;
904							}
905
906							# Print the last line
907	8	50				26	if (my $last = substr($str, $i)) {
908	8					8	my $last_len = length($last);
909	8					31	my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
910	8					62	my $blocks = pack $out_pat,
911							unpack($last_pat, $last);
912	8	50				41	$self->_print(sprintf(" $blocks%9d\n", $length)) \|\|
913							return; # Add the length to the end
914							}
915							}
916
917	11	50				27	$self->_print( "//\n") \|\| return;
918
919	11	50	33			24	$self->flush if $self->_flush_on_write && defined $self->_fh;
920							}
921	11					41	return 1;
922							}
923
924							=head2 _print_EMBL_FTHelper
925
926							Title : _print_EMBL_FTHelper
927							Usage :
928							Function: Internal function
929							Returns : 1 if writing suceeded, otherwise undef
930							Args :
931
932
933							=cut
934
935							sub _print_EMBL_FTHelper {
936	38			38		34	my ($self,$fth) = @_;
937
938	38	50	33			119	if ( ! ref $fth \|\| ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939	0					0	$fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
940							}
941
942
943							#$self->_print( "FH Key Location/Qualifiers\n");
944							#$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
945							# let
946	38	50				48	if ( $fth->key eq 'CONTIG' ) {
947	0	0				0	$self->_print("XX\n") \|\| return;
948	0	0				0	$self->_write_line_EMBL_regex("CO ",
949							"CO ",$fth->loc,
950							'\,\|$',80) \|\| return; #'
951	0					0	return 1;
952							}
953	38	50				56	$self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
954							"FT ",$fth->loc,
955							'\,\|$',80) \|\| return; #'
956	38					37	foreach my $tag ( keys %{$fth->field} ) {
	38					64
957	108	50				147	if ( ! defined $fth->field->{$tag} ) {
958	0					0	next;
959							}
960	108					91	foreach my $value ( @{$fth->field->{$tag}} ) {
	108					121
961	139					141	$value =~ s/\"/\"\"/g;
962	139	50	66			346	if ($value eq "_no_value") {
		50
963	0	0				0	$self->_write_line_EMBL_regex("FT ",
964							"FT ",
965							"/$tag",'.\|$',80) \|\| return; #'
966							}
967							# there are almost 3x more quoted qualifier values and they
968							# are more common too so we take quoted ones first
969							#
970							# Long qualifiers, that will be line wrapped, are always quoted
971							elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972	139	100				264	my $pat = $value =~ /\s+/ ? '\s+\|\-\|$' : '.\|\-\|$';
973	139	50				278	$self->_write_line_EMBL_regex("FT ",
974							"FT ",
975							"/$tag=\"$value\"",$pat,80) \|\| return;
976							} else {
977	0	0				0	$self->_write_line_EMBL_regex("FT ",
978							"FT ",
979							"/$tag=$value",'.\|$',80) \|\| return; #'
980							}
981							}
982							}
983
984	38					136	return 1;
985							}
986
987
988
989							=head2 _read_EMBL_Contig()
990
991							Title : _read_EMBL_Contig
992							Usage :
993							Function: convert CO lines into annotations
994							Returns :
995							Args :
996
997							=cut
998
999							sub _read_EMBL_Contig {
1000	4			4		5	my ($self, $buffer) = @_;
1001	4					7	my @ret;
1002	4	50				11	if ( $$buffer !~ /^CO/ ) {
1003	0					0	warn("Not parsing line '$$buffer' which maybe important");
1004							}
1005	4					11	$self->_pushback($$buffer);
1006	4					6	while ( defined ($_ = $self->_readline) ) {
1007	356	100				752	/^C/ \|\| last;
1008	353	50				775	/^CO\s+(.*)/ && do {
1009	353					767	push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010							-value => $1);
1011							};
1012							}
1013	4					7	$$buffer = $_;
1014	4					48	return @ret;
1015
1016							}
1017							#'
1018							=head2 _read_EMBL_References
1019
1020							Title : _read_EMBL_References
1021							Usage :
1022							Function: Reads references from EMBL format. Internal function really
1023							Example :
1024							Returns :
1025							Args :
1026
1027
1028							=cut
1029
1030							sub _read_EMBL_References {
1031	99			99		90	my ($self,$buffer) = @_;
1032	99					74	my (@refs);
1033
1034							# assume things are starting with RN
1035
1036	99	50				198	if ( $$buffer !~ /^RN/ ) {
1037	0					0	warn("Not parsing line '$$buffer' which maybe important");
1038							}
1039	99					76	my $b1;
1040							my $b2;
1041	0					0	my $title;
1042	0					0	my $loc;
1043	0					0	my $au;
1044	0					0	my $med;
1045	0					0	my $pm;
1046	0					0	my $com;
1047
1048	99					160	while ( defined ($_ = $self->_readline) ) {
1049	727	100				1230	/^R/ \|\| last;
1050	628	100				806	/^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
	40					83
	40					54
1051	628	100				797	/^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
	55					81
1052	628	100				730	/^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
	10					15
1053	628	100				832	/^RA (.*)/ && do {
1054	193					341	$au = $self->_concatenate_lines($au,$1); next;
	193					297
1055							};
1056	435	100				615	/^RT (.*)/ && do {
1057	164					213	$title = $self->_concatenate_lines($title,$1); next;
	164					220
1058							};
1059	271	100				425	/^RL (.*)/ && do {
1060	156					224	$loc = $self->_concatenate_lines($loc,$1); next;
	156					238
1061							};
1062	115	100				227	/^RC (.*)/ && do {
1063	3					10	$com = $self->_concatenate_lines($com,$1); next;
	3					4
1064							};
1065							}
1066
1067	99					302	my $ref = Bio::Annotation::Reference->new();
1068	99					317	$au =~ s/;\s*$//g;
1069	99					198	$title =~ s/;\s*$//g;
1070
1071	99					188	$ref->start($b1);
1072	99					166	$ref->end($b2);
1073	99					135	$ref->authors($au);
1074	99					144	$ref->title($title);
1075	99					124	$ref->location($loc);
1076	99					136	$ref->medline($med);
1077	99					145	$ref->comment($com);
1078	99					130	$ref->pubmed($pm);
1079
1080	99					91	push(@refs,$ref);
1081	99					101	$$buffer = $_;
1082
1083	99					194	return @refs;
1084							}
1085
1086							=head2 _read_EMBL_Species
1087
1088							Title : _read_EMBL_Species
1089							Usage :
1090							Function: Reads the EMBL Organism species and classification
1091							lines.
1092							Example :
1093							Returns : A Bio::Species object
1094							Args : a reference to the current line buffer, accession number
1095
1096							=cut
1097
1098							sub _read_EMBL_Species {
1099	21			21		30	my( $self, $buffer, $acc ) = @_;
1100	21					20	my $org;
1101
1102	21					26	$_ = $$buffer;
1103	21					26	my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104	21		66			77	while (defined( $_ \|\|= $self->_readline )) {
1105	92	100				349	if (/^OS\s+(.+)/) {
		100
		50
1106	21	50				70	$sci_name .= ($sci_name) ? ' '.$1 : $1;
1107							} elsif (s/^OC\s+(.+)$//) {
1108	50					102	$class_lines .= $1;
1109							} elsif (/^OG\s+(.*)/) {
1110	0					0	$org = $1;
1111							} else {
1112	21					38	last;
1113							}
1114
1115	71					191	$_ = undef; # Empty $_ to trigger read of next line
1116							}
1117
1118							# $$buffer = $_;
1119	21					69	$self->_pushback($_);
1120
1121	21					32	$sci_name =~ s{\.$}{};
1122	21	50				39	$sci_name \|\| return;
1123
1124							# Convert data in classification lines into classification array.
1125							# only split on ';' or '.' so that classification that is 2 or more words
1126							# will still get matched, use map() to remove trailing/leading/intervening
1127							# spaces
1128	21					161	my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?
	224					258
	224					194
	224					164
	224					213
1129
1130							# do we have a genus?
1131	21					42	my $possible_genus = $class[-1];
1132	21	50				61	$possible_genus .= "\|$class[-2]" if $class[-2];
1133	21	50				511	if ($sci_name =~ /^($possible_genus)/) {
1134	21					42	$genus = $1;
1135	21					468	($species) = $sci_name =~ /^$genus\s+(.+)/;
1136							} else {
1137	0					0	$species = $sci_name;
1138							}
1139
1140							# Don't make a species object if it is "Unknown" or "None"
1141	21	50				62	if ($genus) {
1142	21	50				75	return if $genus =~ /^(Unknown\|None)$/i;
1143							}
1144
1145							# is this organism of rank species or is it lower?
1146							# (doesn't catch everything, but at least the guess isn't dangerous)
1147	21	50				85	if ($species =~ /subsp\.\|var\./) {
1148	0					0	($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.\|var\.).+)/;
1149							}
1150
1151							# sometimes things have common name in brackets, like
1152							# Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153							# name bit. Probably dangerous if real scientific species name ends in
1154							# bracketed bit.
1155	21	50				54	unless ($class[-1] eq 'Viruses') {
1156	21					70	($species, $common) = $species =~ /^(.+)\s+$(.+)$$/;
1157	21	100				69	$sci_name =~ s/\s+$.+$$// if $common;
1158							}
1159
1160							# Bio::Species array needs array in Species -> Kingdom direction
1161	21	50				54	unless ($class[-1] eq $sci_name) {
1162	21					30	push(@class, $sci_name);
1163							}
1164	21					27	@class = reverse @class;
1165
1166							# do minimal sanity checks before we hand off to Bio::Species which won't
1167							# be able to give informative throw messages if it has to throw because
1168							# of problems here
1169	21	50				40	$self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170	21					24	my %names;
1171	21					68	foreach my $i (0..$#class) {
1172	245					166	my $name = $class[$i];
1173	245					272	$names{$name}++;
1174							# this code breaks examples like: Xenopus (Silurana) tropicalis
1175							# commenting out, see bug 3158
1176
1177							#if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178							# $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179							#}
1180							}
1181	21					129	my $make = Bio::Species->new();
1182	21					53	$make->scientific_name($sci_name);
1183	21					60	$make->classification(@class);
1184	21	50				56	unless ($class[-1] eq 'Viruses') {
1185	21	50				96	$make->genus($genus) if $genus;
1186	21	100				63	$make->species($species) if $species;
1187	21	50				42	$make->sub_species($sub_species) if $sub_species;
1188	21	100				61	$make->common_name($common) if $common;
1189							}
1190	21	50				38	$make->organelle($org) if $org;
1191	21					103	return $make;
1192							}
1193
1194							=head2 _read_EMBL_DBLink
1195
1196							Title : _read_EMBL_DBLink
1197							Usage :
1198							Function: Reads the EMBL database cross reference ("DR") lines
1199							Example :
1200							Returns : A list of Bio::Annotation::DBLink objects
1201							Args :
1202
1203							=cut
1204
1205							sub _read_EMBL_DBLink {
1206	5			5		6	my( $self,$buffer ) = @_;
1207	5					5	my( @db_link );
1208
1209	5					5	$_ = $$buffer;
1210	5		66			21	while (defined( $_ \|\|= $self->_readline )) {
1211	30	100				118	if ( /^DR ([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?\.$/) {
1212	25					54	my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213	25					82	my $link = Bio::Annotation::DBLink->new(-database => $databse,
1214							-primary_id => $prim_id,
1215							-optional_id => $sec_id);
1216
1217	25					41	push(@db_link, $link);
1218							} else {
1219	5					7	last;
1220							}
1221	25					67	$_ = undef; # Empty $_ to trigger read of next line
1222							}
1223
1224	5					8	$$buffer = $_;
1225	5					12	return @db_link;
1226							}
1227
1228							=head2 _read_EMBL_TaxID_DBLink
1229
1230							Title : _read_EMBL_TaxID_DBLink
1231							Usage :
1232							Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233							Example :
1234							Returns : A list of Bio::Annotation::DBLink objects
1235							Args :
1236
1237							=cut
1238
1239							sub _read_EMBL_TaxID_DBLink {
1240	3			3		5	my( $self,$buffer ) = @_;
1241	3					3	my( @db_link );
1242
1243	3					5	$_ = $$buffer;
1244	3		66			13	while (defined( $_ \|\|= $self->_readline )) {
1245	4	100				19	if ( /^OX (\S+)=(\d+);$/ ) {
1246	1					4	my ($databse, $prim_id) = ($1,$2);
1247	1					11	my $link = Bio::Annotation::DBLink->new(-database => $databse,
1248							-primary_id => $prim_id,);
1249	1					4	push(@db_link, $link);
1250							} else {
1251	3					4	last;
1252							}
1253	1					6	$_ = undef; # Empty $_ to trigger read of next line
1254							}
1255
1256	3					5	$$buffer = $_;
1257	3					9	return @db_link;
1258							}
1259
1260							=head2 _filehandle
1261
1262							Title : _filehandle
1263							Usage : $obj->_filehandle($newval)
1264							Function:
1265							Example :
1266							Returns : value of _filehandle
1267							Args : newvalue (optional)
1268
1269
1270							=cut
1271
1272							sub _filehandle{
1273	0			0		0	my ($obj,$value) = @_;
1274	0	0				0	if ( defined $value) {
1275	0					0	$obj->{'_filehandle'} = $value;
1276							}
1277	0					0	return $obj->{'_filehandle'};
1278
1279							}
1280
1281							=head2 _read_FTHelper_EMBL
1282
1283							Title : _read_FTHelper_EMBL
1284							Usage : _read_FTHelper_EMBL($buffer)
1285							Function: reads the next FT key line
1286							Example :
1287							Returns : Bio::SeqIO::FTHelper object
1288							Args : filehandle and reference to a scalar
1289
1290
1291							=cut
1292
1293							sub _read_FTHelper_EMBL {
1294	256			256		216	my ($self,$buffer) = @_;
1295
1296	256					187	my ($key, # The key of the feature
1297							$loc, # The location line from the feature
1298							@qual, # An arrray of lines making up the qualifiers
1299							);
1300
1301	256	50				814	if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
		0
1302	256					386	$key = $1;
1303	256					274	$loc = $2;
1304							# Read all the lines up to the next feature
1305	256					481	while ( defined($_ = $self->_readline) ) {
1306	2038	100				9036	if (/^FT(\s+)(.+?)\s*$/) {
1307							# Lines inside features are preceeded by 19 spaces
1308							# A new feature is preceeded by 3 spaces
1309	2016	100				2608	if (length($1) > 4) {
1310							# Add to qualifiers if we're in the qualifiers
1311	1782	100				2024	if (@qual) {
		100
1312	1523					2949	push(@qual, $2);
1313							}
1314							# Start the qualifier list if it's the first qualifier
1315							elsif (substr($2, 0, 1) eq '/') {
1316	254					642	@qual = ($2);
1317							}
1318							# We're still in the location line, so append to location
1319							else {
1320	5					18	$loc .= $2;
1321							}
1322							} else {
1323							# We've reached the start of the next feature
1324	234					243	last;
1325							}
1326							} else {
1327							# We're at the end of the feature table
1328	22					29	last;
1329							}
1330							}
1331							} elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332	0					0	$key = 'CONTIG';
1333	0					0	$loc = $1;
1334							# Read all the lines up to the next feature
1335	0					0	while ( defined($_ = $self->_readline) ) {
1336	0	0				0	if (/^CO\s+(\S+)\s*$/) {
1337	0					0	$loc .= $1;
1338							} else {
1339							# We've reached the start of the next feature
1340	0					0	last;
1341							}
1342							}
1343							} else {
1344							# No feature key
1345	0					0	return;
1346							}
1347
1348							# Put the first line of the next feature into the buffer
1349	256					230	$$buffer = $_;
1350
1351							# Make the new FTHelper object
1352	256					625	my $out = Bio::SeqIO::FTHelper->new();
1353	256					440	$out->verbose($self->verbose());
1354	256					393	$out->key($key);
1355	256					381	$out->loc($loc);
1356
1357							# Now parse and add any qualifiers. (@qual is kept
1358							# intact to provide informative error messages.)
1359	256					181	my $last_unquoted_qualifier;
1360							QUAL:
1361	256					449	for (my $i = 0; $i < @qual; $i++) {
1362	1091					901	my $data = $qual[$i];
1363	1091					674	my ( $qualifier, $value );
1364
1365	1091	50				3513	unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s(.))?})) {
1366	0	0				0	if ( defined $last_unquoted_qualifier ) {
1367							# handle case of unquoted multiline - read up everything until the next qualifier
1368	0		0			0	do {
1369							# Protein sequence translations need to be joined without spaces,
1370							# other qualifiers need those.
1371	0	0				0	$value .= ' ' if $qualifier ne "translation";
1372	0					0	$value .= $data;
1373							} while defined($data = $qual[++$i]) && $data !~ m[^/];
1374	0					0	$i--;
1375	0					0	$out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376	0					0	$last_unquoted_qualifier = undef;
1377	0					0	next QUAL;
1378							} else {
1379	0					0	$self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380							. join('', map "$_\n", @qual));
1381							}
1382							}
1383	1091	50				1351	$qualifier = '' if not defined $qualifier;
1384
1385	1091					670	$last_unquoted_qualifier = undef;
1386	1091	50				1064	if (defined $value) {
1387							# Do we have a quoted value?
1388	1091	100				1331	if (substr($value, 0, 1) eq '"') {
1389							# Keep adding to value until we find the trailing quote
1390							# and the quotes are balanced
1391							QUOTES:
1392	971		66			3337	while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393	686					430	$i++;
1394	686					454	my $next = $qual[$i];
1395	686	50				701	if (!defined($next)) {
1396	0					0	$self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397							"\nAdding quote to close...".
1398							"Check sequence quality!");
1399	0					0	$value .= '"';
1400	0					0	last QUOTES;
1401							}
1402
1403							# Protein sequence translations need to be joined without spaces,
1404							# other qualifiers need those.
1405	686	100				659	if ($qualifier eq "translation") {
1406	569					1241	$value .= $next;
1407							} else {
1408	117					454	$value .= " $next";
1409							}
1410							}
1411							# Trim leading and trailing quotes
1412	971					6297	$value =~ s/^"\|"$//g;
1413							# Undouble internal quotes
1414	971					902	$value =~ s/""/"/g; #"
1415							} else {
1416	120					122	$last_unquoted_qualifier = $qualifier;
1417							}
1418							} else {
1419	0					0	$value = '_no_value';
1420							}
1421
1422							# Store the qualifier
1423	1091		100			1465	$out->field->{$qualifier} \|\|= [];
1424	1091					813	push(@{$out->field->{$qualifier}},$value);
	1091					1193
1425							}
1426
1427	256					456	return $out;
1428							}
1429
1430							=head2 _write_line_EMBL
1431
1432							Title : _write_line_EMBL
1433							Usage :
1434							Function: internal function
1435							Example :
1436							Returns : 1 if writing suceeded, else undef
1437							Args :
1438
1439
1440							=cut
1441
1442							sub _write_line_EMBL {
1443	0			0		0	my ($self,$pre1,$pre2,$line,$length) = @_;
1444
1445	0	0				0	$length \|\| $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446	0					0	my $subl = $length - length $pre2;
1447	0					0	my $linel = length $line;
1448	0					0	my $i;
1449
1450	0					0	my $sub = substr($line,0,$length - length $pre1);
1451
1452	0	0				0	$self->_print( "$pre1$sub\n") \|\| return;
1453
1454	0					0	for ($i= ($length - length $pre1);$i < $linel;) {
1455	0					0	$sub = substr($line,$i,($subl));
1456	0	0				0	$self->_print( "$pre2$sub\n") \|\| return;
1457	0					0	$i += $subl;
1458							}
1459
1460	0					0	return 1;
1461							}
1462
1463							=head2 _write_line_EMBL_regex
1464
1465							Title : _write_line_EMBL_regex
1466							Usage :
1467							Function: internal function for writing lines of specified
1468							length, with different first and the next line
1469							left hand headers and split at specific points in the
1470							text
1471							Example :
1472							Returns : nothing
1473							Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1474
1475
1476							=cut
1477
1478							sub _write_line_EMBL_regex {
1479	263			263		269	my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480
1481							#print STDOUT "Going to print with $line!\n";
1482
1483	263	50				310	$length \|\| $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484
1485	263					244	my $subl = $length - (length $pre1) -1 ;
1486	263					157	my( @lines );
1487
1488	263					308	CHUNK: while($line) {
1489	401					419	foreach my $pat ($regex, '[,;\.\/-]\s\|'.$regex, '[,;\.\/-]\|'.$regex) {
1490	401	50				2879	if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491	401					644	my $l = $1.$2;
1492	401	100				505	$l =~ s/#/ /g # remove word wrap protection char '#'
1493							if $pre1 eq "RA ";
1494	401					377	my $newl = $3;
1495	401					423	$line = substr($line,length($l));
1496							# be strict about not padding spaces according to
1497							# genbank format
1498	401					783	$l =~ s/\s+$//;
1499	401	50				477	next CHUNK if ($l eq '');
1500	401					327	push(@lines, $l);
1501	401					703	next CHUNK;
1502							}
1503							}
1504							# if we get here none of the patterns matched $subl or less chars
1505	0					0	$self->warn("trouble dissecting \"$line\"\n into chunks ".
1506							"of $subl chars or less - this tag won't print right");
1507							# insert a space char to prevent infinite loops
1508	0					0	$line = substr($line,0,$subl) . " " . substr($line,$subl);
1509							}
1510	263					226	my $s = shift @lines;
1511	263	100	50			683	($self->_print("$pre1$s\n") \|\| return) if $s;
1512	263					309	foreach my $s ( @lines ) {
1513	145	50				212	$self->_print("$pre2$s\n") \|\| return;
1514							}
1515
1516	263					531	return 1;
1517							}
1518
1519							=head2 _post_sort
1520
1521							Title : _post_sort
1522							Usage : $obj->_post_sort($newval)
1523							Function:
1524							Returns : value of _post_sort
1525							Args : newvalue (optional)
1526
1527
1528							=cut
1529
1530							sub _post_sort{
1531	6			6		7	my $obj = shift;
1532	6	50				13	if ( @_ ) {
1533	0					0	my $value = shift;
1534	0					0	$obj->{'_post_sort'} = $value;
1535							}
1536	6					14	return $obj->{'_post_sort'};
1537
1538							}
1539
1540							=head2 _show_dna
1541
1542							Title : _show_dna
1543							Usage : $obj->_show_dna($newval)
1544							Function:
1545							Returns : value of _show_dna
1546							Args : newvalue (optional)
1547
1548
1549							=cut
1550
1551							sub _show_dna{
1552	42			42		50	my $obj = shift;
1553	42	100				89	if ( @_ ) {
1554	31					34	my $value = shift;
1555	31					49	$obj->{'_show_dna'} = $value;
1556							}
1557	42					59	return $obj->{'_show_dna'};
1558
1559							}
1560
1561							=head2 _id_generation_func
1562
1563							Title : _id_generation_func
1564							Usage : $obj->_id_generation_func($newval)
1565							Function:
1566							Returns : value of _id_generation_func
1567							Args : newvalue (optional)
1568
1569
1570							=cut
1571
1572							sub _id_generation_func{
1573	11			11		11	my $obj = shift;
1574	11	50				25	if ( @_ ) {
1575	0					0	my $value = shift;
1576	0					0	$obj->{'_id_generation_func'} = $value;
1577							}
1578	11					24	return $obj->{'_id_generation_func'};
1579
1580							}
1581
1582							=head2 _ac_generation_func
1583
1584							Title : _ac_generation_func
1585							Usage : $obj->_ac_generation_func($newval)
1586							Function:
1587							Returns : value of _ac_generation_func
1588							Args : newvalue (optional)
1589
1590
1591							=cut
1592
1593							sub _ac_generation_func{
1594	11			11		23	my $obj = shift;
1595	11	50				25	if ( @_ ) {
1596	0					0	my $value = shift;
1597	0					0	$obj->{'_ac_generation_func'} = $value;
1598							}
1599	11					101	return $obj->{'_ac_generation_func'};
1600
1601							}
1602
1603							=head2 _sv_generation_func
1604
1605							Title : _sv_generation_func
1606							Usage : $obj->_sv_generation_func($newval)
1607							Function:
1608							Returns : value of _sv_generation_func
1609							Args : newvalue (optional)
1610
1611
1612							=cut
1613
1614							sub _sv_generation_func{
1615	0			0		0	my $obj = shift;
1616	0	0				0	if ( @_ ) {
1617	0					0	my $value = shift;
1618	0					0	$obj->{'_sv_generation_func'} = $value;
1619							}
1620	0					0	return $obj->{'_sv_generation_func'};
1621
1622							}
1623
1624							=head2 _kw_generation_func
1625
1626							Title : _kw_generation_func
1627							Usage : $obj->_kw_generation_func($newval)
1628							Function:
1629							Returns : value of _kw_generation_func
1630							Args : newvalue (optional)
1631
1632
1633							=cut
1634
1635							sub _kw_generation_func{
1636	11			11		15	my $obj = shift;
1637	11	50				21	if ( @_ ) {
1638	0					0	my $value = shift;
1639	0					0	$obj->{'_kw_generation_func'} = $value;
1640							}
1641	11					61	return $obj->{'_kw_generation_func'};
1642
1643							}
1644
1645							1;