File Coverage

Bio/SeqIO/embl.pm
Criterion Covered Total %
statement 538 651 82.6
branch 267 434 61.5
condition 90 128 70.3
subroutine 30 33 90.9
pod 2 2 100.0
total 927 1248 74.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::EMBL
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::embl - EMBL sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but
21             rather go through the SeqIO handler system. Go:
22              
23             $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
24              
25             while ( (my $seq = $stream->next_seq()) ) {
26             # do something with $seq
27             }
28              
29             =head1 DESCRIPTION
30              
31             This object can transform Bio::Seq objects to and from EMBL flat
32             file databases.
33              
34             There is a lot of flexibility here about how to dump things which
35             should be documented more fully.
36              
37             There should be a common object that this and Genbank share (probably
38             with Swissprot). Too much of the magic is identical.
39              
40             =head2 Optional functions
41              
42             =over 3
43              
44             =item _show_dna()
45              
46             (output only) shows the dna or not
47              
48             =item _post_sort()
49              
50             (output only) provides a sorting func which is applied to the FTHelpers
51             before printing
52              
53             =item _id_generation_func()
54              
55             This is function which is called as
56              
57             print "ID ", $func($annseq), "\n";
58              
59             To generate the ID line. If it is not there, it generates a sensible ID
60             line using a number of tools.
61              
62             If you want to output annotations in EMBL format they need to be
63             stored in a Bio::Annotation::Collection object which is accessible
64             through the Bio::SeqI interface method L.
65              
66             The following are the names of the keys which are polled from a
67             L object.
68              
69             reference - Should contain Bio::Annotation::Reference objects
70             comment - Should contain Bio::Annotation::Comment objects
71             dblink - Should contain Bio::Annotation::DBLink objects
72              
73             =back
74              
75             =head1 FEEDBACK
76              
77             =head2 Mailing Lists
78              
79             User feedback is an integral part of the evolution of this and other
80             Bioperl modules. Send your comments and suggestions preferably to one
81             of the Bioperl mailing lists. Your participation is much appreciated.
82              
83             bioperl-l@bioperl.org - General discussion
84             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85              
86             =head2 Support
87              
88             Please direct usage questions or support issues to the mailing list:
89              
90             I
91              
92             rather than to the module maintainer directly. Many experienced and
93             reponsive experts will be able look at the problem and quickly
94             address it. Please include a thorough description of the problem
95             with code and data examples if at all possible.
96              
97             =head2 Reporting Bugs
98              
99             Report bugs to the Bioperl bug tracking system to help us keep track
100             the bugs and their resolution. Bug reports can be submitted via
101             the web:
102              
103             https://github.com/bioperl/bioperl-live/issues
104              
105             =head1 AUTHOR - Ewan Birney
106              
107             Email birney@ebi.ac.uk
108              
109             =head1 APPENDIX
110              
111             The rest of the documentation details each of the object
112             methods. Internal methods are usually preceded with a _
113              
114             =cut
115              
116              
117             # Let the code begin...
118              
119              
120             package Bio::SeqIO::embl;
121 7     7   514 use vars qw(%FTQUAL_NO_QUOTE);
  7         9  
  7         291  
122 7     7   24 use strict;
  7         7  
  7         111  
123 7     7   1372 use Bio::SeqIO::FTHelper;
  7         10  
  7         127  
124 7     7   28 use Bio::SeqFeature::Generic;
  7         8  
  7         98  
125 7     7   1401 use Bio::Species;
  7         11  
  7         139  
126 7     7   26 use Bio::Seq::SeqFactory;
  7         7  
  7         113  
127 7     7   21 use Bio::Annotation::Collection;
  7         7  
  7         111  
128 7     7   1114 use Bio::Annotation::Comment;
  7         10  
  7         122  
129 7     7   1500 use Bio::Annotation::Reference;
  7         10  
  7         143  
130 7     7   26 use Bio::Annotation::DBLink;
  7         8  
  7         110  
131              
132 7     7   20 use base qw(Bio::SeqIO);
  7         8  
  7         33181  
133              
134             # Note that a qualifier that exceeds one line (i.e. a long label) will
135             # automatically be quoted regardless:
136             %FTQUAL_NO_QUOTE=(
137             'anticodon'=>1,
138             'citation'=>1,
139             'codon'=>1,
140             'codon_start'=>1,
141             'cons_splice'=>1,
142             'direction'=>1,
143             'evidence'=>1,
144             'label'=>1,
145             'mod_base'=> 1,
146             'number'=> 1,
147             'rpt_type'=> 1,
148             'rpt_unit'=> 1,
149             'transl_except'=> 1,
150             'transl_table'=> 1,
151             'usedin'=> 1,
152             );
153              
154             sub _initialize {
155 31     31   49 my($self,@args) = @_;
156              
157 31         110 $self->SUPER::_initialize(@args);
158             # hash for functions for decoding keys.
159 31         83 $self->{'_func_ftunit_hash'} = {};
160             # sets this to one by default. People can change it
161 31         83 $self->_show_dna(1);
162 31 50       84 if ( ! defined $self->sequence_factory ) {
163 31         87 $self->sequence_factory(Bio::Seq::SeqFactory->new
164             (-verbose => $self->verbose(),
165             -type => 'Bio::Seq::RichSeq'));
166             }
167             }
168              
169             =head2 next_seq
170              
171             Title : next_seq
172             Usage : $seq = $stream->next_seq()
173             Function: returns the next sequence in the stream
174             Returns : Bio::Seq object
175             Args :
176              
177             =cut
178              
179             sub next_seq {
180 25     25 1 66 my ($self,@args) = @_;
181 25         27 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182             $date, $comment, @date_arr);
183              
184 25         139 my ($annotation, %params, @features) =
185             Bio::Annotation::Collection->new();
186              
187 25         88 $line = $self->_readline;
188             # This needs to be before the first eof() test
189              
190 25 100       60 if ( !defined $line ) {
191 1         4 return; # no throws - end of file
192             }
193              
194 24 100       109 if ( $line =~ /^\s+$/ ) {
195 1         4 while ( defined ($line = $self->_readline) ) {
196 1 50       4 $line =~/^\S/ && last;
197             }
198             # return without error if the whole next sequence was just a single
199             # blank line and then eof
200 1 50       4 return unless $line;
201             }
202              
203             # no ID as 1st non-blank line, need short circuit and exit routine
204 24 50       98 $self->throw("EMBL stream with no ID. Not embl in my book")
205             unless $line =~ /^ID\s+\S+/;
206              
207             # At this point we are sure that $line contains an ID header line
208 24         26 my $alphabet;
209 24 100       62 if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210              
211             # New style header (EMBL Release >= 87, after June 2006)
212 10         13 my $topology;
213             my $sv;
214              
215             # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216             # This regexp comes from the new2old.pl conversion script, from EBI
217 10 100       46 if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218 7         38 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219             }
220 10 100       19 if (defined $sv) {
221 7         13 $params{'-seq_version'} = $sv;
222 7         12 $params{'-version'} = $sv;
223             }
224              
225 10 50 66     38 if (defined $topology && $topology eq 'circular') {
226 0         0 $params{'-is_circular'} = 1;
227             }
228            
229 10 100       20 if (defined $mol ) {
230 7 100       20 if ($mol =~ /DNA/) {
    50          
    0          
231 6         12 $alphabet = 'dna';
232             } elsif ($mol =~ /RNA/) {
233 1         2 $alphabet = 'rna';
234             } elsif ($mol =~ /AA/) {
235 0         0 $alphabet = 'protein';
236             }
237             }
238             } else {
239            
240             # Old style header (EMBL Release < 87, before June 2006)
241 14 100       92 if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
242 12         42 ($name, $mol, $div) = ($1, $2, $3);
243             }
244            
245 14 100       30 if ($mol) {
246 12 50       24 if ( $mol =~ /circular/ ) {
247 0         0 $params{'-is_circular'} = 1;
248 0         0 $mol =~ s|circular ||;
249             }
250 12 50       26 if (defined $mol ) {
251 12 100       36 if ($mol =~ /DNA/) {
    50          
    0          
252 10         12 $alphabet='dna';
253             } elsif ($mol =~ /RNA/) {
254 2         3 $alphabet='rna';
255             } elsif ($mol =~ /AA/) {
256 0         0 $alphabet='protein';
257             }
258             }
259             }
260             }
261              
262 24 100 66     96 unless( defined $name && length($name) ) {
263 5         9 $name = "unknown_id";
264             }
265              
266             # $self->warn("not parsing upper annotation in EMBL file yet!");
267 24         28 my $buffer = $line;
268 24         24 local $_;
269 24         26 BEFORE_FEATURE_TABLE :
270             my $ncbi_taxid;
271 24         52 until ( !defined $buffer ) {
272 499         399 $_ = $buffer;
273             # Exit at start of Feature table
274 499 100       957 if ( /^(F[HT]|SQ)/ ) {
275 24 100 100     134 $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
276 24         26 last;
277             }
278             # Description line(s)
279 475 100       617 if (/^DE\s+(\S.*\S)/) {
280 22 100       95 $desc .= $desc ? " $1" : $1;
281             }
282              
283             #accession number
284 475 100 100     1303 if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
285 24         102 my @accs = split(/[; ]+/, $1); # allow space in addition
286             $params{'-accession_number'} = shift @accs
287 24 100       76 unless defined $params{'-accession_number'};
288 24         25 push @{$params{'-secondary_accessions'}}, @accs;
  24         61  
289             }
290              
291             #version number
292 475 100       611 if ( /^SV\s+\S+\.(\d+);?/ ) {
293 6         9 my $sv = $1;
294             #$sv =~ s/\;//;
295 6         13 $params{'-seq_version'} = $sv;
296 6         9 $params{'-version'} = $sv;
297             }
298              
299             #date (NOTE: takes last date line)
300 475 100       633 if ( /^DT\s+(.+)$/ ) {
301 26         44 my $line = $1;
302 26         56 my ($date, $version) = split(' ', $line, 2);
303 26         32 $date =~ tr/,//d; # remove comma if new version
304 26 100       42 if ($version) {
305 24 100       106 if ($version =~ /\(Rel\. (\d+), Created\)/ms ) {
    100          
306 11         73 my $release = Bio::Annotation::SimpleValue->new(
307             -tagname => 'creation_release',
308             -value => $1
309             );
310 11         31 $annotation->add_Annotation($release);
311             } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/ms ) {
312 11         44 my $release = Bio::Annotation::SimpleValue->new(
313             -tagname => 'update_release',
314             -value => $1
315             );
316 11         27 $annotation->add_Annotation($release);
317              
318 11         39 my $update = Bio::Annotation::SimpleValue->new(
319             -tagname => 'update_version',
320             -value => $2
321             );
322 11         25 $annotation->add_Annotation($update);
323             }
324             }
325 26         22 push @{$params{'-dates'}}, $date;
  26         49  
326             }
327              
328             #keywords
329 475 100       1633 if ( /^KW (.*)\S*$/ ) {
    100          
    100          
    100          
    100          
    100          
330 29         138 my @kw = split(/\s*\;\s*/,$1);
331 29         35 push @{$params{'-keywords'}}, @kw;
  29         91  
332             }
333              
334             # Organism name and phylogenetic information
335             elsif (/^O[SC]/) {
336             # pass the accession number so we can give an informative throw message if necessary
337 21         76 my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
338 21         46 $params{'-species'}= $species;
339             }
340              
341             # NCBI TaxID Xref
342             elsif (/^OX/) {
343 3 50       19 if (/NCBI_TaxID=(\d+)/) {
344 3         14 $ncbi_taxid=$1;
345             }
346              
347 3         12 my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
348 3         6 foreach my $dblink ( @links ) {
349 1         6 $annotation->add_Annotation('dblink',$dblink);
350             }
351             }
352              
353             # References
354             elsif (/^R/) {
355 99         187 my @refs = $self->_read_EMBL_References(\$buffer);
356 99         122 foreach my $ref ( @refs ) {
357 99         211 $annotation->add_Annotation('reference',$ref);
358             }
359             }
360              
361             # DB Xrefs
362             elsif (/^DR/) {
363 5         32 my @links = $self->_read_EMBL_DBLink(\$buffer);
364 5         10 foreach my $dblink ( @links ) {
365 25         39 $annotation->add_Annotation('dblink',$dblink);
366             }
367             }
368              
369             # Comments
370             elsif (/^CC\s+(.*)/) {
371 15         40 $comment .= $1;
372 15         18 $comment .= " ";
373 15         37 while (defined ($_ = $self->_readline) ) {
374 257 100       519 if (/^CC\s+(.*)/) {
375 242         288 $comment .= $1;
376 242         325 $comment .= " ";
377             } else {
378 15         21 last;
379             }
380             }
381 15         96 my $commobj = Bio::Annotation::Comment->new();
382 15         45 $commobj->text($comment);
383 15         33 $annotation->add_Annotation('comment',$commobj);
384 15         31 $comment = "";
385             }
386              
387             # Get next line.
388 475         744 $buffer = $self->_readline;
389             }
390              
391 24         50 while ( defined ($_ = $self->_readline) ) {
392 46 100       114 /^FT\s{3}\w/ && last;
393 24 100       56 /^SQ / && last;
394 22 50       55 /^CO / && last;
395             }
396 24         27 $buffer = $_;
397              
398 24 100 66     116 if (defined($buffer) && $buffer =~ /^FT /) {
399 22         46 until ( !defined ($buffer) ) {
400 256         468 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
401              
402             # process ftunit
403 256         530 my $feat =
404             $ftunit->_generic_seqfeature($self->location_factory(), $name);
405              
406             # add taxon_id from source if available
407             # Notice, this will override what is found in the OX line.
408             # this is by design as this seems to be the official way
409             # of specifying a TaxID
410 256 100 100     673 if ($params{'-species'} && ($feat->primary_tag eq 'source')
      100        
      66        
411             && $feat->has_tag('db_xref')
412             && (! $params{'-species'}->ncbi_taxid())) {
413 12         32 foreach my $tagval ($feat->get_tag_values('db_xref')) {
414 12 50       46 if (index($tagval,"taxon:") == 0) {
415 12         38 $params{'-species'}->ncbi_taxid(substr($tagval,6));
416 12         24 last;
417             }
418             }
419             }
420              
421             # add feature to list of features
422 256         305 push(@features, $feat);
423              
424 256 100       1126 if ( $buffer !~ /^FT/ ) {
425 22         69 last;
426             }
427             }
428             }
429             # Set taxid found in OX line
430 24 100 100     129 if ($params{'-species'} && defined $ncbi_taxid
      100        
431             && (! $params{'-species'}->ncbi_taxid())) {
432 2         7 $params{'-species'}->ncbi_taxid($ncbi_taxid);
433             }
434              
435             # skip comments
436 24   66     147 while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437 20         49 $buffer = $self->_readline();
438             }
439            
440 24 100       82 if ( $buffer =~ /^CO/ ) {
441             # bug#2982
442             # special : create contig as annotation
443 4         11 while ( defined ($buffer) ) {
444 4         16 $annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
445 4 50 66     23 if ( !$buffer || $buffer !~ /^CO/ ) {
446 4         5 last;
447             }
448             }
449 4   100     11 $buffer ||= '';
450             }
451 24 100       66 if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
452 20 100       63 if ( $buffer !~ /^SQ/ ) {
453 1         3 while ( defined ($_ = $self->_readline) ) {
454 0 0       0 /^SQ/ && last;
455             }
456             }
457 20         24 $seqc = "";
458 20         48 while ( defined ($_ = $self->_readline) ) {
459 2924 100       3631 m{^//} && last;
460 2906         2431 $_ = uc($_);
461 2906         20861 s/[^A-Za-z]//g;
462 2906         4620 $seqc .= $_;
463             }
464             }
465 24         84 my $seq = $self->sequence_factory->create
466             (-verbose => $self->verbose(),
467             -division => $div,
468             -seq => $seqc,
469             -desc => $desc,
470             -display_id => $name,
471             -annotation => $annotation,
472             -molecule => $mol,
473             -alphabet => $alphabet,
474             -features => \@features,
475             %params);
476 24         197 return $seq;
477             }
478              
479              
480              
481             =head2 _write_ID_line
482              
483             Title : _write_ID_line
484             Usage : $self->_write_ID_line($seq);
485             Function: Writes the EMBL Release 87 format ID line to the stream, unless
486             : there is a user-supplied ID line generation function in which
487             : case that is used instead.
488             : ( See Bio::SeqIO::embl::_id_generation_function(). )
489             Returns : nothing
490             Args : Bio::Seq object
491              
492             =cut
493              
494             sub _write_ID_line {
495              
496 11     11   17 my ($self, $seq) = @_;
497              
498 11         14 my $id_line;
499             # If there is a user-supplied ID generation function, use it.
500 11 50       27 if ( $self->_id_generation_func ) {
501 0         0 $id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
  0         0  
502             }
503             # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
504             else {
505              
506             # The sequence name is supposed to be the primary accession number,
507 11         36 my $name = $seq->accession_number();
508 11 100 66     57 if ( not(defined $name) || $name eq 'unknown') {
509             # but if it is not present, use the sequence ID or the empty string
510 5   100     14 $name = $seq->id() || '';
511             }
512            
513 11 50       35 $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
514              
515             # Use the sequence version, or default to 1.
516 11   50     32 my $version = $seq->version() || 1;
517              
518 11         32 my $len = $seq->length();
519              
520             # Taxonomic division.
521 11         16 my $div;
522 11 100 100     122 if ( $seq->can('division') && defined($seq->division) &&
      66        
523             $self->_is_valid_division($seq->division) ) {
524 2         5 $div = $seq->division();
525             } else {
526 9   50     39 $div ||= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
527             }
528              
529 11         16 my $mol;
530             # If the molecule type is a valid EMBL type, use it.
531 11 50 100     86 if ( $seq->can('molecule')
    100 66        
      100        
532             && defined($seq->molecule)
533             && $self->_is_valid_molecule_type($seq->molecule)
534             ) {
535 0         0 $mol = $seq->molecule();
536             }
537             # Otherwise, choose unassigned DNA or RNA based on the alphabet.
538             elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539 6         10 my $alphabet =$seq->primary_seq->alphabet;
540 6 50       12 if ($alphabet eq 'dna') {
    0          
    0          
541 6         8 $mol ='unassigned DNA';
542             } elsif ($alphabet eq 'rna') {
543 0         0 $mol='unassigned RNA';
544             } elsif ($alphabet eq 'protein') {
545 0         0 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
546 0         0 $mol='AA'; # AA is not a valid EMBL molecule type.
547             }
548             }
549              
550 11         18 my $topology = 'linear';
551 11 50       31 if ($seq->is_circular) {
552 0         0 $topology = 'circular';
553             }
554              
555 11   100     34 $mol ||= ''; # 'unassigned'; ?
556 11         46 $id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
557 11         48 $self->_print($id_line);
558             }
559             }
560              
561             =head2 _is_valid_division
562              
563             Title : _is_valid_division
564             Usage : $self->_is_valid_division($div)
565             Function: tests division code for validity
566             Returns : true if $div is a valid EMBL release 87 taxonomic division.
567             Args : taxonomic division code string
568              
569             =cut
570              
571             sub _is_valid_division {
572 2     2   4 my ($self, $division) = @_;
573              
574 2         25 my %EMBL_divisions = (
575             "PHG" => 1, # Bacteriophage
576             "ENV" => 1, # Environmental Sample
577             "FUN" => 1, # Fungal
578             "HUM" => 1, # Human
579             "INV" => 1, # Invertebrate
580             "MAM" => 1, # Other Mammal
581             "VRT" => 1, # Other Vertebrate
582             "MUS" => 1, # Mus musculus
583             "PLN" => 1, # Plant
584             "PRO" => 1, # Prokaryote
585             "ROD" => 1, # Other Rodent
586             "SYN" => 1, # Synthetic
587             "UNC" => 1, # Unclassified
588             "VRL" => 1 # Viral
589             );
590              
591 2         10 return exists($EMBL_divisions{$division});
592             }
593              
594             =head2 _is_valid_molecule_type
595              
596             Title : _is_valid_molecule_type
597             Usage : $self->_is_valid_molecule_type($mol)
598             Function: tests molecule type for validity
599             Returns : true if $mol is a valid EMBL release 87 molecule type.
600             Args : molecule type string
601              
602             =cut
603              
604             sub _is_valid_molecule_type {
605 2     2   3 my ($self, $moltype) = @_;
606              
607 2         15 my %EMBL_molecule_types = (
608             "genomic DNA" => 1,
609             "genomic RNA" => 1,
610             "mRNA" => 1,
611             "tRNA" => 1,
612             "rRNA" => 1,
613             "snoRNA" => 1,
614             "snRNA" => 1,
615             "scRNA" => 1,
616             "pre-RNA" => 1,
617             "other RNA" => 1,
618             "other DNA" => 1,
619             "unassigned DNA" => 1,
620             "unassigned RNA" => 1
621             );
622              
623 2         29 return exists($EMBL_molecule_types{$moltype});
624             }
625              
626             =head2 write_seq
627              
628             Title : write_seq
629             Usage : $stream->write_seq($seq)
630             Function: writes the $seq object (must be seq) to the stream
631             Returns : 1 for success and undef for error
632             Args : array of 1 to n Bio::SeqI objects
633              
634              
635             =cut
636              
637             sub write_seq {
638 13     13 1 55 my ($self,@seqs) = @_;
639              
640 13         27 foreach my $seq ( @seqs ) {
641 13 50       32 $self->throw("Attempting to write with no seq!") unless defined $seq;
642 13 100 100     87 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643 4 50       12 $self->warn("$seq is not a SeqI compliant sequence object!")
644             if $self->verbose >= 0;
645 4 100 66     22 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646 2         27 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
647             }
648             }
649 11   100     38 my $str = $seq->seq || '';
650              
651             # Write the ID line.
652 11         30 $self->_write_ID_line($seq);
653              
654              
655             # Write the accession line if present
656 11         13 my( $acc );
657             {
658 11 50 66     10 if ( my $func = $self->_ac_generation_func ) {
  11 100       26  
    50          
659 0         0 $acc = &{$func}($seq);
  0         0  
660             } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661             defined($seq->accession_number) ) {
662 4         7 $acc = $seq->accession_number;
663 4         10 $acc = join("; ", $acc, $seq->get_secondary_accessions);
664             } elsif ( $seq->can('accession_number') ) {
665 7         15 $acc = $seq->accession_number;
666             }
667              
668 11 50       22 if (defined $acc) {
669 11 50       37 $self->_print("AC $acc;\n",
670             "XX\n") || return;
671             }
672             }
673              
674             # Date lines
675 11         12 my $switch=0;
676 11 100       58 if ( $seq->can('get_dates') ) {
677 4         8 my @dates = $seq->get_dates();
678 4         4 my $ct = 1;
679 4         3 my $date_flag = 0;
680 4         9 my ($cr) = $seq->annotation->get_Annotations("creation_release");
681 4         12 my ($ur) = $seq->annotation->get_Annotations("update_release");
682 4         8 my ($uv) = $seq->annotation->get_Annotations("update_version");
683              
684 4 0 33     11 unless ($cr && $ur && $ur) {
      33        
685 4         5 $date_flag = 1;
686             }
687              
688 4         7 foreach my $dt (@dates) {
689 0 0       0 if (!$date_flag) {
690 0 0       0 $self->_write_line_EMBL_regex("DT ","DT ",
691             $dt." (Rel. ".($cr->value()).", Created)",
692             '\s+|$',80) if $ct == 1;
693 0 0       0 $self->_write_line_EMBL_regex("DT ","DT ",
694             $dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695             '\s+|$',80) if $ct == 2;
696             } else { # other formats?
697 0         0 $self->_write_line_EMBL_regex("DT ","DT ",
698             $dt,'\s+|$',80);
699             }
700 0         0 $switch =1;
701 0         0 $ct++;
702             }
703 4 50       8 if ($switch == 1) {
704 0 0       0 $self->_print("XX\n") || return;
705             }
706             }
707              
708             # Description lines
709 11 50       31 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80) || return; #'
710 11 50       28 $self->_print( "XX\n") || return;
711              
712             # if there, write the kw line
713             {
714 11         17 my( $kw );
  11         563  
715 11 50       28 if ( my $func = $self->_kw_generation_func ) {
    100          
716 0         0 $kw = &{$func}($seq);
  0         0  
717             } elsif ( $seq->can('keywords') ) {
718 4         9 $kw = $seq->keywords;
719             }
720 11 100       25 if (defined $kw) {
721 4 50       7 $self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80) || return; #'
722 4 50       7 $self->_print( "XX\n") || return;
723             }
724             }
725              
726             # Organism lines
727              
728 11 100 100     53 if ($seq->can('species') && (my $spec = $seq->species)) {
729 6         21 my @class = $spec->classification();
730 6         9 shift @class; # get rid of species name. Some embl files include
731             # the species name in the OC lines, but this seems
732             # more like an error than something we need to
733             # emulate
734 6         24 my $OS = $spec->scientific_name;
735 6 50       15 if ($spec->common_name) {
736 0         0 $OS .= ' ('.$spec->common_name.')';
737             }
738 6 50       20 $self->_print("OS $OS\n") || return;
739 6         19 my $OC = join('; ', reverse(@class)) .'.';
740 6 50       13 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80) || return;
741 6 50       20 if ($spec->organelle) {
742 0 0       0 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
743             }
744 6         15 my $ncbi_taxid = $spec->ncbi_taxid;
745 6 100       12 if ($ncbi_taxid) {
746 2 50       6 $self->_print("OX NCBI_TaxID=$ncbi_taxid\n") || return;
747             }
748 6 50       12 $self->_print("XX\n") || return;
749             }
750             # Reference lines
751 11         12 my $t = 1;
752 11 100 66     63 if ( $seq->can('annotation') && defined $seq->annotation ) {
753 9         18 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754 6 50       15 $self->_print( "RN [$t]\n") || return;
755              
756             # Having no RP line is legal, but we need both
757             # start and end for a valid location.
758 6 50       14 if ($ref->comment) {
759 0 0       0 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80) || return; #'
760             }
761 6         15 my $start = $ref->start;
762 6         15 my $end = $ref->end;
763 6 50 33     21 if ($start and $end) {
    0 0        
764 6 50       16 $self->_print( "RP $start-$end\n") || return;
765             } elsif ($start or $end) {
766 0         0 $self->throw("Both start and end are needed for a valid RP line.".
767             " Got: start='$start' end='$end'");
768             }
769              
770 6 50       11 if (my $med = $ref->medline) {
771 0 0       0 $self->_print( "RX MEDLINE; $med.\n") || return;
772             }
773 6 50       9 if (my $pm = $ref->pubmed) {
774 0 0       0 $self->_print( "RX PUBMED; $pm.\n") || return;
775             }
776 6         19 my $authors = $ref->authors;
777 6         59 $authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
778              
779 6 50       16 $self->_write_line_EMBL_regex("RA ", "RA ",
780             $authors . ";",
781             '\s+|$', 80) || return; #'
782              
783             # If there is no title to the reference, it appears
784             # as a single semi-colon. All titles must end in
785             # a semi-colon.
786 6   100     13 my $ref_title = $ref->title || '';
787 6         37 $ref_title =~ s/[\s;]*$/;/;
788 6 50       12 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80) || return; #'
789 6 50       12 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80) || return; #'
790 6 50       9 $self->_print("XX\n") || return;
791 6         10 $t++;
792             }
793              
794             # DB Xref lines
795 9 100       23 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796 2         2 for my $dr (@db_xref) {
797 43         59 my $db_name = $dr->database;
798 43         49 my $prim = $dr->primary_id;
799              
800 43   50     47 my $opt = $dr->optional_id || '';
801 43 50       69 my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
802 43 50       47 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80) || return; #'
803             }
804 2 50       4 $self->_print("XX\n") || return;
805             }
806            
807             # Comment lines
808 9         20 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809 4 50       10 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80) || return; #'
810 4 50       8 $self->_print("XX\n") || return;
811             }
812             }
813             # "\\s\+\|\$"
814              
815             ## FEATURE TABLE
816              
817 11 50       26 $self->_print("FH Key Location/Qualifiers\n") || return;
818 11 50       23 $self->_print("FH\n") || return;
819              
820 11 100       65 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
821 11 100       24 if ($feats[0]) {
822 6 50       14 if ( defined $self->_post_sort ) {
823             # we need to read things into an array.
824             # Process. Sort them. Print 'em
825              
826 0         0 my $post_sort_func = $self->_post_sort();
827 0         0 my @fth;
828              
829 0         0 foreach my $sf ( @feats ) {
830 0         0 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
831             }
832              
833 0         0 @fth = sort { &$post_sort_func($a,$b) } @fth;
  0         0  
834              
835 0         0 foreach my $fth ( @fth ) {
836 0 0       0 $self->_print_EMBL_FTHelper($fth) || return;
837             }
838             } else {
839             # not post sorted. And so we can print as we get them.
840             # lower memory load...
841              
842 6         10 foreach my $sf ( @feats ) {
843 38         65 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
844 38         34 foreach my $fth ( @fth ) {
845 38 50       52 if ( $fth->key eq 'CONTIG') {
846 0         0 $self->_show_dna(0);
847             }
848 38 50       66 $self->_print_EMBL_FTHelper($fth) || return;
849             }
850             }
851             }
852             }
853              
854 11 50       25 if ( $self->_show_dna() == 0 ) {
855 0 0       0 $self->_print( "//\n") || return;
856 0         0 return;
857             }
858 11 50       24 $self->_print( "XX\n") || return;
859              
860             # finished printing features.
861              
862             # print contig if present : bug#2982
863 11 100 66     62 if ( $seq->can('annotation') && defined $seq->annotation) {
864 9         19 foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
865 0 0       0 if ($ctg->value) {
866 0 0       0 $self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867             '[,]|$', 80) || return;
868             }
869             }
870             }
871             # print sequence lines only if sequence is present! bug#2982
872 11 100       26 if (length($str)) {
873 8         24 $str =~ tr/A-Z/a-z/;
874              
875             # Count each nucleotide
876 8         19 my $alen = $str =~ tr/a/a/;
877 8         15 my $clen = $str =~ tr/c/c/;
878 8         15 my $glen = $str =~ tr/g/g/;
879 8         16 my $tlen = $str =~ tr/t/t/;
880            
881 8         15 my $len = $seq->length();
882 8         13 my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
883 8 50       19 if ( $olen < 0 ) {
884 0         0 $self->warn("Weird. More atgc than bases. Problem!");
885             }
886            
887 8 50       37 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
888              
889 8         12 my $nuc = 60; # Number of nucleotides per line
890 8         14 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
891 8         9 my $out_pat = 'A11' x 6; # Pattern for packing a line
892 8         7 my $length = length($str);
893            
894             # Calculate the number of nucleotides which fit on whole lines
895 8         25 my $whole = int($length / $nuc) * $nuc;
896              
897             # Print the whole lines
898 8         6 my( $i );
899 8         19 for ($i = 0; $i < $whole; $i += $nuc) {
900 162         366 my $blocks = pack $out_pat,
901             unpack $whole_pat,
902             substr($str, $i, $nuc);
903 162 50       347 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) || return;
904             }
905              
906             # Print the last line
907 8 50       26 if (my $last = substr($str, $i)) {
908 8         8 my $last_len = length($last);
909 8         31 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
910 8         62 my $blocks = pack $out_pat,
911             unpack($last_pat, $last);
912 8 50       41 $self->_print(sprintf(" $blocks%9d\n", $length)) ||
913             return; # Add the length to the end
914             }
915             }
916            
917 11 50       27 $self->_print( "//\n") || return;
918            
919 11 50 33     24 $self->flush if $self->_flush_on_write && defined $self->_fh;
920             }
921 11         41 return 1;
922             }
923              
924             =head2 _print_EMBL_FTHelper
925              
926             Title : _print_EMBL_FTHelper
927             Usage :
928             Function: Internal function
929             Returns : 1 if writing suceeded, otherwise undef
930             Args :
931              
932              
933             =cut
934              
935             sub _print_EMBL_FTHelper {
936 38     38   34 my ($self,$fth) = @_;
937              
938 38 50 33     119 if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939 0         0 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
940             }
941              
942              
943             #$self->_print( "FH Key Location/Qualifiers\n");
944             #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
945             # let
946 38 50       48 if ( $fth->key eq 'CONTIG' ) {
947 0 0       0 $self->_print("XX\n") || return;
948 0 0       0 $self->_write_line_EMBL_regex("CO ",
949             "CO ",$fth->loc,
950             '\,|$',80) || return; #'
951 0         0 return 1;
952             }
953 38 50       56 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
954             "FT ",$fth->loc,
955             '\,|$',80) || return; #'
956 38         37 foreach my $tag ( keys %{$fth->field} ) {
  38         64  
957 108 50       147 if ( ! defined $fth->field->{$tag} ) {
958 0         0 next;
959             }
960 108         91 foreach my $value ( @{$fth->field->{$tag}} ) {
  108         121  
961 139         141 $value =~ s/\"/\"\"/g;
962 139 50 66     346 if ($value eq "_no_value") {
    50          
963 0 0       0 $self->_write_line_EMBL_regex("FT ",
964             "FT ",
965             "/$tag",'.|$',80) || return; #'
966             }
967             # there are almost 3x more quoted qualifier values and they
968             # are more common too so we take quoted ones first
969             #
970             # Long qualifiers, that will be line wrapped, are always quoted
971             elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972 139 100       264 my $pat = $value =~ /\s+/ ? '\s+|\-|$' : '.|\-|$';
973 139 50       278 $self->_write_line_EMBL_regex("FT ",
974             "FT ",
975             "/$tag=\"$value\"",$pat,80) || return;
976             } else {
977 0 0       0 $self->_write_line_EMBL_regex("FT ",
978             "FT ",
979             "/$tag=$value",'.|$',80) || return; #'
980             }
981             }
982             }
983              
984 38         136 return 1;
985             }
986              
987              
988              
989             =head2 _read_EMBL_Contig()
990              
991             Title : _read_EMBL_Contig
992             Usage :
993             Function: convert CO lines into annotations
994             Returns :
995             Args :
996              
997             =cut
998              
999             sub _read_EMBL_Contig {
1000 4     4   5 my ($self, $buffer) = @_;
1001 4         7 my @ret;
1002 4 50       11 if ( $$buffer !~ /^CO/ ) {
1003 0         0 warn("Not parsing line '$$buffer' which maybe important");
1004             }
1005 4         11 $self->_pushback($$buffer);
1006 4         6 while ( defined ($_ = $self->_readline) ) {
1007 356 100       752 /^C/ || last;
1008 353 50       775 /^CO\s+(.*)/ && do {
1009 353         767 push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010             -value => $1);
1011             };
1012             }
1013 4         7 $$buffer = $_;
1014 4         48 return @ret;
1015              
1016             }
1017             #'
1018             =head2 _read_EMBL_References
1019              
1020             Title : _read_EMBL_References
1021             Usage :
1022             Function: Reads references from EMBL format. Internal function really
1023             Example :
1024             Returns :
1025             Args :
1026              
1027              
1028             =cut
1029              
1030             sub _read_EMBL_References {
1031 99     99   90 my ($self,$buffer) = @_;
1032 99         74 my (@refs);
1033              
1034             # assume things are starting with RN
1035              
1036 99 50       198 if ( $$buffer !~ /^RN/ ) {
1037 0         0 warn("Not parsing line '$$buffer' which maybe important");
1038             }
1039 99         76 my $b1;
1040             my $b2;
1041 0         0 my $title;
1042 0         0 my $loc;
1043 0         0 my $au;
1044 0         0 my $med;
1045 0         0 my $pm;
1046 0         0 my $com;
1047              
1048 99         160 while ( defined ($_ = $self->_readline) ) {
1049 727 100       1230 /^R/ || last;
1050 628 100       806 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
  40         83  
  40         54  
1051 628 100       797 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
  55         81  
1052 628 100       730 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
  10         15  
1053 628 100       832 /^RA (.*)/ && do {
1054 193         341 $au = $self->_concatenate_lines($au,$1); next;
  193         297  
1055             };
1056 435 100       615 /^RT (.*)/ && do {
1057 164         213 $title = $self->_concatenate_lines($title,$1); next;
  164         220  
1058             };
1059 271 100       425 /^RL (.*)/ && do {
1060 156         224 $loc = $self->_concatenate_lines($loc,$1); next;
  156         238  
1061             };
1062 115 100       227 /^RC (.*)/ && do {
1063 3         10 $com = $self->_concatenate_lines($com,$1); next;
  3         4  
1064             };
1065             }
1066              
1067 99         302 my $ref = Bio::Annotation::Reference->new();
1068 99         317 $au =~ s/;\s*$//g;
1069 99         198 $title =~ s/;\s*$//g;
1070              
1071 99         188 $ref->start($b1);
1072 99         166 $ref->end($b2);
1073 99         135 $ref->authors($au);
1074 99         144 $ref->title($title);
1075 99         124 $ref->location($loc);
1076 99         136 $ref->medline($med);
1077 99         145 $ref->comment($com);
1078 99         130 $ref->pubmed($pm);
1079              
1080 99         91 push(@refs,$ref);
1081 99         101 $$buffer = $_;
1082              
1083 99         194 return @refs;
1084             }
1085              
1086             =head2 _read_EMBL_Species
1087              
1088             Title : _read_EMBL_Species
1089             Usage :
1090             Function: Reads the EMBL Organism species and classification
1091             lines.
1092             Example :
1093             Returns : A Bio::Species object
1094             Args : a reference to the current line buffer, accession number
1095              
1096             =cut
1097              
1098             sub _read_EMBL_Species {
1099 21     21   30 my( $self, $buffer, $acc ) = @_;
1100 21         20 my $org;
1101              
1102 21         26 $_ = $$buffer;
1103 21         26 my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104 21   66     77 while (defined( $_ ||= $self->_readline )) {
1105 92 100       349 if (/^OS\s+(.+)/) {
    100          
    50          
1106 21 50       70 $sci_name .= ($sci_name) ? ' '.$1 : $1;
1107             } elsif (s/^OC\s+(.+)$//) {
1108 50         102 $class_lines .= $1;
1109             } elsif (/^OG\s+(.*)/) {
1110 0         0 $org = $1;
1111             } else {
1112 21         38 last;
1113             }
1114              
1115 71         191 $_ = undef; # Empty $_ to trigger read of next line
1116             }
1117              
1118             # $$buffer = $_;
1119 21         69 $self->_pushback($_);
1120            
1121 21         32 $sci_name =~ s{\.$}{};
1122 21 50       39 $sci_name || return;
1123              
1124             # Convert data in classification lines into classification array.
1125             # only split on ';' or '.' so that classification that is 2 or more words
1126             # will still get matched, use map() to remove trailing/leading/intervening
1127             # spaces
1128 21         161 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?
  224         258  
  224         194  
  224         164  
  224         213  
1129              
1130             # do we have a genus?
1131 21         42 my $possible_genus = $class[-1];
1132 21 50       61 $possible_genus .= "|$class[-2]" if $class[-2];
1133 21 50       511 if ($sci_name =~ /^($possible_genus)/) {
1134 21         42 $genus = $1;
1135 21         468 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1136             } else {
1137 0         0 $species = $sci_name;
1138             }
1139              
1140             # Don't make a species object if it is "Unknown" or "None"
1141 21 50       62 if ($genus) {
1142 21 50       75 return if $genus =~ /^(Unknown|None)$/i;
1143             }
1144              
1145             # is this organism of rank species or is it lower?
1146             # (doesn't catch everything, but at least the guess isn't dangerous)
1147 21 50       85 if ($species =~ /subsp\.|var\./) {
1148 0         0 ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1149             }
1150              
1151             # sometimes things have common name in brackets, like
1152             # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153             # name bit. Probably dangerous if real scientific species name ends in
1154             # bracketed bit.
1155 21 50       54 unless ($class[-1] eq 'Viruses') {
1156 21         70 ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1157 21 100       69 $sci_name =~ s/\s+\(.+\)$// if $common;
1158             }
1159              
1160             # Bio::Species array needs array in Species -> Kingdom direction
1161 21 50       54 unless ($class[-1] eq $sci_name) {
1162 21         30 push(@class, $sci_name);
1163             }
1164 21         27 @class = reverse @class;
1165              
1166             # do minimal sanity checks before we hand off to Bio::Species which won't
1167             # be able to give informative throw messages if it has to throw because
1168             # of problems here
1169 21 50       40 $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170 21         24 my %names;
1171 21         68 foreach my $i (0..$#class) {
1172 245         166 my $name = $class[$i];
1173 245         272 $names{$name}++;
1174             # this code breaks examples like: Xenopus (Silurana) tropicalis
1175             # commenting out, see bug 3158
1176            
1177             #if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178             # $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179             #}
1180             }
1181 21         129 my $make = Bio::Species->new();
1182 21         53 $make->scientific_name($sci_name);
1183 21         60 $make->classification(@class);
1184 21 50       56 unless ($class[-1] eq 'Viruses') {
1185 21 50       96 $make->genus($genus) if $genus;
1186 21 100       63 $make->species($species) if $species;
1187 21 50       42 $make->sub_species($sub_species) if $sub_species;
1188 21 100       61 $make->common_name($common) if $common;
1189             }
1190 21 50       38 $make->organelle($org) if $org;
1191 21         103 return $make;
1192             }
1193              
1194             =head2 _read_EMBL_DBLink
1195              
1196             Title : _read_EMBL_DBLink
1197             Usage :
1198             Function: Reads the EMBL database cross reference ("DR") lines
1199             Example :
1200             Returns : A list of Bio::Annotation::DBLink objects
1201             Args :
1202              
1203             =cut
1204              
1205             sub _read_EMBL_DBLink {
1206 5     5   6 my( $self,$buffer ) = @_;
1207 5         5 my( @db_link );
1208              
1209 5         5 $_ = $$buffer;
1210 5   66     21 while (defined( $_ ||= $self->_readline )) {
1211 30 100       118 if ( /^DR ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1212 25         54 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213 25         82 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1214             -primary_id => $prim_id,
1215             -optional_id => $sec_id);
1216              
1217 25         41 push(@db_link, $link);
1218             } else {
1219 5         7 last;
1220             }
1221 25         67 $_ = undef; # Empty $_ to trigger read of next line
1222             }
1223              
1224 5         8 $$buffer = $_;
1225 5         12 return @db_link;
1226             }
1227              
1228             =head2 _read_EMBL_TaxID_DBLink
1229              
1230             Title : _read_EMBL_TaxID_DBLink
1231             Usage :
1232             Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233             Example :
1234             Returns : A list of Bio::Annotation::DBLink objects
1235             Args :
1236              
1237             =cut
1238              
1239             sub _read_EMBL_TaxID_DBLink {
1240 3     3   5 my( $self,$buffer ) = @_;
1241 3         3 my( @db_link );
1242              
1243 3         5 $_ = $$buffer;
1244 3   66     13 while (defined( $_ ||= $self->_readline )) {
1245 4 100       19 if ( /^OX (\S+)=(\d+);$/ ) {
1246 1         4 my ($databse, $prim_id) = ($1,$2);
1247 1         11 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1248             -primary_id => $prim_id,);
1249 1         4 push(@db_link, $link);
1250             } else {
1251 3         4 last;
1252             }
1253 1         6 $_ = undef; # Empty $_ to trigger read of next line
1254             }
1255              
1256 3         5 $$buffer = $_;
1257 3         9 return @db_link;
1258             }
1259              
1260             =head2 _filehandle
1261              
1262             Title : _filehandle
1263             Usage : $obj->_filehandle($newval)
1264             Function:
1265             Example :
1266             Returns : value of _filehandle
1267             Args : newvalue (optional)
1268              
1269              
1270             =cut
1271              
1272             sub _filehandle{
1273 0     0   0 my ($obj,$value) = @_;
1274 0 0       0 if ( defined $value) {
1275 0         0 $obj->{'_filehandle'} = $value;
1276             }
1277 0         0 return $obj->{'_filehandle'};
1278              
1279             }
1280              
1281             =head2 _read_FTHelper_EMBL
1282              
1283             Title : _read_FTHelper_EMBL
1284             Usage : _read_FTHelper_EMBL($buffer)
1285             Function: reads the next FT key line
1286             Example :
1287             Returns : Bio::SeqIO::FTHelper object
1288             Args : filehandle and reference to a scalar
1289              
1290              
1291             =cut
1292              
1293             sub _read_FTHelper_EMBL {
1294 256     256   216 my ($self,$buffer) = @_;
1295              
1296 256         187 my ($key, # The key of the feature
1297             $loc, # The location line from the feature
1298             @qual, # An arrray of lines making up the qualifiers
1299             );
1300              
1301 256 50       814 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
    0          
1302 256         386 $key = $1;
1303 256         274 $loc = $2;
1304             # Read all the lines up to the next feature
1305 256         481 while ( defined($_ = $self->_readline) ) {
1306 2038 100       9036 if (/^FT(\s+)(.+?)\s*$/) {
1307             # Lines inside features are preceeded by 19 spaces
1308             # A new feature is preceeded by 3 spaces
1309 2016 100       2608 if (length($1) > 4) {
1310             # Add to qualifiers if we're in the qualifiers
1311 1782 100       2024 if (@qual) {
    100          
1312 1523         2949 push(@qual, $2);
1313             }
1314             # Start the qualifier list if it's the first qualifier
1315             elsif (substr($2, 0, 1) eq '/') {
1316 254         642 @qual = ($2);
1317             }
1318             # We're still in the location line, so append to location
1319             else {
1320 5         18 $loc .= $2;
1321             }
1322             } else {
1323             # We've reached the start of the next feature
1324 234         243 last;
1325             }
1326             } else {
1327             # We're at the end of the feature table
1328 22         29 last;
1329             }
1330             }
1331             } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332 0         0 $key = 'CONTIG';
1333 0         0 $loc = $1;
1334             # Read all the lines up to the next feature
1335 0         0 while ( defined($_ = $self->_readline) ) {
1336 0 0       0 if (/^CO\s+(\S+)\s*$/) {
1337 0         0 $loc .= $1;
1338             } else {
1339             # We've reached the start of the next feature
1340 0         0 last;
1341             }
1342             }
1343             } else {
1344             # No feature key
1345 0         0 return;
1346             }
1347              
1348             # Put the first line of the next feature into the buffer
1349 256         230 $$buffer = $_;
1350              
1351             # Make the new FTHelper object
1352 256         625 my $out = Bio::SeqIO::FTHelper->new();
1353 256         440 $out->verbose($self->verbose());
1354 256         393 $out->key($key);
1355 256         381 $out->loc($loc);
1356              
1357             # Now parse and add any qualifiers. (@qual is kept
1358             # intact to provide informative error messages.)
1359 256         181 my $last_unquoted_qualifier;
1360             QUAL:
1361 256         449 for (my $i = 0; $i < @qual; $i++) {
1362 1091         901 my $data = $qual[$i];
1363 1091         674 my ( $qualifier, $value );
1364            
1365 1091 50       3513 unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.*))?})) {
1366 0 0       0 if ( defined $last_unquoted_qualifier ) {
1367             # handle case of unquoted multiline - read up everything until the next qualifier
1368 0   0     0 do {
1369             # Protein sequence translations need to be joined without spaces,
1370             # other qualifiers need those.
1371 0 0       0 $value .= ' ' if $qualifier ne "translation";
1372 0         0 $value .= $data;
1373             } while defined($data = $qual[++$i]) && $data !~ m[^/];
1374 0         0 $i--;
1375 0         0 $out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376 0         0 $last_unquoted_qualifier = undef;
1377 0         0 next QUAL;
1378             } else {
1379 0         0 $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380             . join('', map "$_\n", @qual));
1381             }
1382             }
1383 1091 50       1351 $qualifier = '' if not defined $qualifier;
1384              
1385 1091         670 $last_unquoted_qualifier = undef;
1386 1091 50       1064 if (defined $value) {
1387             # Do we have a quoted value?
1388 1091 100       1331 if (substr($value, 0, 1) eq '"') {
1389             # Keep adding to value until we find the trailing quote
1390             # and the quotes are balanced
1391             QUOTES:
1392 971   66     3337 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393 686         430 $i++;
1394 686         454 my $next = $qual[$i];
1395 686 50       701 if (!defined($next)) {
1396 0         0 $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397             "\nAdding quote to close...".
1398             "Check sequence quality!");
1399 0         0 $value .= '"';
1400 0         0 last QUOTES;
1401             }
1402              
1403             # Protein sequence translations need to be joined without spaces,
1404             # other qualifiers need those.
1405 686 100       659 if ($qualifier eq "translation") {
1406 569         1241 $value .= $next;
1407             } else {
1408 117         454 $value .= " $next";
1409             }
1410             }
1411             # Trim leading and trailing quotes
1412 971         6297 $value =~ s/^"|"$//g;
1413             # Undouble internal quotes
1414 971         902 $value =~ s/""/"/g; #"
1415             } else {
1416 120         122 $last_unquoted_qualifier = $qualifier;
1417             }
1418             } else {
1419 0         0 $value = '_no_value';
1420             }
1421              
1422             # Store the qualifier
1423 1091   100     1465 $out->field->{$qualifier} ||= [];
1424 1091         813 push(@{$out->field->{$qualifier}},$value);
  1091         1193  
1425             }
1426              
1427 256         456 return $out;
1428             }
1429              
1430             =head2 _write_line_EMBL
1431              
1432             Title : _write_line_EMBL
1433             Usage :
1434             Function: internal function
1435             Example :
1436             Returns : 1 if writing suceeded, else undef
1437             Args :
1438              
1439              
1440             =cut
1441              
1442             sub _write_line_EMBL {
1443 0     0   0 my ($self,$pre1,$pre2,$line,$length) = @_;
1444              
1445 0 0       0 $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446 0         0 my $subl = $length - length $pre2;
1447 0         0 my $linel = length $line;
1448 0         0 my $i;
1449              
1450 0         0 my $sub = substr($line,0,$length - length $pre1);
1451              
1452 0 0       0 $self->_print( "$pre1$sub\n") || return;
1453              
1454 0         0 for ($i= ($length - length $pre1);$i < $linel;) {
1455 0         0 $sub = substr($line,$i,($subl));
1456 0 0       0 $self->_print( "$pre2$sub\n") || return;
1457 0         0 $i += $subl;
1458             }
1459              
1460 0         0 return 1;
1461             }
1462              
1463             =head2 _write_line_EMBL_regex
1464              
1465             Title : _write_line_EMBL_regex
1466             Usage :
1467             Function: internal function for writing lines of specified
1468             length, with different first and the next line
1469             left hand headers and split at specific points in the
1470             text
1471             Example :
1472             Returns : nothing
1473             Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1474              
1475              
1476             =cut
1477              
1478             sub _write_line_EMBL_regex {
1479 263     263   269 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480              
1481             #print STDOUT "Going to print with $line!\n";
1482              
1483 263 50       310 $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484              
1485 263         244 my $subl = $length - (length $pre1) -1 ;
1486 263         157 my( @lines );
1487              
1488 263         308 CHUNK: while($line) {
1489 401         419 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1490 401 50       2879 if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491 401         644 my $l = $1.$2;
1492 401 100       505 $l =~ s/#/ /g # remove word wrap protection char '#'
1493             if $pre1 eq "RA ";
1494 401         377 my $newl = $3;
1495 401         423 $line = substr($line,length($l));
1496             # be strict about not padding spaces according to
1497             # genbank format
1498 401         783 $l =~ s/\s+$//;
1499 401 50       477 next CHUNK if ($l eq '');
1500 401         327 push(@lines, $l);
1501 401         703 next CHUNK;
1502             }
1503             }
1504             # if we get here none of the patterns matched $subl or less chars
1505 0         0 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1506             "of $subl chars or less - this tag won't print right");
1507             # insert a space char to prevent infinite loops
1508 0         0 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1509             }
1510 263         226 my $s = shift @lines;
1511 263 100 50     683 ($self->_print("$pre1$s\n") || return) if $s;
1512 263         309 foreach my $s ( @lines ) {
1513 145 50       212 $self->_print("$pre2$s\n") || return;
1514             }
1515              
1516 263         531 return 1;
1517             }
1518              
1519             =head2 _post_sort
1520              
1521             Title : _post_sort
1522             Usage : $obj->_post_sort($newval)
1523             Function:
1524             Returns : value of _post_sort
1525             Args : newvalue (optional)
1526              
1527              
1528             =cut
1529              
1530             sub _post_sort{
1531 6     6   7 my $obj = shift;
1532 6 50       13 if ( @_ ) {
1533 0         0 my $value = shift;
1534 0         0 $obj->{'_post_sort'} = $value;
1535             }
1536 6         14 return $obj->{'_post_sort'};
1537              
1538             }
1539              
1540             =head2 _show_dna
1541              
1542             Title : _show_dna
1543             Usage : $obj->_show_dna($newval)
1544             Function:
1545             Returns : value of _show_dna
1546             Args : newvalue (optional)
1547              
1548              
1549             =cut
1550              
1551             sub _show_dna{
1552 42     42   50 my $obj = shift;
1553 42 100       89 if ( @_ ) {
1554 31         34 my $value = shift;
1555 31         49 $obj->{'_show_dna'} = $value;
1556             }
1557 42         59 return $obj->{'_show_dna'};
1558              
1559             }
1560              
1561             =head2 _id_generation_func
1562              
1563             Title : _id_generation_func
1564             Usage : $obj->_id_generation_func($newval)
1565             Function:
1566             Returns : value of _id_generation_func
1567             Args : newvalue (optional)
1568              
1569              
1570             =cut
1571              
1572             sub _id_generation_func{
1573 11     11   11 my $obj = shift;
1574 11 50       25 if ( @_ ) {
1575 0         0 my $value = shift;
1576 0         0 $obj->{'_id_generation_func'} = $value;
1577             }
1578 11         24 return $obj->{'_id_generation_func'};
1579              
1580             }
1581              
1582             =head2 _ac_generation_func
1583              
1584             Title : _ac_generation_func
1585             Usage : $obj->_ac_generation_func($newval)
1586             Function:
1587             Returns : value of _ac_generation_func
1588             Args : newvalue (optional)
1589              
1590              
1591             =cut
1592              
1593             sub _ac_generation_func{
1594 11     11   23 my $obj = shift;
1595 11 50       25 if ( @_ ) {
1596 0         0 my $value = shift;
1597 0         0 $obj->{'_ac_generation_func'} = $value;
1598             }
1599 11         101 return $obj->{'_ac_generation_func'};
1600              
1601             }
1602              
1603             =head2 _sv_generation_func
1604              
1605             Title : _sv_generation_func
1606             Usage : $obj->_sv_generation_func($newval)
1607             Function:
1608             Returns : value of _sv_generation_func
1609             Args : newvalue (optional)
1610              
1611              
1612             =cut
1613              
1614             sub _sv_generation_func{
1615 0     0   0 my $obj = shift;
1616 0 0       0 if ( @_ ) {
1617 0         0 my $value = shift;
1618 0         0 $obj->{'_sv_generation_func'} = $value;
1619             }
1620 0         0 return $obj->{'_sv_generation_func'};
1621              
1622             }
1623              
1624             =head2 _kw_generation_func
1625              
1626             Title : _kw_generation_func
1627             Usage : $obj->_kw_generation_func($newval)
1628             Function:
1629             Returns : value of _kw_generation_func
1630             Args : newvalue (optional)
1631              
1632              
1633             =cut
1634              
1635             sub _kw_generation_func{
1636 11     11   15 my $obj = shift;
1637 11 50       21 if ( @_ ) {
1638 0         0 my $value = shift;
1639 0         0 $obj->{'_kw_generation_func'} = $value;
1640             }
1641 11         61 return $obj->{'_kw_generation_func'};
1642              
1643             }
1644              
1645             1;