File Coverage

blib/lib/Chemistry/File.pm
Criterion Covered Total %
statement 104 140 74.2
branch 35 60 58.3
condition 16 27 59.2
subroutine 22 29 75.8
pod 20 21 95.2
total 197 277 71.1


line stmt bran cond sub pod time code
1             package Chemistry::File;
2              
3             our $VERSION = '0.40'; # VERSION
4              
5             =head1 NAME
6              
7             Chemistry::File - Molecule file I/O base class
8              
9             =head1 SYNOPSIS
10              
11             # As a convenient interface for several mol readers:
12             use Chemistry::File qw(PDB MDLMol); # load PDB and MDL modules
13            
14             # or try to use every file I/O module installed in the system:
15             use Chemistry::File ':auto';
16              
17             my $mol1 = Chemistry::Mol->read("file.pdb");
18             my $mol2 = Chemistry::Mol->read("file.mol");
19              
20              
21             # as a base for a mol reader:
22              
23             package Chemistry::File::Myfile;
24             use base qw(Chemistry::File);
25             use Chemistry::Mol;
26             Chemistry::Mol->register_format("myfile", __PACKAGE__);
27              
28             # override the read_mol method
29             sub read_mol {
30             my ($self, $fh, %opts) = @_;
31             my $mol_class = $opts{mol_class} || "Chemistry::Mol";
32             my $mol = $mol_class->new;
33             # ... do some stuff with $fh and $mol ...
34             return $mol;
35             }
36              
37             # override the write_mol method
38             sub write_mol {
39             my ($self, $fh, $mol, %opts) = @_;
40             print $fh $mol->name, "\n";
41             # ... do some stuff with $fh and $mol ...
42             }
43              
44             =head1 DESCRIPTION
45              
46             The main use of this module is as a base class for other molecule file I/O
47             modules (for example, Chemistry::File::PDB). Such modules should override and
48             extend the Chemistry::File methods as needed. You only need to care about the
49             methods here if you are writing a file I/O module or if you want a finer
50             degree of control than what is offered by the simple read and write methods
51             in the Chemistry::Mol class.
52              
53             From the user's point of view, this module can also be used as shorthand
54             for using several Chemistry::File modules at the same time.
55              
56             use Chemistry::File qw(PDB MDLMol);
57              
58             is exactly equivalent to
59              
60             use Chemistry::File::PDB;
61             use Chemistry::File::MDLMol;
62              
63             If you use the :auto keyword, Chemistry::File will autodetect and load
64             all the Chemistry::File::* modules installed in your system.
65              
66             use Chemistry::File ':auto';
67              
68             =head1 FILE I/O MODEL
69              
70             Before version 0.30, file I/O modules typically used only parse_string,
71             write_string, parse_file, and write_file, and they were generally used as class
72             methods. A file could contain one or more molecules and only be read or written
73             whole; reading it would return every molecule on the file. This was problematic
74             when dealing with large multi-molecule files (such as SDF files), because all
75             the molecules would have to be loaded into memory at the same time.
76              
77             While version 0.30 retains backward compatibility with that simple model, it
78             also allows a more flexible interface that allows reading one molecule at a
79             time, skipping molecules, and reading and writing file-level information that
80             is not associated with specific molecules. The following diagram shows the
81             global structure of a file according to the new model:
82              
83             +-----------+
84             | header |
85             +-----------+
86             | molecule |
87             +-----------+
88             | molecule |
89             +-----------+
90             | ... |
91             +-----------+
92             | footer |
93             +-----------+
94              
95             In cases where the header and the footer are empty, the model reduces to the
96             pre-0.30 version. The low-level steps to read a file are the following:
97              
98             $file = Chemistry::File::MyFormat->new(file => 'xyz.mol');
99             $file->open('<');
100             $file->read_header;
101             while (my $mol = $file->read_mol($file->fh, %opts)) {
102             # do something with $mol...
103             }
104             $file->read_footer;
105              
106             The C<read> method does all the above automatically, and it stores all the
107             molecules read in the mols property.
108              
109             =head1 STANDARD OPTIONS
110              
111             All the methods below include a list of options %opts at the end of the
112             parameter list. Each class implementing this interface may have its own
113             particular options. However, the following options should be recognized by all
114             classes:
115              
116             =over
117              
118             =item mol_class
119              
120             A class or object with a C<new> method that constructs a molecule. This is
121             needed when the user wants to specify a molecule subclass different from the
122             default. When this option is not defined, the module may use Chemistry::Mol
123             or whichever class is appropriate for that file format.
124              
125             =item format
126              
127             The name of the file format being used, as registered by
128             Chemistry::Mol->register_format.
129              
130             =item fatal
131              
132             If true, parsing errors should throw an exception; if false, they should just
133             try to recover if possible. True by default.
134              
135             =back
136              
137             =head1 CLASS METHODS
138              
139             The class methods in this class (or rather, its derived classes) are usually
140             not called directly. Instead, use Chemistry::Mol->read, write, print, parse,
141             and file. These methods also work if called as instance methods.
142              
143             =over
144              
145              
146             =cut
147              
148 13     13   108809 use strict;
  13         37  
  13         528  
149 13     13   70 use warnings;
  13         26  
  13         4295  
150 13     13   78 no warnings qw(uninitialized);
  13         26  
  13         584  
151 13     13   76 use Carp;
  13         29  
  13         1154  
152 13     13   7078 use FileHandle;
  13         189035  
  13         97  
153 13     13   5101 use base qw(Chemistry::Obj);
  13         28  
  13         18832  
154             # don't blame our problems in the Chemistry::Mol module ;-)
155             our @CARP_NOT = qw(Chemistry::Mol);
156              
157             # This subroutine implements the :auto functionality
158             sub import {
159 13     13   289 my $pack = shift;
160 13         21314 for my $param (@_){
161 0 0       0 if ($param eq ':auto') {
162 0         0 for my $pmfile (map {glob "$_/Chemistry/File/*.pm"} @INC) {
  0         0  
163 0         0 my ($pm) = $pmfile =~ m|(Chemistry/File/.*\.pm)$|;
164             #warn "requiring $pm\n";
165 0         0 eval { require $pm };
  0         0  
166 0 0       0 die "Error in Chemistry::File: '$@'; pmfile='$pmfile'; pm='$pm'\n" if $@;
167             }
168             } else {
169 0         0 eval "use ${pack}::$param";
170 0 0       0 die "$@" if $@;
171             }
172             }
173             }
174              
175             =item $class->parse_string($s, %options)
176              
177             Parse a string $s and return one or more molecule objects. This is an abstract
178             method, so it should be provided by all derived classes.
179              
180             =cut
181              
182             sub parse_string {
183 0     0 1 0 my ($self, $s, %opts) = @_;
184 0 0       0 if ($opts{_must_override}) {
185 0   0     0 my $class = ref $self || $self;
186 0         0 croak "parse_string() is not implemented for $class";
187             }
188 0         0 $self->new(file => \$s, opts => \%opts)->read;
189             }
190              
191              
192             =item $class->write_string($mol, %options)
193              
194             Convert a molecule to a string. This is an abstract method, so it should be
195             provided by all derived classes.
196              
197             =cut
198              
199             sub write_string {
200 3     3 1 12 my ($self, $mol, %opts) = @_;
201 3 50       12 if ($opts{_must_override}) {
202 0   0     0 my $class = ref $self || $self;
203 0         0 croak "write_string() is not implemented for $class";
204             }
205 3         6 my $s;
206 3         12 $self->new(file => \$s, mols => [$mol], opts => \%opts)->write;
207 3         10 $s;
208             }
209              
210             =item $class->parse_file($file, %options)
211              
212             Reads the file $file and returns one or more molecules. The default method
213             slurps the whole file and then calls parse_string, but derived classes may
214             choose to override it. $file can be a filehandle, a filename, or a scalar
215             reference. See C<new> for details.
216              
217             =cut
218              
219             sub parse_file {
220 13     13 1 59 my ($self, $file, %opts) = @_;
221 13         125 $self->new(file => $file, opts => \%opts)->read;
222             }
223              
224             =item $class->write_file($mol, $file, %options)
225              
226             Writes a file $file containing the molecule $mol. The default method calls
227             write_string first and then saves the string to a file, but derived classes
228             may choose to override it. $file can be either a filehandle or a filename.
229              
230             =cut
231              
232             sub write_file {
233 3     3 1 10 my ($self, $mol, $file, %opts) = @_;
234              
235 3         20 $self->new(file => $file, mols => [$mol], opts => \%opts)->write;
236             }
237              
238             =item $class->name_is($fname, %options)
239              
240             Returns true if a filename is of the format corresponding to the class.
241             It should look at the filename only, because it may be called with
242             non-existent files. It is used to determine with which format to save a file.
243             For example, the Chemistry::File::PDB returns true if the file ends in .pdb.
244              
245             =cut
246              
247             sub name_is {
248 0     0 1 0 0;
249             }
250              
251             =item $class->string_is($s, %options)
252              
253             Examines the string $s and returns true if it has the format of the class.
254              
255             =cut
256              
257             sub string_is {
258 0     0 1 0 0;
259             }
260              
261             =item $class->file_is($file, %options)
262              
263             Examines the file $file and returns true if it has the format of the class.
264             The default method slurps the whole file and then calls string_is, but derived
265             classes may choose to override it.
266              
267             =cut
268              
269             sub file_is {
270 11     11 1 38 my ($self, $file, %opts) = @_;
271            
272 11         23 my $s = eval {
273 11         72 $self->open('<');
274 0         0 $self->slurp;
275             };
276 11 50       1122 if ($s) {
    50          
277 0         0 $self->string_is($s, %opts);
278             } elsif (! ref $file) {
279 11         80 $self->name_is($file, %opts);
280             }
281             }
282              
283             =item $class->slurp
284              
285             Reads a file into a scalar. Automatic decompression of gzipped files is
286             supported if the Compress::Zlib module is installed. Files ending in .gz are
287             assumed to be compressed; otherwise it is possible to force decompression by
288             passing the gzip => 1 option (or no decompression with gzip => 0).
289              
290             =cut
291              
292             # slurp a file into a scalar, with transparent decompression
293             sub slurp {
294 0     0 1 0 my ($self) = @_;
295              
296 0         0 my $fh = $self->fh;
297 0         0 local $/;
298 0         0 <$fh>;
299             }
300              
301             =item $class->new(file => $file, opts => \%opts)
302              
303             Create a new file object. This method is usually called indirectly via
304             the Chemistry::Mol->file method. $file may be a scalar with a filename, an
305             open filehandle, or a reference to a scalar. If a reference to a scalar is
306             used, the string contained in the scalar is used as an in-memory file.
307              
308             =cut
309              
310             sub new {
311 24     24 1 143589 my $self = shift->SUPER::new(@_);
312 24 50       152 $self->{opts}{fatal} = 1 unless exists $self->{opts}{fatal};
313 24         169 $self;
314             }
315              
316             Chemistry::Obj::accessor(qw(file fh opts mols mode));
317              
318             =back
319              
320             =head1 INSTANCE METHODS
321              
322             =head2 Accessors
323              
324             Chemistry::File objects are derived from Chemistry::Obj and have the same
325             properties (name, id, and type), as well as the following ones:
326              
327             =over
328              
329             =item file
330              
331             The "file" as described above under C<new>.
332              
333             =item fh
334              
335             The filehandle used for reading and writing molecules. It is opened by C<open>.
336              
337             =item opts
338              
339             A hashref containing the options that are passed through to the old-style class
340             methods. They are also passed to the instance method to keep a similar
341             interface, but they could access them via $self->opts anyway.
342              
343             =item mode
344              
345             '>' if the file is open for writing, '<' for reading, and false if not open.
346              
347             =item mols
348              
349             C<read> stores all the molecules that were read in this property as an array
350             reference. C<write> gets the molecules to write from here.
351              
352             =back
353              
354             =head2 Abstract methods
355              
356             These methods should be overridden, because they don't really do much by
357             default.
358              
359             =over
360              
361             =item $file->read_header
362              
363             Read whatever information is available in the file before the first molecule.
364             Does nothing by default.
365              
366             =cut
367              
368       15 1   sub read_header { }
369              
370             =item $file->read_footer
371              
372             Read whatever information is available in the file after the last molecule.
373             Does nothing by default.
374              
375             =cut
376              
377       15 1   sub read_footer { }
378              
379             =item $self->slurp_mol($fh)
380              
381             Reads from the input string until the end of the current molecule and returns
382             the "slurped" string. It does not parse the string. It returns undefined if
383             there are no more molecules in the file. This method should be overridden if
384             needed; by default, it slurps until the end of the file.
385              
386             =cut
387              
388             sub slurp_mol {
389 0     0 1 0 my ($self, $fh) = @_;
390 0         0 local $/; <$fh>;
  0         0  
391             }
392              
393             =item $self->skip_mol($fh)
394              
395             Similar to slurp_mol, but it doesn't need to return anything except true or
396             false. It should also be overridden if needed; by default, it just calls
397             slurp_mol.
398              
399             =cut
400              
401 0     0 1 0 sub skip_mol { shift->slurp_mol(@_) }
402              
403             =item $file->read_mol($fh, %opts)
404              
405             Read the next molecule in the input stream. It returns false if there are no
406             more molecules in the file. This method should be overridden by derived
407             classes; otherwise it will call slurp_mol and parse_string (for backwards
408             compatibility; it is recommended to override read_mol directly in new modules).
409              
410             Note: some old file I/O modules (written before the 0.30 interface) may return
411             more than one molecule anyway, so it is recommended to call read_mol in list
412             context to be safe:
413              
414             ($mol) = $file->read_mol($fh, %opts);
415              
416             =cut
417              
418             sub read_mol {
419 4     4 1 7 my ($self, $fh, %opts) = @_;
420 4         8 my $s = $self->slurp_mol($fh);
421 4 100 66     27 return unless defined $s and length $s;
422 3         8 $self->parse_string($s, %opts, _must_override => 1);
423             }
424             =item $file->write_header
425              
426             Write whatever information is needed before the first molecule.
427             Does nothing by default.
428              
429             =cut
430              
431       6 0   sub write_header { }
432              
433             =item $file->write_footer
434              
435             Write whatever information is needed after the last molecule.
436             Does nothing by default.
437              
438             =cut
439              
440       6 1   sub write_footer { }
441              
442             =item $self->write_mol($fh, $mol, %opts)
443              
444             Write one molecule to $fh. By default and for backward compatibility, it just
445             calls C<write_string> and prints its return value to $self->fh. New classes
446             should override it.
447              
448             =cut
449              
450             sub write_mol {
451 0     0 1 0 my ($self, $fh, $mol, %opts) = @_;
452 0         0 print $fh $self->write_string($mol, %opts, _must_override => 1);
453             }
454              
455             ########################## OTHER ##################################
456              
457             =back
458              
459             =head2 Other methods
460              
461             =over
462              
463             =item $self->open($mode)
464              
465             Opens the file (held in $self->file) for reading by default, or for writing if
466             $mode eq '>'. This method sets $self->fh transparently regardless of whether
467             $self->file is a filename (compressed or not), a scalar reference, or a
468             filehandle.
469              
470             =cut
471              
472             sub open {
473 34     34 1 90 my ($self, $mode) = @_;
474 34         61 my $fh;
475             my $s;
476 34   50     101 $mode ||= '<';
477 34         158 $self->mode($mode);
478 34         103 my $file = $self->file;
479 34 100       2532 croak "Chemistry::File::open: no file supplied" unless defined $file;
480 23 100 66     348 if (ref $file eq 'SCALAR') {
    50 66        
    100          
481 4 50       14 croak "decompression only supported for files" if $self->{opts}{gzip};
482 4 50       10 if ($] >= 5.008) {
483 4         45 open $fh, $mode, $file;
484             } else {
485 0         0 require IO::String;
486 0         0 $fh = IO::String->new($$file);
487             }
488             } elsif (ref $file) {
489 0 0       0 croak "decompression only supported for files" if $self->{opts}{gzip};
490 0         0 $fh = $file;
491             } elsif ($self->{opts}{gzip}
492             or !defined $self->{opts}{gzip} and $file =~ /.gz$/)
493             {
494 4 50       7 eval { require Compress::Zlib } # Carp
  4         81  
495             or croak "Compress::Zlib not installed!";
496 4         1894 require File::Temp;
497              
498 4         9783 $fh = File::Temp::tempfile();
499 4   100     3301 $self->{opts}{gzip} ||= 1;
500 4 100       21 unless ($mode eq '>') {
501 2 50       12 my $gz = Compress::Zlib::gzopen($file, "rb")
502             or croak "Cannot open compressed $file: "
503             . "$Compress::Zlib::gzerrno\n";
504              
505 2         4413 my $buffer;
506 2         13 print $fh $buffer while $gz->gzread($buffer) > 0;
507            
508 2 50       2788 if ($Compress::Zlib::gzerrno != Compress::Zlib::Z_STREAM_END()) {
509 0         0 croak "Error reading from $file: $Compress::Zlib::gzerrno"
510             . ($Compress::Zlib::gzerrno+0) . "\n";
511             }
512 2         17 $gz->gzclose();
513 2         363 seek $fh, 0, 0;
514             }
515             } else {
516 15 100       187 $fh = FileHandle->new("$mode$file")
517             or croak "Could not open file $file: $!";
518             }
519 22         2282 $self->fh($fh);
520 22         48 $self;
521             }
522              
523             =item $self->close
524              
525             Close the file. For regular files this just closes the filehandle, but for
526             gzipped files it does some additional postprocessing. This method is called
527             automatically on object destruction, so it is not mandatory to call it
528             explicitly.
529              
530             =cut
531              
532             sub close {
533 46     46 1 96 my ($self) = @_;
534 46         124 my $fh = $self->fh;
535 46 100 100     186 if ($fh and $self->mode eq '>' and $self->{opts}{gzip}) {
      100        
536 2   50     6 my $level = $self->{opts}{gzip} || 6;
537 2 50       11 $level = 6 if $level == 1;
538 2         8 my $file = $self->file;
539 2 50       10 if (ref $file) {
540 0         0 croak "compression only supported for files";
541             } else {
542 2         146 seek $fh, 0, 0;
543 2 50       20 my $gz = Compress::Zlib::gzopen($file, "wb$level")
544             or croak "Cannot open $file $Compress::Zlib::gzerrno\n";
545 2         4442 local $_;
546 2         56 while (<$fh>) {
547 508 50       35953 $gz->gzwrite($_)
548             or croak "error writing: $Compress::Zlib::gzerrno\n";
549             }
550 2         276 $gz->gzclose;
551             }
552             }
553 46 100       1395 if ($self->mode) {
554 23 50       61 if ($fh) { $fh->close or croak "$!" };
  22 100       153  
555 23         914 $self->mode('');
556             }
557             }
558              
559 24     24   1495 sub DESTROY { shift->close }
560              
561             =item $file->read
562              
563             Read the whole file. This calls open, read_header, read_mol until there are no
564             more molecules left, read_footer, and close. Returns a list of molecules if
565             called in list context, or the first molecule in scalar context.
566              
567             =cut
568              
569             sub read {
570 17     17 1 948 my ($self) = @_;
571 17         73 $self->open('<');
572 16         76 $self->read_header;
573 16         28 my @all_mols;
574 16         153 $self->mols(\@all_mols);
575 16         55 while (my @mols = $self->read_mol($self->fh, %{$self->{opts}})) {
  34         183  
576 18         138 push @all_mols, @mols;
577             }
578 16         113 $self->read_footer;
579 16         86 $self->close;
580 16 100       259 wantarray ? @all_mols : $all_mols[0];
581             }
582              
583             =item $self->write
584              
585             Write all the molecules in $self->mols. It just calls open, write_header,
586             write_mol (per each molecule), write_footer, and close.
587              
588             =cut
589              
590             sub write {
591 6     6 1 16 my ($self) = @_;
592 6         22 $self->open('>');
593 6         30 $self->write_header;
594 6         10 for my $mol (@{$self->mols}) {
  6         18  
595 6         18 $self->write_mol($self->fh, $mol, %{$self->{opts}});
  6         39  
596             }
597 6         5515 $self->write_footer;
598 6         20 $self->close;
599             }
600              
601             1;
602              
603             =back
604              
605             =head1 CAVEATS
606              
607             The :auto feature may not be entirely portable, but it is known to work under
608             Unix and Windows (either Cygwin or ActiveState).
609              
610             =head1 SOURCE CODE REPOSITORY
611              
612             L
613              
614             =head1 SEE ALSO
615              
616             L<Chemistry::Mol>
617              
618             =head1 AUTHOR
619              
620             Ivan Tubert-Brohman
621              
622             =head1 COPYRIGHT
623              
624             Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
625             free software; you can redistribute it and/or modify it under the same terms as
626             Perl itself.
627              
628             =cut
629