File Coverage

blib/lib/Chemistry/File/PDB.pm

Criterion	Covered	Total	%
statement	81	97	83.5
branch	18	36	50.0
condition	6	13	46.1
subroutine	12	12	100.0
pod	4	4	100.0
total	121	162	74.6

line	stmt	bran	cond	sub	pod	time	code
1							package Chemistry::File::PDB;
2
3							our $VERSION = '0.23';
4							# $Id: PDB.pm,v 1.13 2009/05/10 21:57:32 itubert Exp $
5
6	2			2		85539	use base qw(Chemistry::File);
	2					4
	2					3194
7	2			2		75433	use Chemistry::MacroMol;
	2					178360
	2					291
8	2			2		3699	use Chemistry::Domain;
	2					1870
	2					99
9	2			2		13	use Scalar::Util qw(weaken);
	2					4
	2					291
10	2			2		12	use Carp;
	2					6
	2					221
11	2			2		10	use strict;
	2					5
	2					55
12	2			2		13	use warnings;
	2					85
	2					3103
13
14							=head1 NAME
15
16							Chemistry::File::PDB - Protein Data Bank file format reader/writer
17
18							=head1 SYNOPSIS
19
20							use Chemistry::File::PDB;
21
22							# read a PDB file
23							my $macro_mol = Chemistry::MacroMol->read("myfile.pdb");
24
25							# write a PDB file
26							$macro_mol->write("out.pdb");
27
28							# read all models in a multi-model file
29							my @mols = Chemistry::MacroMol->read("models.pdb");
30
31							# read one model at a time
32							my $file = Chemistry::MacroMol->file("models.pdb");
33							$file->open;
34							while (my $mol = $file->read_mol($file->fh)) {
35							# do something with $mol
36							}
37
38							=cut
39
40							=head1 DESCRIPTION
41
42							This module reads and writes PDB files. The PDB file format is commonly used to
43							describe proteins, particularly those stored in the Protein Data Bank
44							(L). The current version of this module only reads the
45							following record types, ignoring everything else:
46
47							ATOM
48							HETATM
49							ENDMDL
50							END
51
52							This module automatically registers the 'pdb' format with Chemistry::Mol,
53							so that PDB files may be identified and read by Chemistry::Mol->read(). For
54							autodetection purpuses, it assumes that files ending in .pdb or having
55							a line matching /^(ATOM \|HETATM)/ are PDB files.
56
57							The PDB reader and writer is designed for dealing with Chemistry::MacroMol
58							objects, but it can also create and use Chemistry::Mol objects by throwing some
59							information away.
60
61							=head2 Properties
62
63							When reading and writing files, this module stores or gets some of the
64							information in the following places:
65
66							=over
67
68							=item $domain->type
69
70							The residue type, such as "ARG".
71
72							=item $domain->name
73
74							The type and sequence number, such as "ARG114".
75
76							=item $domain->attr("pdb/sequence_number")
77
78							The residue sequence number as given in the PDB file.
79
80							=item $domain->attr("pdb/chain_id")
81
82							The chain to which this residue belongs (one character).
83
84							=item $domain->attr("pdb/insertion_code")
85
86							The residue insertion code (see the PDB specification for details).
87
88							=item $atom->name
89
90							The PDB atom name, such as "CA".
91
92							=item $atom->attr("pdb/residue_name")
93
94							The name of the residue, as discussed above.
95
96							=item $atom->attr("pdb/serial_number")
97
98							The serial number for the atom, as given in the PDB file.
99
100							=back
101
102							If some of this information is not available when writing a PDB file, this
103							module tries to make it up (by counting the atoms or residues, for example).
104							The default residue name for writing is UNK (unknown). Atom names are just the
105							atomic symbols.
106
107							=head2 Multi-model files
108
109							If a PDB file has multiple models (separated by END or ENDMDL records), each
110							call to read_mol will return one model.
111
112							=head2 Output features
113
114							On writing Chemistry::Mol objects, which don't have macromolecule information
115							and usually don't have atom names, the atom names are made up by concatenating
116							the atomic symbol with a unique ID (up to 1296 atoms are possible). The ID can
117							be disabled by setting the option 'noid':
118
119							$mol->write("out.pdb", noid => 1);
120
121							The molecule's name is written as a HEADER record; A REMARK record is added
122							listing the version of Chemistry::File::PDB that was used.
123
124							=cut
125
126							Chemistry::Mol->register_format(pdb => __PACKAGE__);
127
128							sub read_mol {
129	6			6	1	1232	my ($self, $fh, %options) = @_;
130	6	100				65	return if $fh->eof;
131
132	3					109	my $domain;
133	3		50			15	my $mol_class = $options{mol_class} \|\| "Chemistry::MacroMol";
134	3					31	my $mol = $mol_class->new;
135	3					425	my $is_macro = $mol->isa('Chemistry::MacroMol');
136	3	100				13	$domain = $mol unless $is_macro;
137
138	3					7	local $_;
139	3					9	my $curr_residue = 0;
140	3					24	while (<$fh>) {
141	423	100				13408	if (/^TER/) {
		100
		50
142							# TODO read separated molecules?
143							} elsif (/^(?:HETATM\|ATOM)/) {
144	417					4415	my ($atom_n, $symbol, $suff, $res_name, $chain_id,
145							$seq_n, $ins_code, $x, $y, $z
146							) = unpack "x6A5x1A2A2x1A3x1A1A4A1x3A8A8A8", $_;
147							#print "S:$symbol; N:$name; x:$x; y:$y; z:$z\n";
148	417					866	$seq_n *= 1;
149	417	100	100			1751	if (!$domain \|\| $seq_n != $curr_residue) {
150							# new residue
151	30	100				322	if ($is_macro) {
152	20					156	$domain = Chemistry::Domain->new(
153							parent => $mol, name => "$res_name$seq_n",
154							type => $res_name);
155	20					2063	$mol->add_domain($domain);
156	20					261	weaken $domain;
157	20					70	$domain->attr('pdb/sequence_number', $seq_n);
158	20					418	$domain->attr('pdb/chain_id', $chain_id);
159	20					453	$domain->attr('pdb/insertion_code', $ins_code);
160							}
161	30					230	$curr_residue = $seq_n;
162							}
163	417					5492	$symbol =~ s/ //g;
164	417					879	my $atom_name = $symbol.$suff;
165	417					537	$atom_name =~ s/ //g;
166	417					861	$symbol = ucfirst(lc($symbol));
167	417					3259	my $a = $domain->new_atom(
168							symbol => $symbol,
169							coords => [$x, $y, $z],
170							name => $atom_name,
171							);
172	417					74307	$a->attr('pdb/residue_name', "$res_name$seq_n");
173	417					7682	$a->attr('pdb/serial_number', $atom_n*1);
174							#$a->attr('pdb/residue', $domain); # XXX causes memory leak
175							} elsif (/^END(MDL)?/) {
176	3					64	return $mol;
177							}
178							}
179	0					0	return $mol;
180							}
181
182
183							sub name_is {
184	1			1	1	180	my ($class, $fname) = @_;
185	1					8	$fname =~ /\.pdb$/i;
186							}
187
188							sub file_is {
189	3			3	1	1007	my ($class, $fname) = @_;
190
191	3	50				30	return 1 if $fname =~ /\.pdb$/i;
192
193	0	0				0	open F, $fname or croak "Could not open file $fname";
194
195	0					0	while (){
196	0	0	0			0	if (/^ATOM / or /^HETATM/) {
197	0					0	close F;
198	0					0	return 1;
199							}
200							}
201
202	0					0	return 0;
203							}
204
205							sub write_string {
206	1			1	1	350	my ($class, $mol, %opts) = @_;
207	1					3	my $ret = '';
208	1	50				11	$ret .= 'TITLE ' . $mol->name . "\n" if defined $mol->name;
209	1					20	$ret .= 'REMARK Generated by ' . __PACKAGE__ . " version $VERSION\n";
210	1	50				8	if ($mol->isa("Chemistry::MacroMol")) {
211	1					2	my $i = 1;
212	1					2	my $j = 1;
213	1					7	for my $res ($mol->domains) {
214	10					47	my $seq_n = $res->attr("pdb/sequence_number");
215	10	50				101	$seq_n = $j++ unless defined $seq_n;
216	10		50			29	my $res_name = substr($res->name, 0, 3) \|\| "UNK";
217
218	10					95	for my $atom ($res->atoms) {
219	139					1880	my $serial_n = $res->attr("pdb/serial_number");
220	139					1366	my $ins_code = $res->attr("pdb/insertion_code");
221	139	50				1773	$serial_n = $i++ unless defined $serial_n;
222	139					430	my @coords = $atom->coords->array;
223
224							# make atom name
225	2			2		16	no warnings 'uninitialized';
	2					5
	2					1221
226	139					2008	my $symbol = substr $atom->symbol, 0, 2;
227	139		33			1082	my $atom_name = $atom->name \|\| $symbol;
228	139					2495	$atom_name =~ /^(\dH\|$symbol)(.{0,2})/;
229							#print "NAME: '$atom_name' ($1,$2); SYMBOL: '$symbol'\n";
230	139					521	$atom_name = sprintf "%2s%-2s", $1, $2;
231	139					1574	$ret .= sprintf "ATOM %5d %4s %-3s %4d%1s %8.3f%8.3f%8.3f\n",
232							$serial_n, $atom_name, $res_name, $seq_n, $ins_code, @coords;
233							}
234							}
235							} else {
236	0					0	my $i = 1;
237	0					0	for my $atom ($mol->atoms) {
238	0					0	my @coords = $atom->coords->array;
239	0	0				0	if ($atom->name) {
240	0					0	$ret .= sprintf "ATOM %5d %4s %-3s %4d %8.3f%8.3f%8.3f\n",
241							$i++, $atom->name, "UNK", 1, @coords;
242							} else {
243	0					0	my ($i0, $i1) = ($i % 36, int($i/36));
244	0	0				0	my $id = ($i1 < 10 ? $i1 : chr(55+$i1))
		0
245							. ($i0 < 10 ? $i0 : chr(55+$i0));
246	0	0				0	$id = ' ' if $opts{noid};
247	0					0	$ret .= sprintf "HETATM%5d %2s%2s UNK 1 %8.3f%8.3f%8.3f\n",
248							$i++, $atom->symbol, $id, @coords;
249							}
250							}
251							}
252	1					5	$ret .= "TER\nEND\n";
253	1					49	$ret;
254							}
255
256							1;
257
258							=head1 VERSION
259
260							0.23
261
262							=head1 SEE ALSO
263
264							L, L, L,
265							L.
266
267							The PDB format description at
268							L
269
270							There is another PDB reader in Perl, as part of the BioPerl project:
271							L.
272
273							=head1 AUTHOR
274
275							Ivan Tubert-Brohman
276
277							=head1 COPYRIGHT
278
279							Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
280							free software; you can redistribute it and/or modify it under the same terms as
281							Perl itself.
282
283							=cut
284
285
286