line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Chemistry::File::PDB; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.23'; |
4
|
|
|
|
|
|
|
# $Id: PDB.pm,v 1.13 2009/05/10 21:57:32 itubert Exp $ |
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
85539
|
use base qw(Chemistry::File); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
3194
|
|
7
|
2
|
|
|
2
|
|
75433
|
use Chemistry::MacroMol; |
|
2
|
|
|
|
|
178360
|
|
|
2
|
|
|
|
|
291
|
|
8
|
2
|
|
|
2
|
|
3699
|
use Chemistry::Domain; |
|
2
|
|
|
|
|
1870
|
|
|
2
|
|
|
|
|
99
|
|
9
|
2
|
|
|
2
|
|
13
|
use Scalar::Util qw(weaken); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
291
|
|
10
|
2
|
|
|
2
|
|
12
|
use Carp; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
221
|
|
11
|
2
|
|
|
2
|
|
10
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
55
|
|
12
|
2
|
|
|
2
|
|
13
|
use warnings; |
|
2
|
|
|
|
|
85
|
|
|
2
|
|
|
|
|
3103
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Chemistry::File::PDB - Protein Data Bank file format reader/writer |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Chemistry::File::PDB; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# read a PDB file |
23
|
|
|
|
|
|
|
my $macro_mol = Chemistry::MacroMol->read("myfile.pdb"); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# write a PDB file |
26
|
|
|
|
|
|
|
$macro_mol->write("out.pdb"); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# read all models in a multi-model file |
29
|
|
|
|
|
|
|
my @mols = Chemistry::MacroMol->read("models.pdb"); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# read one model at a time |
32
|
|
|
|
|
|
|
my $file = Chemistry::MacroMol->file("models.pdb"); |
33
|
|
|
|
|
|
|
$file->open; |
34
|
|
|
|
|
|
|
while (my $mol = $file->read_mol($file->fh)) { |
35
|
|
|
|
|
|
|
# do something with $mol |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=cut |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 DESCRIPTION |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
This module reads and writes PDB files. The PDB file format is commonly used to |
43
|
|
|
|
|
|
|
describe proteins, particularly those stored in the Protein Data Bank |
44
|
|
|
|
|
|
|
(L). The current version of this module only reads the |
45
|
|
|
|
|
|
|
following record types, ignoring everything else: |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
ATOM |
48
|
|
|
|
|
|
|
HETATM |
49
|
|
|
|
|
|
|
ENDMDL |
50
|
|
|
|
|
|
|
END |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
This module automatically registers the 'pdb' format with Chemistry::Mol, |
53
|
|
|
|
|
|
|
so that PDB files may be identified and read by Chemistry::Mol->read(). For |
54
|
|
|
|
|
|
|
autodetection purpuses, it assumes that files ending in .pdb or having |
55
|
|
|
|
|
|
|
a line matching /^(ATOM |HETATM)/ are PDB files. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
The PDB reader and writer is designed for dealing with Chemistry::MacroMol |
58
|
|
|
|
|
|
|
objects, but it can also create and use Chemistry::Mol objects by throwing some |
59
|
|
|
|
|
|
|
information away. |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=head2 Properties |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
When reading and writing files, this module stores or gets some of the |
64
|
|
|
|
|
|
|
information in the following places: |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=over |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=item $domain->type |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
The residue type, such as "ARG". |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item $domain->name |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
The type and sequence number, such as "ARG114". |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=item $domain->attr("pdb/sequence_number") |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
The residue sequence number as given in the PDB file. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item $domain->attr("pdb/chain_id") |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
The chain to which this residue belongs (one character). |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=item $domain->attr("pdb/insertion_code") |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
The residue insertion code (see the PDB specification for details). |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item $atom->name |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The PDB atom name, such as "CA". |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=item $atom->attr("pdb/residue_name") |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
The name of the residue, as discussed above. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=item $atom->attr("pdb/serial_number") |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
The serial number for the atom, as given in the PDB file. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=back |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
If some of this information is not available when writing a PDB file, this |
103
|
|
|
|
|
|
|
module tries to make it up (by counting the atoms or residues, for example). |
104
|
|
|
|
|
|
|
The default residue name for writing is UNK (unknown). Atom names are just the |
105
|
|
|
|
|
|
|
atomic symbols. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head2 Multi-model files |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
If a PDB file has multiple models (separated by END or ENDMDL records), each |
110
|
|
|
|
|
|
|
call to read_mol will return one model. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head2 Output features |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
On writing Chemistry::Mol objects, which don't have macromolecule information |
115
|
|
|
|
|
|
|
and usually don't have atom names, the atom names are made up by concatenating |
116
|
|
|
|
|
|
|
the atomic symbol with a unique ID (up to 1296 atoms are possible). The ID can |
117
|
|
|
|
|
|
|
be disabled by setting the option 'noid': |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
$mol->write("out.pdb", noid => 1); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
The molecule's name is written as a HEADER record; A REMARK record is added |
122
|
|
|
|
|
|
|
listing the version of Chemistry::File::PDB that was used. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=cut |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
Chemistry::Mol->register_format(pdb => __PACKAGE__); |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub read_mol { |
129
|
6
|
|
|
6
|
1
|
1232
|
my ($self, $fh, %options) = @_; |
130
|
6
|
100
|
|
|
|
65
|
return if $fh->eof; |
131
|
|
|
|
|
|
|
|
132
|
3
|
|
|
|
|
109
|
my $domain; |
133
|
3
|
|
50
|
|
|
15
|
my $mol_class = $options{mol_class} || "Chemistry::MacroMol"; |
134
|
3
|
|
|
|
|
31
|
my $mol = $mol_class->new; |
135
|
3
|
|
|
|
|
425
|
my $is_macro = $mol->isa('Chemistry::MacroMol'); |
136
|
3
|
100
|
|
|
|
13
|
$domain = $mol unless $is_macro; |
137
|
|
|
|
|
|
|
|
138
|
3
|
|
|
|
|
7
|
local $_; |
139
|
3
|
|
|
|
|
9
|
my $curr_residue = 0; |
140
|
3
|
|
|
|
|
24
|
while (<$fh>) { |
141
|
423
|
100
|
|
|
|
13408
|
if (/^TER/) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# TODO read separated molecules? |
143
|
|
|
|
|
|
|
} elsif (/^(?:HETATM|ATOM)/) { |
144
|
417
|
|
|
|
|
4415
|
my ($atom_n, $symbol, $suff, $res_name, $chain_id, |
145
|
|
|
|
|
|
|
$seq_n, $ins_code, $x, $y, $z |
146
|
|
|
|
|
|
|
) = unpack "x6A5x1A2A2x1A3x1A1A4A1x3A8A8A8", $_; |
147
|
|
|
|
|
|
|
#print "S:$symbol; N:$name; x:$x; y:$y; z:$z\n"; |
148
|
417
|
|
|
|
|
866
|
$seq_n *= 1; |
149
|
417
|
100
|
100
|
|
|
1751
|
if (!$domain || $seq_n != $curr_residue) { |
150
|
|
|
|
|
|
|
# new residue |
151
|
30
|
100
|
|
|
|
322
|
if ($is_macro) { |
152
|
20
|
|
|
|
|
156
|
$domain = Chemistry::Domain->new( |
153
|
|
|
|
|
|
|
parent => $mol, name => "$res_name$seq_n", |
154
|
|
|
|
|
|
|
type => $res_name); |
155
|
20
|
|
|
|
|
2063
|
$mol->add_domain($domain); |
156
|
20
|
|
|
|
|
261
|
weaken $domain; |
157
|
20
|
|
|
|
|
70
|
$domain->attr('pdb/sequence_number', $seq_n); |
158
|
20
|
|
|
|
|
418
|
$domain->attr('pdb/chain_id', $chain_id); |
159
|
20
|
|
|
|
|
453
|
$domain->attr('pdb/insertion_code', $ins_code); |
160
|
|
|
|
|
|
|
} |
161
|
30
|
|
|
|
|
230
|
$curr_residue = $seq_n; |
162
|
|
|
|
|
|
|
} |
163
|
417
|
|
|
|
|
5492
|
$symbol =~ s/ //g; |
164
|
417
|
|
|
|
|
879
|
my $atom_name = $symbol.$suff; |
165
|
417
|
|
|
|
|
537
|
$atom_name =~ s/ //g; |
166
|
417
|
|
|
|
|
861
|
$symbol = ucfirst(lc($symbol)); |
167
|
417
|
|
|
|
|
3259
|
my $a = $domain->new_atom( |
168
|
|
|
|
|
|
|
symbol => $symbol, |
169
|
|
|
|
|
|
|
coords => [$x, $y, $z], |
170
|
|
|
|
|
|
|
name => $atom_name, |
171
|
|
|
|
|
|
|
); |
172
|
417
|
|
|
|
|
74307
|
$a->attr('pdb/residue_name', "$res_name$seq_n"); |
173
|
417
|
|
|
|
|
7682
|
$a->attr('pdb/serial_number', $atom_n*1); |
174
|
|
|
|
|
|
|
#$a->attr('pdb/residue', $domain); # XXX causes memory leak |
175
|
|
|
|
|
|
|
} elsif (/^END(MDL)?/) { |
176
|
3
|
|
|
|
|
64
|
return $mol; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
0
|
|
|
|
|
0
|
return $mol; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub name_is { |
184
|
1
|
|
|
1
|
1
|
180
|
my ($class, $fname) = @_; |
185
|
1
|
|
|
|
|
8
|
$fname =~ /\.pdb$/i; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub file_is { |
189
|
3
|
|
|
3
|
1
|
1007
|
my ($class, $fname) = @_; |
190
|
|
|
|
|
|
|
|
191
|
3
|
50
|
|
|
|
30
|
return 1 if $fname =~ /\.pdb$/i; |
192
|
|
|
|
|
|
|
|
193
|
0
|
0
|
|
|
|
0
|
open F, $fname or croak "Could not open file $fname"; |
194
|
|
|
|
|
|
|
|
195
|
0
|
|
|
|
|
0
|
while (){ |
196
|
0
|
0
|
0
|
|
|
0
|
if (/^ATOM / or /^HETATM/) { |
197
|
0
|
|
|
|
|
0
|
close F; |
198
|
0
|
|
|
|
|
0
|
return 1; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
0
|
return 0; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub write_string { |
206
|
1
|
|
|
1
|
1
|
350
|
my ($class, $mol, %opts) = @_; |
207
|
1
|
|
|
|
|
3
|
my $ret = ''; |
208
|
1
|
50
|
|
|
|
11
|
$ret .= 'TITLE ' . $mol->name . "\n" if defined $mol->name; |
209
|
1
|
|
|
|
|
20
|
$ret .= 'REMARK Generated by ' . __PACKAGE__ . " version $VERSION\n"; |
210
|
1
|
50
|
|
|
|
8
|
if ($mol->isa("Chemistry::MacroMol")) { |
211
|
1
|
|
|
|
|
2
|
my $i = 1; |
212
|
1
|
|
|
|
|
2
|
my $j = 1; |
213
|
1
|
|
|
|
|
7
|
for my $res ($mol->domains) { |
214
|
10
|
|
|
|
|
47
|
my $seq_n = $res->attr("pdb/sequence_number"); |
215
|
10
|
50
|
|
|
|
101
|
$seq_n = $j++ unless defined $seq_n; |
216
|
10
|
|
50
|
|
|
29
|
my $res_name = substr($res->name, 0, 3) || "UNK"; |
217
|
|
|
|
|
|
|
|
218
|
10
|
|
|
|
|
95
|
for my $atom ($res->atoms) { |
219
|
139
|
|
|
|
|
1880
|
my $serial_n = $res->attr("pdb/serial_number"); |
220
|
139
|
|
|
|
|
1366
|
my $ins_code = $res->attr("pdb/insertion_code"); |
221
|
139
|
50
|
|
|
|
1773
|
$serial_n = $i++ unless defined $serial_n; |
222
|
139
|
|
|
|
|
430
|
my @coords = $atom->coords->array; |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# make atom name |
225
|
2
|
|
|
2
|
|
16
|
no warnings 'uninitialized'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
1221
|
|
226
|
139
|
|
|
|
|
2008
|
my $symbol = substr $atom->symbol, 0, 2; |
227
|
139
|
|
33
|
|
|
1082
|
my $atom_name = $atom->name || $symbol; |
228
|
139
|
|
|
|
|
2495
|
$atom_name =~ /^(\dH|$symbol)(.{0,2})/; |
229
|
|
|
|
|
|
|
#print "NAME: '$atom_name' ($1,$2); SYMBOL: '$symbol'\n"; |
230
|
139
|
|
|
|
|
521
|
$atom_name = sprintf "%2s%-2s", $1, $2; |
231
|
139
|
|
|
|
|
1574
|
$ret .= sprintf "ATOM %5d %4s %-3s %4d%1s %8.3f%8.3f%8.3f\n", |
232
|
|
|
|
|
|
|
$serial_n, $atom_name, $res_name, $seq_n, $ins_code, @coords; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
} else { |
236
|
0
|
|
|
|
|
0
|
my $i = 1; |
237
|
0
|
|
|
|
|
0
|
for my $atom ($mol->atoms) { |
238
|
0
|
|
|
|
|
0
|
my @coords = $atom->coords->array; |
239
|
0
|
0
|
|
|
|
0
|
if ($atom->name) { |
240
|
0
|
|
|
|
|
0
|
$ret .= sprintf "ATOM %5d %4s %-3s %4d %8.3f%8.3f%8.3f\n", |
241
|
|
|
|
|
|
|
$i++, $atom->name, "UNK", 1, @coords; |
242
|
|
|
|
|
|
|
} else { |
243
|
0
|
|
|
|
|
0
|
my ($i0, $i1) = ($i % 36, int($i/36)); |
244
|
0
|
0
|
|
|
|
0
|
my $id = ($i1 < 10 ? $i1 : chr(55+$i1)) |
|
|
0
|
|
|
|
|
|
245
|
|
|
|
|
|
|
. ($i0 < 10 ? $i0 : chr(55+$i0)); |
246
|
0
|
0
|
|
|
|
0
|
$id = ' ' if $opts{noid}; |
247
|
0
|
|
|
|
|
0
|
$ret .= sprintf "HETATM%5d %2s%2s UNK 1 %8.3f%8.3f%8.3f\n", |
248
|
|
|
|
|
|
|
$i++, $atom->symbol, $id, @coords; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
} |
252
|
1
|
|
|
|
|
5
|
$ret .= "TER\nEND\n"; |
253
|
1
|
|
|
|
|
49
|
$ret; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
1; |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head1 VERSION |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
0.23 |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head1 SEE ALSO |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
L, L, L, |
265
|
|
|
|
|
|
|
L. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
The PDB format description at |
268
|
|
|
|
|
|
|
L |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
There is another PDB reader in Perl, as part of the BioPerl project: |
271
|
|
|
|
|
|
|
L. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head1 AUTHOR |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Ivan Tubert-Brohman |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head1 COPYRIGHT |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is |
280
|
|
|
|
|
|
|
free software; you can redistribute it and/or modify it under the same terms as |
281
|
|
|
|
|
|
|
Perl itself. |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
|