File Coverage

blib/lib/Chemistry/MidasPattern.pm

Criterion	Covered	Total	%
statement	122	145	84.1
branch	45	66	68.1
condition	9	14	64.2
subroutine	18	22	81.8
pod	5	8	62.5
total	199	255	78.0

line	stmt	bran	cond	sub	pod	time	code
1							package Chemistry::MidasPattern;
2
3							$VERSION = '0.11';
4
5							# $Id: MidasPattern.pm,v 1.3 2005/05/16 23:03:37 itubert Exp $
6
7							=head1 NAME
8
9							Chemistry::MidasPattern - Select atoms in macromolecules
10
11							=head1 SYNOPSIS
12
13							use Chemistry::MidasPattern;
14							use Chemistry::File::PDB;
15
16							# read a molecule
17							my $mol = Chemistry::MacroMol->read("test.pdb");
18
19							# define a pattern matching carbons alpha and beta
20							# in all valine residues
21							my $str = ':VAL@CA,CB';
22							my $patt = Chemistry::MidasPattern->new($str);
23
24							# apply the pattern to the molecule
25							$patt->match($mol);
26
27							# extract the results
28							for my $atom ($patt->atom_map) {
29							printf "%s\t%s\n", $atom->attr("pdb/residue_name"), $atom->name;
30							}
31							printf "FOUND %d atoms\n", scalar($patt->atom_map);
32
33							=head1 DESCRIPTION
34
35							This module partially implements a pattern matching engine for selecting atoms
36							in macromolecules by using Midas/Chimera patterns. See
37							L
38							for a detailed description of this language.
39
40							This module shares the same interface as L; to perform a
41							pattern matching operation on a molecule, follow these steps.
42
43							1) Create a pattern object, by parsing a string. Let's assume that the pattern
44							object is stored in $patt and that the molecule is $mol.
45
46							2) Execute the pattern on the molecule by calling $patt->match($mol).
47
48							3) If $patt->match() returns true, extract the "map" that relates the pattern
49							to the molecule by calling $patt->atom_map. These method returns a list of the
50							atoms in the molecule that are matched by the pattern. Thus $patt->atom_map(1)
51							would be analogous to the $1 special variable used for regular expresion
52							matching. The difference between Chemistry::Pattern and Perl regular
53							expressions is that atoms are always captured, and that each atom always uses
54							one "slot".
55
56							=head1 MIDAS ATOM SPECIFICATION LANGUAGE QUICK SUMMARY
57
58							The current implementation does not have the concept of a model, only of
59							residues and atoms.
60
61							What follows is not exactly a formal grammar specification, but it should give
62							a general idea:
63
64							SELECTOR = ((:RESIDUE(.CHAIN)?)(@ATOM))*
65
66							The star here means "zero or more", the question mark means "zero or one", and
67							the parentheses are used to delimit the effect of the star. All other
68							characters are used verbatim.
69
70							RESIDUE can be a name (e.g., LYS), a sequence number (e.g., 108), a range
71							(e.g., 1-10), or a comma-separated list of RESIDUEs (e.g. 1-10,6,LYS).
72
73							ATOM is an atom name, a serial number (this is a non-standard extension) or a
74							comma-separated list of ATOMs.
75
76							Names can have wildcards: * matches the whole name; ? matches one character;
77							and = matches zero or more characters. An @ATOM specification is asociated with
78							the closest preceding residue specification.
79
80							DISTANCE_SELECTOR = SELECTOR za< DISTANCE
81
82							Atoms within a certain distance of those that are matched by a selector can be
83							selected by using the za< operator, where DISTANCE is a number in Angstroms.
84
85							EXPR = ( SELECTOR \| DISTANCE_SELECTOR ) (& (SELECTOR \| DISTANCE_SELECTOR))*
86
87							The result of two or more selectors can be intersected using the & operator.
88
89							=head1 EXAMPLES
90
91							:ARG All arginine atoms
92							:ARG.A All arginine atoms in chain 'A'
93							:ARG@* All arginine atoms
94							@CA All alpha carbons
95							:*@CA All alpha carbons
96							:ARG@CA Arginine alpha carbons
97							:VAL@C= Valine carbons
98							:VAL@C? Valine carbons with two-letter names
99							:ARG,VAL@CA Arginine and valine alpha carbons
100							:ARG:VAL@CA All arginine atoms and valine alpha carbons
101							:ARG@CA,CB Arginine alpha and beta carbons
102							:ARG@CA@CB Arginine alpha and beta carbons
103							:1-10 Atoms in residues 1 to 10
104							:48-* Atoms in residues 11 to the last one
105							:30-40@CA & :ARG Alpha carbons in residues 1-10 which are
106							also arginines.
107							@123 Atom 123
108							@123 za<5.0 Atoms within 5.0 Angstroms of atom 123
109							@123 za>30.0 Atoms not within 30.0 Angstroms of atom 123
110							@CA & @123 za<5.0 Alpha carbons within 5.0 Angstroms of atom 123
111
112							=cut
113
114
115	1			1		6	use strict;
	1					2
	1					39
116	1			1		5	use warnings;
	1					2
	1					32
117	1			1		991	use Chemistry::MacroMol;
	1					83770
	1					76
118	1			1		15	use Carp;
	1					2
	1					77
119	1			1		6	use base "Chemistry::Pattern";
	1					2
	1					1262
120
121							our $DEBUG = 0;
122
123							sub new {
124	18			18	1	42	my ($class, $str) = @_;
125	18		33			227	my $self = bless {
126							atom_map => [],
127							options => {},
128							matched => 0,
129							name => $str,
130							}, ref $class \|\| $class;
131	18					62	$self->parse_midas_pattern($str);
132	18					82	$self;
133							}
134
135							sub atom_map {
136	36			36	1	6391	my ($self) = @_;
137	36					43	@{$self->{atom_map}};
	36					173
138							}
139
140	0			0	1	0	sub bond_map { () }
141
142							sub options {
143	18			18	1	25	my $self = shift;
144	18					64	$self->{options} = {@_};
145	18					45	$self;
146							}
147
148							# residue specifier: name \| range \| number \| *
149							# atom specifier: name \| number \| *
150							# names may have ? and = as wildcards (equivalent to . and .*)
151							sub parse_midas_pattern {
152	18			18	0	30	my ($self, $string) = @_;
153	18					28	my $tree = [];
154	18					236	$self->{tree} = $tree;
155
156	18					84	for my $str (split / +& +/, $string) {
157	20					83	my $node = { type => 'normal' };
158	20	100				94	if ($str =~ s/ +z(.?)([<>])\s*(\d+(\.\d+)?)//) {
159	3		50			21	$node->{type} = 'zone_' . ($1 \|\| 'r');
160	3					10	$node->{zone_op} = $2;
161	3					14	$node->{zone_val} = $3;
162							}
163	20					36	push @$tree, $node;
164	20					36	my $conditions = [];
165	20					39	$node->{conditions} = $conditions;
166
167	20					43	$str =~ s/\s+//g; # ignore whitespace
168	20	100				79	$str = ":*$str" unless $str =~ /^:/; # add implicit "every residue"
169	20					75	my (undef, @residues) = split ':', $str;# =~ /:([^:]+)/g;
170	20					43	for my $residue (@residues) {
171	21					61	my ($res_base, $atom_base) = split '@', $residue, 2;
172							#print "res_base = $res_base\n";
173	21					46	my @res = map { parse_res($_) } split ',', $res_base;
	22					59
174	21					75	my $res_node = { residue => \@res };
175	21					49	push @$conditions, $res_node;
176
177	21		100			90	my (@atoms) = map { parse_atom($_) } split /[\@,]/, $atom_base\|\|'*';
	23					52
178	21					159	$res_node->{atoms} = \@atoms;
179							}
180							}
181							}
182
183							sub parse_res {
184	22			22	0	36	my ($s) = @_;
185	22					43	$s = lc $s;
186	22					29	my $chain_id;
187							my $sub;
188	22	50				68	if ($s =~ s/\.(.)$//) { # chain id
189	0					0	$chain_id = $1;
190							}
191	22	50				188	if ($s =~ /^\d+$/) {
		100
		50
		100
		50
192	0			0		0	$sub = sub {$_->attr('pdb/sequence_number') == $s};
	0					0
193							} elsif ($s =~ /^[a-zA-Z]+$/) {
194	12			600		48	$sub = sub {lc $_->type eq $s};
	600					4318
195							} elsif ($s =~ /^[a-zA-Z=?]+$/) {
196	0					0	$s =~ s/\?/./g;
197	0					0	$s =~ s/=/.*/g;
198	0			0		0	$sub = sub {$_->type =~ qr/$s/i};
	0					0
199							} elsif ($s eq '*') {
200	7			350		34	$sub = sub {1};
	350					737
201							} elsif ($s =~ /^(\d+\|\)-(\d+\|\)/) {
202	3					13	my ($from, $to) = ($1, $2);
203	3	50				11	$from = 0 if $from eq '*';
204	3	100				11	$to = 1e9 if $to eq '*';
205
206							$sub = sub {
207	150	100		150		441	my $n = $_->attr('pdb/sequence_number'); $n >= $from and $n <= $to
	150					1605
208	3					15	};
209							} else {
210	0					0	croak "Invalid residue specification '$s'\n";
211							}
212
213	0	0		0		0	return $chain_id ?
214							sub {lc $_->attr('pdb/chain_id') eq $chain_id and $sub->()}
215	22	50				97	: $sub;
216							}
217
218							sub parse_atom {
219	23			23	0	41	my ($s) = @_;
220	23					43	$s = lc $s;
221	23	100				123	if ($s =~ /^\d+$/) {
		100
		50
222	4			3164		23	return sub {$_->attr('pdb/serial_number') == $s};
	3164					27701
223							} elsif ($s =~ /^[a-zA-Z0-9='"?]+$/) {
224	13					26	$s =~ s/\?/./g;
225	13					21	$s =~ s/=/.*/g;
226	13			3093		64	return sub {$_->name =~ qr/^$s$/i};
	3093					34990
227							} elsif ($s eq '*') {
228	6			387		33	return sub {1};
	387					938
229							} else {
230	0					0	croak "Invalid atom specification '$s'\n";
231							}
232							}
233
234							sub match {
235	18			18	1	89	my ($self, $mol) = @_;
236	18					36	my $tree = $self->{tree};
237	18					25	my @ret;
238	18					25	my $first = 1;
239
240	18	50	33			64	if ($self->{matched} and $self->{matched} eq $mol) {
241	0					0	$self->{matched} = 0;
242	0					0	return 0;
243							}
244
245	18					36	for my $term (@$tree) {
246	20	50				48	print "term\n" if $DEBUG;
247	20					27	my @atoms;
248	20					26	for my $residue_spec (@{$term->{conditions}}) {
	20					50
249	21	50				47	print "\tresidue_spec\n" if $DEBUG;
250	21					25	my @residues;
251
252							# get the matching residues
253	21					28	for my $cond (@{$residue_spec->{residue}}) {
	21					46
254	22	50				52	print "\t\tresidue condition\n" if $DEBUG;
255	22					94	push @residues, grep $cond->(), $mol->domains;
256							}
257	21	50				172	if ($self->{options}{unique}) {
258	0					0	my %seen;
259	0					0	@seen{@residues} = @residues;
260	0					0	@residues = values %seen;
261							}
262	21	50				47	printf "got %d residues\n", scalar @residues if $DEBUG;
263
264							# get the matching atoms
265	21					40	for my $res (@residues) {
266	406					3991	for my $cond (@{$residue_spec->{atoms}}) {
	406					796
267	410	50				895	print "atom_cond\n" if $DEBUG;
268	410					1070	push @atoms, grep $cond->(), $res->atoms;
269							}
270							}
271							}
272	20					263	my %seen;
273	20	50				81	if ($self->{options}{unique}) {
274	0					0	@seen{@atoms} = @atoms;
275	0					0	@atoms = values %seen;
276							}
277	20	100				79	if ($term->{type} eq 'normal') {
		50
		50
278	17					96	@seen{@atoms} = @atoms;
279							} elsif ($term->{type} eq 'zone_r') {
280	0					0	croak "zr operator not implemented\n";
281							} elsif ($term->{type} eq 'zone_a') {
282	3					9	my $d = $term->{zone_val};
283	3					9	my $gt = $term->{zone_op} eq '>';
284							#print "zone_a ret(@ret)\n" unless $first;
285	1632					2437	@atoms = grep {
286	3	100				27	my $found = 0;
287	1632					2449	for my $a (@atoms) {
288	1632	100				8191	$found = 1, last if ($a->coords - $_->coords)->length < $d;
289							}
290	1632		100			142657	$found xor $gt;
291							} $first ? $mol->atoms : @ret;
292							#print "zone_a atoms(@atoms)\n" unless $first;
293	3					277	@seen{@atoms} = @atoms;
294							}
295	20	100				3492	if ($first) {
296							# keep all the atoms from this term
297	18					97	@ret = @atoms;
298	18					34	$first = 0;
299							} else {
300							# keep the intersection of the previous terms with the
301							# atoms from the current term
302	2					17	my @tmp = @ret;
303	2					15	@ret = ();
304	2					5	for (@tmp) {
305	61	100				502	push @ret, $_ if $seen{$_};
306							}
307							}
308	20	50				195	last unless @ret; # don't bother evaluating the other terms
309							}
310							# optionally sort the result
311	18	50				72	if ($self->{options}{sort}) {
312	0					0	@ret = map { $_->[0] } sort { $a->[1] <=> $b->[1] }
	0					0
	0					0
313	0					0	map { [$_, $_->attr("pdb/serial_number")] } @ret;
314							}
315	18					36	$self->{atom_map} = \@ret;
316	18					71	$self->{matched} = $mol;
317							}
318
319							1;
320
321
322
323							=head1 CAVEATS
324
325							If a feature does not appear in any of the examples, it is probably not
326							implemented. For example, the zr< zone specifier, atom properties, and
327							various Chimera extensions.
328
329							The zone specifiers (selection by distance) currently use a brute-force N^2
330							algorithm. You can optimize an & expression by putting the most unlikely
331							selectors first; for example
332
333							:1-20 zr<10.0 & :38 atoms in residue 38 within 10 A of atoms
334							in residues 1-20 (slow)
335
336							:38 & :1-20 zr<10.0 atoms in residue 38 within 10 A of atoms
337							in residues 1-20 (not so slow)
338
339							In the first case, the N^2 search measures the distance between every atom in
340							the molecule and every atom in residues 1-20, and then intersects the results
341							with the atom list of residue 28; the second case only measures the distance
342							between every atom in residue 38 with every atom in residues 1-20. The second
343							way is much, much faster for large systems.
344
345							Some day, a future version may implement a smarter algorithm...
346
347							=head1 VERSION
348
349							0.11
350
351							=head1 SEE ALSO
352
353							L, L
354
355							The PerlMol website L
356
357							=head1 AUTHOR
358
359							Ivan Tubert Eitub@cpan.orgE
360
361							=head1 COPYRIGHT
362
363							Copyright (c) 2005 Ivan Tubert. All rights reserved. This program is free
364							software; you can redistribute it and/or modify it under the same terms as
365							Perl itself.
366
367							=cut
368
369
370