File Coverage

blib/lib/Chemistry/Pattern.pm
Criterion Covered Total %
statement 227 231 98.2
branch 110 150 73.3
condition 16 24 66.6
subroutine 21 21 100.0
pod 8 14 57.1
total 382 440 86.8


line stmt bran cond sub pod time code
1             package Chemistry::Pattern;
2             $VERSION = '0.27';
3             # $Id: Pattern.pm,v 1.21 2009/05/10 20:56:06 itubert Exp $
4              
5             =head1 NAME
6              
7             Chemistry::Pattern - Chemical substructure pattern matching
8              
9             =head1 SYNOPSIS
10              
11             use Chemistry::Pattern;
12             use Chemistry::Mol;
13             use Chemistry::File::SMILES;
14              
15             # Create a pattern and a molecule from SMILES strings
16             my $mol_str = "C1CCCC1C(Cl)=O";
17             my $patt_str = "C(=O)Cl";
18             my $mol = Chemistry::Mol->parse($mol_str, format => 'smiles');
19             my $patt = Chemistry::Pattern->parse($patt_str, format => 'smiles');
20              
21             # try to match the pattern
22             while ($patt->match($mol)) {
23             @matched_atoms = $patt->atom_map;
24             print "Matched: (@matched_atoms)\n";
25             # should print something like "Matched: (a6 a8 a7)"
26             }
27              
28             =head1 DESCRIPTION
29              
30             This module implements basic pattern matching for molecules.
31             The Chemistry::Pattern class is a subclass of Chemistry::Mol, so patterns
32             have all the properties of molecules and can come from reading the same
33             file formats. Of course there are certain formats (such as SMARTS)
34             that are exclusively used to describe patterns.
35              
36             To perform a pattern matching operation on a molecule, follow these steps.
37              
38             1) Create a pattern object, either by parsing a file or string, or by adding
39             atoms and bonds by hand by using Chemistry::Mol methods. Note that atoms and
40             bonds in a pattern should be Chemistry::Pattern::Atom and
41             Chemistry::Patern::Bond objects. Let's assume that the pattern object is
42             stored in $patt and that the molecule is $mol.
43              
44             2) Execute the pattern on the molecule by calling $patt->match($mol).
45              
46             3) If $patt->match() returns true, extract the "map" that relates the pattern to
47             the molecule by calling $patt->atom_map or $patt->bond_map. These methods
48             return a list of the atoms or bonds in the molecule that are matched by the
49             corresponding atoms in the pattern. Thus $patt->atom_map(1) would be analogous
50             to the $1 special variable used for regular expresion matching. The difference
51             between Chemistry::Pattern and Perl regular expressions is that atoms and bonds
52             are always captured.
53              
54             4) If more than one match for the molecule is desired, repeat from step (2)
55             until match() returns false.
56              
57             =cut
58              
59 1     1   217653 use 5.006;
  1         5  
  1         45  
60 1     1   5 use strict;
  1         2  
  1         34  
61 1     1   5 use Carp;
  1         8  
  1         75  
62 1     1   5 use base qw(Chemistry::Mol);
  1         2  
  1         96  
63 1     1   800 use Chemistry::Pattern::Atom;
  1         2  
  1         43  
64 1     1   707 use Chemistry::Pattern::Bond;
  1         3  
  1         4435  
65              
66             =head1 METHODS
67              
68             =over 4
69              
70             =item Chemistry::Pattern->new(name => value, ...)
71              
72             Create a new empty pattern. This is just like the Chemistry::Mol constructor,
73             with one additional option: "options", which expects a hash reference (the
74             options themselves are described under the options() method).
75              
76             =cut
77              
78             sub new {
79 17     17 1 24347 my $class = shift;
80 17         128 my $self = $class->SUPER::new(
81             options => { overlap => 1, permute => 0},
82             @_
83             );
84 17         127 $self->reset;
85 17         66 $self;
86             }
87              
88             =item $pattern->options(option => value,...)
89              
90             Available options:
91              
92             =over
93              
94             =item overlap
95              
96             If true, matches may overlap. For example, the CC pattern could match twice
97             on propane if this option is true, but only once if it is false. This option
98             is true by default.
99              
100             =item permute
101              
102             Sometimes there is more than one way of matching the same set of pattern atoms
103             on the same set of molecule atoms. If true, return these "redundant" matches.
104             For example, the CC pattern could match ethane with two different permutations
105             (forwards and backwards). This option is false by default.
106              
107             =back
108              
109             =cut
110              
111             sub options {
112 34     34 1 33456 my $self = shift;
113 34 100       85 if (@_ == 1) {
114 17 50       20 $self->{options} = {%{$self->{options}||{}}, %{$_[0]}};
  17         271  
  17         108  
115             } else {
116 17         22 $self->{options} = {%{$self->{options}}, @_};
  17         172  
117             }
118             }
119              
120 45     45 1 5473 sub atom_class { "Chemistry::Pattern::Atom" }
121              
122 30     30 1 2824 sub bond_class { "Chemistry::Pattern::Bond" }
123              
124             our $DEBUG = 0;
125              
126             =item $patt->reset
127              
128             Reset the state of the pattern matching object, so that it begins the next
129             match from scratch instead of where it left off after the last one.
130              
131             =cut
132              
133             sub reset {
134 34     34 1 72 my ($self, $mol, %opts) = @_;
135 34 50       78 print "Resetting to ($mol, $opts{atom})\n" if $DEBUG;
136 34         86 $self->{flat} = $self->flatten;
137 34         107 $self->map_to($mol);
138 34 100       251 $self->{pending_atoms} = [$mol ? $mol->atoms : ()];
139 34         330 $self->{already_matched} = {};
140 34         71 $self->{paint_tab} = {};
141 34         61 $self->{anchor} = '';
142 34         102 my $atom = $self->next_atom($opts{atom});
143             #$self->match_local_init($atom) if $atom;
144             }
145              
146             sub already_matched {
147 50     50 0 100 my ($self, @objs) = @_;
148 50         69 my @ids = map {$_->id} @objs;
  172         813  
149 50         373 my $unsorted_key = join " ", @ids;
150 50         94 my $key;
151 50 100       118 if ($self->{options}{permute}) {
152 20         28 $key = $unsorted_key;
153             } else {
154 30         116 $key = join " ", sort @ids;
155             }
156              
157 50 100       123 if ($self->{already_matched}{$key}) {
158 12 50       60 print "already matched $unsorted_key\n" if $DEBUG;
159 12         43 return 1;
160             } else {
161 38         94 $self->{already_matched}{$key} = 1;
162 38 50       74 print "first match of $key\n" if $DEBUG;
163 38         258 return 0;
164             }
165             }
166              
167             sub next_atom {
168 89     89 0 141 my ($self, $atom) = @_;
169              
170 89 50       179 print "next_atom\n" if $DEBUG;
171 89 100       186 return if $self->{anchor};
172 88 100       141 if ($atom) {
  87 100       218  
173 1         7 $self->{anchor} = $atom;
174             } elsif (@{$self->{pending_atoms}}) {
175 54         61 $atom = shift @{$self->{pending_atoms}};
  54         95  
176 54 50       119 print "\tatom $atom\n" if $DEBUG;
177             }
178            
179 88         133 $self->{next_atom} = $atom;
180 88         232 $atom;
181             }
182              
183             Chemistry::Obj::accessor "map_to";
184              
185             =item $pattern->atom_map
186              
187             Returns the list of atoms that matched the last time $pattern->match was called.
188              
189             =cut
190              
191             sub atom_map {
192 88     88 1 203 my $self = shift;
193 88         240 my @atoms = map { $_->map_to } $self->atoms(@_);
  210         1304  
194 88 100       712 @atoms == 1 ? $atoms[0] : @atoms;
195             }
196              
197             =item $pattern->bond_map
198              
199             Returns the list of bonds that matched the last time $pattern->match was called.
200              
201             =cut
202              
203             sub bond_map {
204 50     50 1 62 my $self = shift;
205 50         143 my @bonds = map { $_->map_to } $self->bonds(@_);
  52         278  
206 50 100       260 @bonds == 1 ? $bonds[0] : @bonds;
207             }
208              
209             =item $pattern->match($mol, %options)
210              
211             Returns true if the pattern matches the molecule. If called again for the same
212             molecule, continues matching where it left off (in a way similar to global
213             regular expressions under scalar context). When there are no matches left,
214             returns false. To force the match to always start from scratch instead of
215             continuing where it left off, the C option may be used.
216              
217             $pattern->match($mol, atom => $atom)
218              
219             If atom => $atom is given as an option, match will only look for matches that
220             start at $atom (which should be an atom in $mol, of course). This is somewhat
221             analog to anchored regular expressions.
222              
223             To find out which atoms and bonds matched, use the atom_map and bond_map
224             methods.
225              
226             =cut
227              
228             sub match {
229 55     55 1 806 my ($self, $mol, %opts) = @_;
230 55 50       131 print "match $self $mol\n" if $DEBUG;
231 55 50 66     230 if (defined($mol) and $self->map_to ne $mol
      66        
      66        
      66        
232             or $opts{reset}
233             or defined($opts{atom}) and $opts{atom} ne $self->{anchor}
234             ){
235 17         240 $self->reset($mol, %opts);
236             }
237 55         662 my $match = $self->match_next;
238 55 100       137 $self->map_to(undef) unless $match;
239 55 50       170 print "returning match: '$match'\n" if $DEBUG;
240 55         150 $match;
241             }
242              
243             sub match_next {
244 55     55 0 109 my $self = shift;
245 55         62 my $match;
246 55 50       108 print "match_next\n" if $DEBUG;
247              
248 55         55 while (1) {
249 103 100       462 $self->match_local_init($self->{next_atom}) if $self->{next_atom};
250 103         231 $match = $self->match_local_next;
251 103 100       184 if ($match) {
252 50 100       115 if ($self->already_matched($self->atom_map, $self->bond_map)) {
253 12         25 $match = 0, next; # try again
254             } else {
255 38 100       107 $self->next_atom unless ($self->{options}{overlap});
256 38         40 last; # matched!
257             }
258             } else {
259 53 100       103 $self->next_atom or last;
260             }
261             }
262 55         120 $match;
263             }
264              
265              
266             sub match_local_init {
267 55     55 0 333 my ($patt, $atom) = @_;
268 55         141 my $mol = $patt->map_to;
269              
270 55 50       323 print "match_local_init(",$patt->atoms(1),", $atom)\n" if $DEBUG;
271              
272             # clean up
273 55         153 $_->map_to(undef) for ($patt->atoms, $patt->bonds);
274 55 100 66     364 $patt->{paint_tab} = {} if ($mol and $patt->{options}{overlap});
275 55         579 $patt->{stack} = [[0]];
276 55         224 $patt->{backtrack} = 0;
277 55         74 $patt->{next_atom} = 0;
278 55         133 $patt->{current_atom} = $atom;
279             }
280              
281              
282             sub match_local_next {
283 103     103 0 135 my ($patt) = @_;
284 103         145 my $stack = $patt->{stack};
285 103         135 my $paint_tab = $patt->{paint_tab};
286 103         212 my $match = 0;
287 103         122 my $backtrack = $patt->{backtrack};
288 103         120 my $where = $patt->{current_atom};
289 103         124 my $flat = $patt->{flat};
290 103         240 my $mol = $patt->map_to;
291              
292 103 50       543 print "match_local_next($where)\n" if $DEBUG;
293              
294 103         116 while (1) {
295 698         754 my ($idx) = @{$stack->[-1]};
  698         1127  
296 698         920 my $pos = @$stack - 1;
297 698         908 my $what = $flat->[$pos];
298 698 50       1199 if($DEBUG) { print "[@$_] " for @$stack; print "\n" }
  0         0  
  0         0  
299              
300 698 100       4086 if ($backtrack) {
    100          
    100          
    100          
    50          
301 183 50       320 print "\tbacktracking...\n" if $DEBUG;
302 183 100       479 if (@$stack <= 2) {
303 48 50       88 print "\tnowhere to backtrack\n" if $DEBUG;
304 48         66 last;
305             }
306 135         201 my $to;
307 135         181 pop @$stack;
308 135         205 (undef, $what, $to) = @{$stack->[-1]};
  135         301  
309 135         184 $backtrack = 0;
310 135 100       343 if ($what) {
311 111 50       591 print "\tcleaning $what\n" if $DEBUG;
312 111         294 $paint_tab->{$what->map_to} = 0;
313 111         951 $what->map_to(undef);
314             }
315 135 100       456 if ($to) {
316 123 50       701 print "\tcleaning $to\n" if $DEBUG;
317 123         311 $paint_tab->{$to->map_to} = 0;
318 123         1309 $to->map_to(undef);
319             }
320 135         690 next;
321             } elsif ($pos >= @$flat) { # nothing else to match
322 50 50       98 print "\tmatched!\n" if $DEBUG;
323 50         52 $backtrack = 1;
324 50         47 $match = 1;
325 50         69 last;
326             } elsif (@$stack == 1) { # match first atom (anchored)
327 55 100 100     132 if (!$paint_tab->{$where} and $what->test($where)) {
328 50 50       774 print "\tmatched initial atom $what on $where\n" if $DEBUG;
329 50         134 $paint_tab->{$where} = 1;
330 50         427 $what->map_to($where);
331 50         298 push @$stack, [0];
332 50         116 next;
333             } else {
334 5         55 last;
335             }
336             } elsif ($what->isa("Chemistry::Pattern::Atom")) {
337             # match unanchored atom
338 50 100       143 if($idx >= $mol->atoms) {
339 10 50       76 print "\tno more atoms at $mol\n" if $DEBUG;
340 10         18 $backtrack = 1, next;
341             }
342 40         337 my $next = $mol->atoms($idx+1);
343 40         394 $stack->[-1] = [++$idx];
344 40 100       121 if ($paint_tab->{$next}) {
345 16 50       143 print "\t$next already visited\n" if $DEBUG;
346 16         30 next;
347             }
348 24 50       346 if ($what->test($next)) {
349 24 50       268 print "\tmatched unanchored atom $what on $next\n" if $DEBUG;
350 24         64 $paint_tab->{$next} = 1;
351 24         203 $what->map_to($next);
352 24         149 $stack->[-1] = [$idx, undef, $what];
353 24         59 push @$stack, [0];
354             } else {
355 0 0       0 print "\tunanchored atom $what didn't match $next\n" if $DEBUG;
356             }
357 24         52 next;
358             } elsif ($what->isa("Chemistry::Pattern::Bond")) { # match bond
359 360         984 my ($a1, $a2) = $what->atoms;
360 360 50       2793 my ($from, $to) = $a1->map_to ? ($a1, $a2) : ($a2, $a1);
361 360   33     3642 $where = $a1->map_to || $a2->map_to;
362 360 50       3945 print "\tfrom $from to $to\n" if $DEBUG;
363 360         928 my @bn = $where->bonds_neighbors;
364 360 100       5902 if ($idx >= @bn) {
365 125 50       227 print "\tno more bonds at $where\n" if $DEBUG;
366 125         452 $backtrack = 1, next;
367             }
368 235         433 my $nei = $bn[$idx]{to};
369 235         307 my $bond = $bn[$idx]{bond};
370 235         448 $stack->[-1] = [++$idx];
371              
372 235 100       647 if ($paint_tab->{$bond}) {
373 92 50       858 print "\t$bond already visited\n" if $DEBUG;
374 92         304 next;
375             }
376 143 100 66     1286 if ($a1->map_to and $a2->map_to) { # ring closure bond
377 22 50       420 if ($what->test($bond)) {
378 22 50       239 print "\tmatched ring bond $what on $bond\n" if $DEBUG;
379 22 100       58 if ($to->map_to eq $nei) {
380 12 50       109 print "\tring closed $to on $nei\n" if $DEBUG;
381 12         29 $paint_tab->{$bond} = 1;
382 12         101 $what->map_to($bond);
383 12         27 $stack->[-1] = [$idx, $what];
384 12         29 push @$stack, [0];
385 12         52 next;
386             } else {
387 10 50       90 print "\tring didn't close $to on $nei\n" if $DEBUG;
388             }
389             }
390             } else {
391 121 100       1727 if ($paint_tab->{$nei}) {
392 7 50       58 print "\t$nei already visited\n" if $DEBUG;
393 7         24 next;
394             }
395 114 100       1037 if ($what->test($bond)) {
396 106 50       1438 print "\tmatched bond $what on $bond\n" if $DEBUG;
397 106 100       316 if ($to->test($nei)) {
398 101 50       1079 print "\tmatched atom $to on $nei\n" if $DEBUG;
399 101         243 $paint_tab->{$nei} = 1;
400 101         900 $to->map_to($nei);
401 101         662 $paint_tab->{$bond} = 1;
402 101         937 $what->map_to($bond);
403 101         252 $stack->[-1] = [$idx, $what, $to];
404 101         230 push @$stack, [0];
405 101         441 next;
406             } else {
407 5 50       55 print "\tatom $to didn't match $nei\n" if $DEBUG;
408             }
409             } else {
410 8 50       158 print "\tbond $what didn't match $bond\n" if $DEBUG;
411             }
412             }
413 23         146 next;
414             } else {
415 0         0 die "shouldn't be here; what=($what)!";
416             }
417             }
418 103         157 $patt->{backtrack} = $backtrack;
419 103         285 $patt->{stack} = $stack;
420 103         212 $match;
421             }
422              
423             sub flatten {
424 34     34 0 45 my ($patt) = @_;
425 34         50 my $visited = {};
426 34         53 my $list = [];
427 34         107 for my $atom ($patt->atoms) {
428 45 100       474 next if $visited->{$atom};
429 19         162 push @$list, $atom;
430 19         47 _flatten($patt, $atom, $visited, $list);
431             }
432 34 50       305 print "flattened list: (@$list)\n" if $DEBUG;
433 34         103 $list;
434             }
435              
436             sub _flatten {
437 45     45   68 my ($patt, $atom, $visited, $list) = @_;
438              
439 45         108 $visited->{$atom} = 1;
440 45         370 for my $bn ($atom->bonds_neighbors) {
441 60         770 my $nei = $bn->{to};
442 60         76 my $bond = $bn->{bond};
443 60 100       141 next if $visited->{$bond};
444 30         253 $visited->{$bond} = 1;
445 30         202 push @$list, $bond;
446 30 100       135 unless ($visited->{$nei}) {
447 26         208 _flatten($patt, $nei, $visited, $list);
448             }
449             }
450             }
451              
452             1;
453              
454             =back
455              
456             =head1 VERSION
457              
458             0.27
459              
460             =head1 SEE ALSO
461              
462             L, L, L,
463             L, L.
464              
465             The PerlMol website L
466              
467             =head1 AUTHOR
468              
469             Ivan Tubert-Brohman Eitub@cpan.orgE
470              
471             =head1 COPYRIGHT
472              
473             Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
474             free software; you can redistribute it and/or modify it under the same terms as
475             Perl itself.
476              
477             =cut
478