line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Chemistry::Canonicalize; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# $Id: Canonicalize.pm,v 1.2 2009/05/10 20:16:22 itubert Exp $ |
4
|
|
|
|
|
|
|
$VERSION = '0.11'; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
44197
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
102
|
|
7
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
1570
|
use Math::BigInt; |
|
1
|
|
|
|
|
46656
|
|
|
1
|
|
|
|
|
6
|
|
10
|
1
|
|
|
1
|
|
41910
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
103
|
|
11
|
1
|
|
|
1
|
|
6
|
use base 'Exporter'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
879
|
|
12
|
|
|
|
|
|
|
our @EXPORT_OK = qw(canonicalize); |
13
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( all => \@EXPORT_OK ); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Chemistry::Canonicalize - Number the atoms in a molecule in a unique way |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Chemistry::Canonicalize ':all'; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# $mol is a Chemistry::Mol object |
25
|
|
|
|
|
|
|
canonicalize($mol); |
26
|
|
|
|
|
|
|
print "The canonical number for atom 1 is: ", |
27
|
|
|
|
|
|
|
$mol->atoms(1)->attr("canon/class"); |
28
|
|
|
|
|
|
|
print "The symmetry class for for atom 1 is: ", |
29
|
|
|
|
|
|
|
$mol->atoms(1)->attr("canon/symmetry_class"); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 DESCRIPTION |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
This module provides functions for "canonicalizing" a molecular structure; that |
34
|
|
|
|
|
|
|
is, to number the atoms in a unique way regardless of the input order. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
The canonicalization algorithm is based on: Weininger, et. al., J. Chem. Inf. |
37
|
|
|
|
|
|
|
Comp. Sci. 29[2], 97-101 (1989) |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This module is part of the PerlMol project, L. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head1 ATOM ATTRIBUTES |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
During the canonicalization process, the following attributes are set on each |
44
|
|
|
|
|
|
|
atom: |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item canon/class |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
The unique canonical number; it is an integer going from 1 to the number of |
51
|
|
|
|
|
|
|
atoms. |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=item canon/symmetry_class |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
The symmetry class number. Atoms that have the same symmetry class are |
56
|
|
|
|
|
|
|
considered to be topologicaly equivalent. For example, the two methyl carbons |
57
|
|
|
|
|
|
|
on 2-propanol would have the same symmetry class. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=back |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=head1 FUNCTIONS |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
These functions may be exported, although nothing is exported by default. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=over |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=cut |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my @PRIMES = qw( 1 |
70
|
|
|
|
|
|
|
2 3 5 7 11 13 17 19 23 29 |
71
|
|
|
|
|
|
|
31 37 41 43 47 53 59 61 67 71 |
72
|
|
|
|
|
|
|
73 79 83 89 97 101 103 107 109 113 |
73
|
|
|
|
|
|
|
127 131 137 139 149 151 157 163 167 173 |
74
|
|
|
|
|
|
|
179 181 191 193 197 199 211 223 227 229 |
75
|
|
|
|
|
|
|
233 239 241 251 257 263 269 271 277 281 |
76
|
|
|
|
|
|
|
283 293 307 311 313 317 331 337 347 349 |
77
|
|
|
|
|
|
|
353 359 367 373 379 383 389 397 401 409 |
78
|
|
|
|
|
|
|
419 421 431 433 439 443 449 457 461 463 |
79
|
|
|
|
|
|
|
467 479 487 491 499 503 509 521 523 541 |
80
|
|
|
|
|
|
|
547 557 563 569 571 577 587 593 599 601 |
81
|
|
|
|
|
|
|
607 613 617 619 631 641 643 647 653 659 |
82
|
|
|
|
|
|
|
661 673 677 683 691 701 709 719 727 733 |
83
|
|
|
|
|
|
|
739 743 751 757 761 769 773 787 797 809 |
84
|
|
|
|
|
|
|
811 821 823 827 829 839 853 857 859 863 |
85
|
|
|
|
|
|
|
877 881 883 887 907 911 919 929 937 941 |
86
|
|
|
|
|
|
|
947 953 967 971 977 983 991 997 1009 1013 |
87
|
|
|
|
|
|
|
1019 1021 1031 1033 1039 1049 1051 1061 1063 1069 |
88
|
|
|
|
|
|
|
1087 1091 1093 1097 1103 1109 1117 1123 1129 1151 |
89
|
|
|
|
|
|
|
1153 1163 1171 1181 1187 1193 1201 1213 1217 1223 |
90
|
|
|
|
|
|
|
1229 1231 1237 1249 1259 1277 1279 1283 1289 1291 |
91
|
|
|
|
|
|
|
1297 1301 1303 1307 1319 1321 1327 1361 1367 1373 |
92
|
|
|
|
|
|
|
1381 1399 1409 1423 1427 1429 1433 1439 1447 1451 |
93
|
|
|
|
|
|
|
1453 1459 1471 1481 1483 1487 1489 1493 1499 1511 |
94
|
|
|
|
|
|
|
1523 1531 1543 1549 1553 1559 1567 1571 1579 1583 |
95
|
|
|
|
|
|
|
1597 1601 1607 1609 1613 1619 1621 1627 1637 1657 |
96
|
|
|
|
|
|
|
1663 1667 1669 1693 1697 1699 1709 1721 1723 1733 |
97
|
|
|
|
|
|
|
1741 1747 1753 1759 1777 1783 1787 1789 1801 1811 |
98
|
|
|
|
|
|
|
1823 1831 1847 1861 1867 1871 1873 1877 1879 1889 |
99
|
|
|
|
|
|
|
1901 1907 1913 1931 1933 1949 1951 1973 1979 1987 |
100
|
|
|
|
|
|
|
1993 1997 1999 2003 2011 2017 2027 2029 2039 2053 |
101
|
|
|
|
|
|
|
2063 2069 2081 2083 2087 2089 2099 2111 2113 2129 |
102
|
|
|
|
|
|
|
2131 2137 2141 2143 2153 2161 2179 2203 2207 2213 |
103
|
|
|
|
|
|
|
2221 2237 2239 2243 2251 2267 2269 2273 2281 2287 |
104
|
|
|
|
|
|
|
2293 2297 2309 2311 2333 2339 2341 2347 2351 2357 |
105
|
|
|
|
|
|
|
2371 2377 2381 2383 2389 2393 2399 2411 2417 2423 |
106
|
|
|
|
|
|
|
2437 2441 2447 2459 2467 2473 2477 2503 2521 2531 |
107
|
|
|
|
|
|
|
2539 2543 2549 2551 2557 2579 2591 2593 2609 2617 |
108
|
|
|
|
|
|
|
2621 2633 2647 2657 2659 2663 2671 2677 2683 2687 |
109
|
|
|
|
|
|
|
2689 2693 2699 2707 2711 2713 2719 2729 2731 2741 |
110
|
|
|
|
|
|
|
2749 2753 2767 2777 2789 2791 2797 2801 2803 2819 |
111
|
|
|
|
|
|
|
2833 2837 2843 2851 2857 2861 2879 2887 2897 2903 |
112
|
|
|
|
|
|
|
2909 2917 2927 2939 2953 2957 2963 2969 2971 2999 |
113
|
|
|
|
|
|
|
); |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=item canonicalize($mol, %opts) |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Canonicalizes the molecule. It adds the canon/class and canon/symmetry class to |
118
|
|
|
|
|
|
|
every atom, as discussed above. This function may take the following options: |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=over |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item sort |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
If true, sort the atoms in the molecule in ascending canonical number order. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item invariants |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
This should be a subroutine reference that takes an atom and returns a number. |
129
|
|
|
|
|
|
|
These number should be based on the topological invariant properties of the |
130
|
|
|
|
|
|
|
atom, such as symbol, charge, number of bonds, etc. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=back |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=cut |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub canonicalize { |
137
|
0
|
|
|
0
|
1
|
|
my ($mol, %opts) = @_; |
138
|
|
|
|
|
|
|
|
139
|
0
|
0
|
|
|
|
|
if ($mol->atoms > @PRIMES - 1) { |
140
|
0
|
|
|
|
|
|
croak "maximum number of atoms exceeded for canonicalization\n"; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
0
|
|
0
|
|
|
|
my $invariants_sub = $opts{invariants} || \&atom_invariants; |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# set up initial classes |
146
|
0
|
|
|
|
|
|
for my $atom ($mol->atoms) { |
147
|
0
|
|
|
|
|
|
$atom->attr("canon/class", $invariants_sub->($atom)); |
148
|
0
|
|
|
|
|
|
$atom->attr("canon/prev_class", 1); |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
#printf "$_: %s\n", $_->attr("canon/class") for $mol->atoms; |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# run one canonicalization step |
153
|
0
|
|
|
|
|
|
my $atoms; |
154
|
|
|
|
|
|
|
my $n_classes; |
155
|
0
|
|
|
|
|
|
($atoms, $n_classes) = rank_classes($mol); |
156
|
0
|
|
|
|
|
|
($atoms, $n_classes) = canon($mol, $n_classes); |
157
|
0
|
|
|
|
|
|
my $n_atom = $mol->atoms; |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# atoms with the same class are topologically symmetric |
160
|
0
|
|
|
|
|
|
for my $atom ($mol->atoms) { |
161
|
0
|
|
|
|
|
|
$atom->attr("canon/symmetry_class", $atom->attr("canon/class")); |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
#printf "$_: %s\n", $_->attr("canon/class") for $mol->atoms; |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# break symmetry to get a canonical numbering |
166
|
0
|
|
|
|
|
|
while ($n_classes < $n_atom) { |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# multiply all classes by 2 |
169
|
0
|
|
|
|
|
|
for my $atom (@$atoms) { |
170
|
0
|
|
|
|
|
|
my $class = $atom->attr("canon/class"); |
171
|
0
|
|
|
|
|
|
$atom->attr("canon/class", $class * 2); |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# break first tie |
175
|
0
|
|
|
|
|
|
my $last_class = -1; |
176
|
0
|
|
|
|
|
|
my $last_atom; |
177
|
0
|
|
|
|
|
|
for my $atom (@$atoms) { |
178
|
0
|
|
|
|
|
|
my $class = $atom->attr("canon/class"); |
179
|
0
|
0
|
|
|
|
|
if ($class == $last_class) { # tie |
180
|
|
|
|
|
|
|
#print "breaking tie for $last_atom\n"; |
181
|
0
|
|
|
|
|
|
$last_atom->attr("canon/class", $class - 1); |
182
|
0
|
|
|
|
|
|
last; |
183
|
|
|
|
|
|
|
} |
184
|
0
|
|
|
|
|
|
$last_class = $class; |
185
|
0
|
|
|
|
|
|
$last_atom = $atom; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
#printf "$_: %s\n", $_->attr("canon/class") for $mol->atoms; |
188
|
|
|
|
|
|
|
#print "---\n"; |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# run another canonicalization step |
191
|
0
|
|
|
|
|
|
($atoms, $n_classes) = canon($mol, $n_classes); |
192
|
|
|
|
|
|
|
#printf "$_: %s\n", $_->attr("canon/class") for $mol->atoms; |
193
|
|
|
|
|
|
|
} |
194
|
0
|
0
|
|
|
|
|
if ($opts{'sort'}) { |
195
|
|
|
|
|
|
|
$mol->sort_atoms( |
196
|
0
|
|
|
0
|
|
|
sub { $_[0]->attr("canon/class") <=> $_[1]->attr("canon/class") } |
197
|
0
|
|
|
|
|
|
); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
# clean up temporary classes |
200
|
0
|
|
|
|
|
|
$_->del_attr("canon/new_class") for $mol->atoms; |
201
|
0
|
|
|
|
|
|
$n_classes; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub atom_invariants { |
205
|
1
|
|
|
1
|
|
7
|
no warnings 'uninitialized'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
604
|
|
206
|
0
|
|
|
0
|
0
|
|
my ($atom) = @_; |
207
|
0
|
|
|
|
|
|
my $n_bonds = $atom->bonds; |
208
|
0
|
|
|
|
|
|
my $valence = 0; |
209
|
|
|
|
|
|
|
#$valence += $_->order for $atom->bonds; |
210
|
0
|
|
|
|
|
|
for ($atom->bonds) { |
211
|
0
|
0
|
|
|
|
|
$valence += $_->order if defined $_ |
212
|
|
|
|
|
|
|
} |
213
|
0
|
|
|
|
|
|
my $Z = $atom->Z; |
214
|
0
|
|
|
|
|
|
my $q = $atom->formal_charge + 5; |
215
|
0
|
|
|
|
|
|
return $n_bonds*10_000 + $valence*1000 + $q*100 + $Z; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# atom class comparison function. Only compare the class if the |
219
|
|
|
|
|
|
|
# previous classes are equal |
220
|
|
|
|
|
|
|
sub _cmp { |
221
|
0
|
0
|
|
0
|
|
|
$a->attr("canon/prev_class") <=> $b->attr("canon/prev_class") |
222
|
|
|
|
|
|
|
or $a->attr("canon/class") <=> $b->attr("canon/class") |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub rank_classes { |
226
|
0
|
|
|
0
|
0
|
|
my ($mol) = @_; |
227
|
0
|
|
|
|
|
|
my @atoms = sort _cmp $mol->atoms; # consider Schwartzian transform? |
228
|
0
|
|
|
|
|
|
my $n = 0; |
229
|
0
|
|
|
|
|
|
local ($a, $b); |
230
|
0
|
|
|
|
|
|
for $b (@atoms) { |
231
|
0
|
0
|
0
|
|
|
|
$n++ if (!$a || _cmp); |
232
|
0
|
|
|
|
|
|
$a = $b; |
233
|
0
|
|
|
|
|
|
$b->attr("canon/new_class", $n); |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
#use diagnostics; |
236
|
0
|
|
|
|
|
|
for my $atom (@atoms) { |
237
|
0
|
|
|
|
|
|
$atom->attr("canon/class", $atom->attr("canon/new_class")); |
238
|
|
|
|
|
|
|
} |
239
|
0
|
|
|
|
|
|
(\@atoms, $n); |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
sub canon { |
243
|
0
|
|
|
0
|
1
|
|
my ($mol, $n) = @_; |
244
|
|
|
|
|
|
|
|
245
|
0
|
|
|
|
|
|
my $old_classes = 0; |
246
|
0
|
|
|
|
|
|
my $n_atom = $mol->atoms; |
247
|
0
|
|
|
|
|
|
my $atoms; |
248
|
0
|
|
0
|
|
|
|
while ($n > $old_classes and $n < $n_atom) { |
249
|
0
|
|
|
|
|
|
$old_classes = $n; |
250
|
|
|
|
|
|
|
# save current classes |
251
|
0
|
|
|
|
|
|
for my $atom ($mol->atoms) { |
252
|
0
|
|
|
|
|
|
$atom->attr("canon/prev_class", $atom->attr("canon/class")); |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# set new class to product of neighbor's primes |
256
|
0
|
|
|
|
|
|
for my $atom ($mol->atoms) { |
257
|
0
|
|
|
|
|
|
my $class = Math::BigInt->new('1'); |
258
|
|
|
|
|
|
|
#my $class = 1; |
259
|
0
|
|
|
|
|
|
for my $neighbor ($atom->neighbors) { |
260
|
0
|
|
|
|
|
|
$class *= $PRIMES[$neighbor->attr("canon/prev_class")]; |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
#print "$class\n"; |
263
|
0
|
|
|
|
|
|
$atom->attr("canon/class", $class); |
264
|
|
|
|
|
|
|
} |
265
|
0
|
|
|
|
|
|
($atoms, $n) = rank_classes($mol); |
266
|
|
|
|
|
|
|
} |
267
|
0
|
|
|
|
|
|
($atoms, $n); |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
1; |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=back |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=head1 VERSION |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
0.11 |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=head1 TO DO |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Add some tests. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head1 CAVEATS |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
Currently there is an atom limit of about 430 atoms. |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
These algorithm is known to fail to discriminate between non-equivalent atoms |
287
|
|
|
|
|
|
|
for some complicated cases. These are usually highly bridged structures |
288
|
|
|
|
|
|
|
explicitly designed to break canonicalization algorithms; I don't know of any |
289
|
|
|
|
|
|
|
"real-looking structure" (meaning something that someone would actually |
290
|
|
|
|
|
|
|
synthesize or find in nature) that fails, but don't say I didn't warn you! |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head1 SEE ALSO |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
L, L, L, |
295
|
|
|
|
|
|
|
L. |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=head1 AUTHOR |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
Ivan Tubert Eitub@cpan.orgE |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=head1 COPYRIGHT |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Copyright (c) 2009 Ivan Tubert. All rights reserved. This program is free |
304
|
|
|
|
|
|
|
software; you can redistribute it and/or modify it under the same terms as |
305
|
|
|
|
|
|
|
Perl itself. |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=cut |
308
|
|
|
|
|
|
|
|