File Coverage

blib/lib/Chemistry/File/Formula.pm
Criterion Covered Total %
statement 98 99 98.9
branch 31 38 81.5
condition 7 11 63.6
subroutine 12 13 92.3
pod 3 8 37.5
total 151 169 89.3


line stmt bran cond sub pod time code
1             package Chemistry::File::Formula;
2              
3             our $VERSION = '0.38'; # VERSION
4             # $Id$
5              
6 3     3   2201 use strict;
  3         6  
  3         100  
7 3     3   15 use base "Chemistry::File";
  3         6  
  3         1211  
8 3     3   481 use Chemistry::Mol;
  3         6  
  3         147  
9 3     3   17 use Carp;
  3         6  
  3         168  
10 3     3   2160 use Text::Balanced qw(extract_bracketed);
  3         27083  
  3         3758  
11              
12             =head1 NAME
13              
14             Chemistry::File::Formula - Molecular formula reader/formatter
15              
16             =head1 SYNOPSIS
17              
18             use Chemistry::File::Formula;
19              
20             my $mol = Chemistry::Mol->parse("H2O");
21             print $mol->print(format => formula);
22             print $mol->formula; # this is a shorthand for the above
23             print $mol->print(format => formula,
24             formula_format => "%s%d{%d});
25              
26             =cut
27              
28             Chemistry::Mol->register_format('formula');
29              
30             =head1 DESCRIPTION
31              
32             This module converts a molecule object to a string with the formula and back.
33             It registers the 'formula' format with Chemistry::Mol. Besides its obvious
34             use, it is included in the Chemistry::Mol distribution because it is a very
35             simple example of a Chemistry::File derived I/O module.
36              
37             =head2 Writing formulas
38              
39             The format can be specified as a printf-like string with the following control
40             sequences, which are specified with the formula_format parameter to $mol->print
41             or $mol->write.
42              
43             =over
44              
45             =item %s symbol
46              
47             =item %D number of atoms
48              
49             =item %d number of atoms, included only when it is greater than one
50              
51             =item %d{substr} substr is only included when number of atoms is greater than
52             one
53              
54             =item %j{substr} substr is inserted between the formatted string for each
55             element. (The 'j' stands for 'joiner'.) The format should have only one joiner,
56             but its location in the format string doesn't matter.
57              
58             =item %% a percent sign
59              
60             =back
61              
62             If no format is specified, the default is "%s%d". Some examples follow. Let's
63             assume that the formula is C2H6O, as it would be formatted by default.
64              
65             =over
66              
67             =item C<< %s%D >>
68              
69             Like the default, but include explicit indices for all atoms.
70             The formula would be formatted as "C2H6O1"
71              
72             =item C<< %s%d{EsubE%dE/subE} >>
73              
74             HTML format. The output would be
75             "CEsubE2E/subEHEsubE6E/subEO".
76              
77             =item C<< %D %s%j{, } >>
78              
79             Use a comma followed by a space as a joiner. The output would be
80             "2 C, 6 H, 1 O".
81              
82             =back
83              
84             =head3 Symbol Sort Order
85              
86             The elements in the formula are sorted by default in the "Hill order", which
87             means that:
88              
89             1) if the formula contains carbon, C goes first, followed by H,
90             and the rest of the symbols in alphabetical order. For example, "CH2BrF".
91              
92             2) if there is no carbon, all the symbols (including H) are listed
93             alphabetically. For example, "BrH".
94              
95             It is possible to supply a custom sorting subroutine with the 'formula_sort'
96             option. It expects a subroutine reference that takes a hash reference
97             describing the formula (similar to what is returned by parse_formula, discussed
98             below), and that returns a list of symbols in the desired order.
99              
100             For example, this will sort the symbols in reverse asciibetical order:
101              
102             my $formula = $mol->print(
103             format => 'formula',
104             formula_sort => sub {
105             my $formula_hash = shift;
106             return reverse sort keys %$formula_hash;
107             }
108             );
109              
110             =head2 Parsing Formulas
111              
112             Formulas can also be parsed back into Chemistry::Mol objects.
113             The formula may have parentheses and square or triangular brackets, and
114             it may have the following abbreviations:
115              
116             Me => '(CH3)',
117             Et => '(CH3CH2)',
118             Bu => '(C4H9)',
119             Bn => '(C6H5CH2)',
120             Cp => '(C5H5)',
121             Ph => '(C6H5)',
122             Bz => '(C6H5CO)',
123              
124             The formula may also be preceded by a number, which multiplies the whole
125             formula. Some examples of valid formulas:
126              
127             =over
128              
129             Formula Equivalent to
130             --------------------------------------------------------------
131             CH3(CH2)3CH3 C5H12
132             C6H3Me3 C9H12
133             2Cu[NH3]4(NO3)2 Cu2H24N12O12
134             2C(C[C5]4)3 C152
135             2C(C(C(C)5)4)3 C152
136             C 1 0 H 2 2 C10H22 (whitespace is completely ignored)
137              
138             =back
139              
140             When a formula is parsed, a molecule object is created which consists of
141             the set of the atoms in the formula (no bonds or coordinates, of course).
142             The atoms are created in alphabetical order, so the molecule object for C2H5Br
143             would have the atoms in the following sequence: Br, C, C, H, H, H, H, H.
144              
145             If you don't want to create a molecule object, but would rather have a simple
146             hash with the number of atoms for each element, use the C
147             method:
148              
149             my %formula = Chemistry::File::Formula->parse_formula("C2H6O");
150             use Data::Dumper;
151             print Dumper \%formula;
152              
153             which prints something like
154              
155             $VAR1 = {
156             'H' => 6,
157             'O' => 1,
158             'C' => 2
159             };
160              
161             The C method is called internally by the C method.
162              
163             =head3 Non-integer numbers in formulas
164              
165             The C method can also accept formulas that contain
166             floating-point numbers, such as H1.5N0.5. The numbers must be positive, and
167             numbers smaller than one should include a leading zero (e.g., 0.9, not .9).
168              
169             When formulas with non-integer numbers of atoms are turned into molecule
170             objects as described in the previous section, the number of atoms is always
171             B. For example, H1.5N0.5 will produce a molecule object with two
172             hydrogen atoms and one nitrogen atom.
173              
174             There is currently no way of I formulas with non-integer numbers;
175             perhaps a future version will include an "occupancy" property for atoms that
176             will result in non-integer formulas.
177              
178             =cut
179              
180             sub parse_string {
181 14     14 1 37 my ($self, $string, %opts) = @_;
182 14   50     33 my $mol_class = $opts{mol_class} || "Chemistry::Mol";
183 14   50     52 my $atom_class = $opts{atom_class} || "Chemistry::Atom";
184 14   50     39 my $bond_class = $opts{bond_class} || "Chemistry::Bond";
185              
186 14         36 my $mol = $mol_class->new;
187 14         31 my %formula = $self->parse_formula($string);
188 14         49 for my $sym (sort keys %formula) {
189 31         65 for (my $i = 0; $i < $formula{$sym}; ++$i) {
190 516         935 $mol->add_atom($atom_class->new(symbol => $sym));
191             }
192             }
193 14         151 return $mol;
194             }
195              
196             sub write_string {
197 16     16 1 36 my ($self, $mol, %opts) = @_;
198 16         21 my @formula_parts;
199              
200 16   100     65 my $format = $opts{formula_format} || "%s%d"; # default format
201 16         43 my $fh = $mol->formula_hash;
202 16         58 $format =~ s/%%/\\%/g; # escape %% with a \
203 16         22 my $joiner = "";
204 16 50       35 $joiner = $1 if $format =~ s/(?
205              
206 16         20 my @symbols;
207 16 100       32 if ($opts{formula_sort}) {
208 1         4 @symbols = $opts{formula_sort}($fh);
209             } else {
210 15         30 @symbols = $self->sort_symbols($fh);
211             }
212              
213 16         36 for my $sym (@symbols) {
214 42         47 my $s = $format;
215 42         56 my $n = $fh->{$sym};
216 42         105 $s =~ s/(?
217 42         55 $s =~ s/(?
218 42 100       56 $s =~ s/(? 1 ? $1 : ''/eg; # %d{}
  3         8  
219 42 100       97 $s =~ s/(? 1 ? $n : ''/eg; # %d
  40         91  
220 42         63 $s =~ s/\\(.)/$1/g; # other \ escapes
221 42         76 push @formula_parts, $s;
222             }
223 16         103 return join($joiner, @formula_parts);
224             }
225              
226             sub sort_symbols {
227 15     15 0 27 my ($self, $formula_hash) = @_;
228 15         56 my @symbols = keys %$formula_hash;
229 15 100       28 if ($formula_hash->{C}) {
230             # C and H first, followed by alphabetical order
231 13         130 s/^([CH])$/\0$1/ for @symbols;
232 13         44 @symbols = sort @symbols;
233 13         62 s/^\0([CH])$/$1/ for @symbols;
234 13         52 return @symbols;
235             } else {
236             # simple alphabetical order
237 2         7 return sort @symbols;
238             }
239             }
240              
241             sub file_is {
242 0     0 1 0 return 0; # no files are identified automatically as having this format
243             }
244              
245             ### Code derived from formula.pl by Brent Gregersen follows
246              
247             my %macros = (
248             Me => '(CH3)',
249             Et => '(CH3CH2)',
250             Bu => '(C4H9)',
251             Bn => '(C6H5CH2)',
252             Cp => '(C5H5)',
253             Ph => '(C6H5)',
254             Bz => '(C6H5CO)',
255             # Ac is an element
256             # Pr is an element
257             );
258              
259              
260             sub parse_formula {
261 16     16 0 998 my ($self, $formula) = @_;
262 16         20 my (%elements);
263              
264             #check balancing
265 16 50       30 return %elements if (!ParensBalanced($formula));
266              
267             # replace other grouping with normal parens
268 16         32 $formula =~ tr/<>{}[]/()()()/;
269              
270             # get rid of any spaces
271 16         34 $formula =~ s/\s+//g;
272              
273             # perform macro expansion
274 16         43 foreach (keys(%macros)) {
275 112         630 $formula =~ s/$_/$macros{$_}/g;
276             }
277              
278             # determine initial compound coeficent
279 16 100       78 my $coef = ($formula =~ s/^(\d+\.?\d*)//) ? $1 : 1.0;
280              
281             # recursively process rest of formula
282 16         35 return internal_formula_parser($formula, $coef, %elements);
283             }
284              
285             sub internal_formula_parser {
286 35     35 0 77 my ($formula, $coef, %form) = @_;
287 35         38 my $tmp_coef;
288              
289 35         82 my ($extract, $remainder, $prefix) =
290             extract_bracketed($formula, '()', '[^(]*');
291              
292 35 100 66     2808 if (defined($extract) and $extract ne '') {
293 15         64 $extract =~ s/^\((.*)\)$/$1/;
294 15 100       57 if ($remainder =~ s/^(\d+\.?\d*)(.*)$/$2/) {
295 13         26 $tmp_coef = $1 * $coef;
296             } else {
297 2         2 $tmp_coef = $coef;
298             }
299              
300             # get formula of prefix ( it has no parens)
301 15 100       37 %form = add_formula_strings($prefix, $coef, %form) if ($prefix ne '');
302              
303             # check remainder for more parens
304 15 100       33 %form = internal_formula_parser($remainder, $coef, %form)
305             if ($remainder ne '');
306              
307             # check extract for more parens
308 15         32 %form =
309             internal_formula_parser($extract, $tmp_coef, %form);
310             ## we already know this is ne ''
311             } else { # get formula of complete string
312 20 50       59 %form = add_formula_strings($remainder, $coef, %form)
313             if ($remainder ne '');
314             }
315 35         99 return %form;
316             }
317              
318             sub add_formula_strings {
319 30     30 0 68 my ($formula, $coef, %elements) = @_;
320              
321             # print "Getting Formula of $formula\n";
322 30 50       140 $formula =~ /^(?:([A-Z][a-z]*)(\d+\.?\d*)?)+$/o # XXX new
323             or croak "Invalid Portion of Formula $formula";
324 30         90 while ($formula =~ m/([A-Z][a-z]*)(\d+\.?\d*)?/go) { # XXX new
325 55         112 my ($elm, $count) = ($1, $2);
326 55 100       104 $count = 1 unless defined $count;
327 55 100       90 if (defined $elements{$elm}) {
328 18         47 $elements{$elm} += $count * $coef;
329             } else {
330 37         139 $elements{$elm} = $count * $coef;
331             }
332             }
333 30         98 return %elements;
334             }
335              
336             sub ParensBalanced {
337 16     16 0 25 my ($form) = @_;
338 16         23 my @stack = ();
339 16         45 my %pairs = (
340             '<' => '>',
341             '{' => '}',
342             '[' => ']',
343             '(' => ')'
344             );
345              
346 16         62 while ($form =~ m/([<>(){}\]\[])/go) {
347 24         37 my $current = $1;
348 24 100       45 if ($current =~ /[<({\[]/) {
349 12         16 push(@stack, $current);
350 12         27 next;
351             }
352 12 50       16 return 0 if (scalar(@stack) == 0);
353 12 50       39 return 0 if ($current ne $pairs{ pop @stack});
354             }
355 16 50       59 return @stack ? 0 : 1;
356             }
357              
358             1;
359              
360             =head1 SOURCE CODE REPOSITORY
361              
362             L
363              
364             =head1 SEE ALSO
365              
366             L, L
367              
368             For discussion about Hill order, just search the web for C
369             order">. The original reference is I B<1900>, I<22>,
370             478-494. L.
371              
372             =head1 AUTHOR
373              
374             Ivan Tubert-Brohman .
375              
376             Formula parsing code contributed by Brent Gregersen.
377              
378             Patch for non-integer formulas by Daniel Scott.
379              
380             =head1 COPYRIGHT
381              
382             Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
383             free software; you can redistribute it and/or modify it under the same terms as
384             Perl itself.
385              
386             =cut
387