File Coverage

blib/lib/Chemistry/File/Formula.pm
Criterion Covered Total %
statement 98 99 98.9
branch 31 38 81.5
condition 7 11 63.6
subroutine 12 13 92.3
pod 3 8 37.5
total 151 169 89.3


line stmt bran cond sub pod time code
1             package Chemistry::File::Formula;
2              
3             our $VERSION = '0.40'; # VERSION
4             # $Id$
5              
6 3     3   343353 use strict;
  3         7  
  3         153  
7 3     3   22 use base "Chemistry::File";
  3         7  
  3         1853  
8 3     3   764 use Chemistry::Mol;
  3         10  
  3         225  
9 3     3   23 use Carp;
  3         6  
  3         240  
10 3     3   2971 use Text::Balanced qw(extract_bracketed);
  3         44402  
  3         6792  
11              
12             =head1 NAME
13              
14             Chemistry::File::Formula - Molecular formula reader/formatter
15              
16             =head1 SYNOPSIS
17              
18             use Chemistry::File::Formula;
19              
20             my $mol = Chemistry::Mol->parse("H2O");
21             print $mol->print(format => formula);
22             print $mol->formula; # this is a shorthand for the above
23             print $mol->print(format => formula,
24             formula_format => "%s%d{%d});
25              
26             =cut
27              
28             Chemistry::Mol->register_format('formula');
29              
30             =head1 DESCRIPTION
31              
32             This module converts a molecule object to a string with the formula and back.
33             It registers the 'formula' format with Chemistry::Mol. Besides its obvious
34             use, it is included in the Chemistry::Mol distribution because it is a very
35             simple example of a Chemistry::File derived I/O module.
36              
37             =head2 Writing formulas
38              
39             The format can be specified as a printf-like string with the following control
40             sequences, which are specified with the formula_format parameter to $mol->print
41             or $mol->write.
42              
43             =over
44              
45             =item %s symbol
46              
47             =item %D number of atoms
48              
49             =item %d number of atoms, included only when it is greater than one
50              
51             =item %d{substr} substr is only included when number of atoms is greater than
52             one
53              
54             =item %j{substr} substr is inserted between the formatted string for each
55             element. (The 'j' stands for 'joiner'.) The format should have only one joiner,
56             but its location in the format string doesn't matter.
57              
58             =item %% a percent sign
59              
60             =back
61              
62             If no format is specified, the default is "%s%d". Some examples follow. Let's
63             assume that the formula is C2H6O, as it would be formatted by default.
64              
65             =over
66              
67             =item C<< %s%D >>
68              
69             Like the default, but include explicit indices for all atoms.
70             The formula would be formatted as "C2H6O1"
71              
72             =item C<< %s%d{EsubE%dE/subE} >>
73              
74             HTML format. The output would be
75             "CEsubE2E/subEHEsubE6E/subEO".
76              
77             =item C<< %D %s%j{, } >>
78              
79             Use a comma followed by a space as a joiner. The output would be
80             "2 C, 6 H, 1 O".
81              
82             =back
83              
84             =head3 Symbol Sort Order
85              
86             The elements in the formula are sorted by default in the "Hill order", which
87             means that:
88              
89             1) if the formula contains carbon, C goes first, followed by H,
90             and the rest of the symbols in alphabetical order. For example, "CH2BrF".
91              
92             2) if there is no carbon, all the symbols (including H) are listed
93             alphabetically. For example, "BrH".
94              
95             It is possible to supply a custom sorting subroutine with the 'formula_sort'
96             option. It expects a subroutine reference that takes a hash reference
97             describing the formula (similar to what is returned by parse_formula, discussed
98             below), and that returns a list of symbols in the desired order.
99              
100             For example, this will sort the symbols in reverse asciibetical order:
101              
102             my $formula = $mol->print(
103             format => 'formula',
104             formula_sort => sub {
105             my $formula_hash = shift;
106             return reverse sort keys %$formula_hash;
107             }
108             );
109              
110             =head2 Parsing Formulas
111              
112             Formulas can also be parsed back into Chemistry::Mol objects.
113             The formula may have parentheses and square or triangular brackets, and
114             it may have the following abbreviations:
115              
116             Me => '(CH3)',
117             Et => '(CH3CH2)',
118             Bu => '(C4H9)',
119             Bn => '(C6H5CH2)',
120             Cp => '(C5H5)',
121             Ph => '(C6H5)',
122             Bz => '(C6H5CO)',
123              
124             The formula may also be preceded by a number, which multiplies the whole
125             formula. Some examples of valid formulas:
126              
127             =over
128              
129             Formula Equivalent to
130             --------------------------------------------------------------
131             CH3(CH2)3CH3 C5H12
132             C6H3Me3 C9H12
133             2Cu[NH3]4(NO3)2 Cu2H24N12O12
134             2C(C[C5]4)3 C152
135             2C(C(C(C)5)4)3 C152
136             C 1 0 H 2 2 C10H22 (whitespace is completely ignored)
137              
138             =back
139              
140             When a formula is parsed, a molecule object is created which consists of
141             the set of the atoms in the formula (no bonds or coordinates, of course).
142             The atoms are created in alphabetical order, so the molecule object for C2H5Br
143             would have the atoms in the following sequence: Br, C, C, H, H, H, H, H.
144              
145             If you don't want to create a molecule object, but would rather have a simple
146             hash with the number of atoms for each element, use the C
147             method:
148              
149             my %formula = Chemistry::File::Formula->parse_formula("C2H6O");
150             use Data::Dumper;
151             print Dumper \%formula;
152              
153             which prints something like
154              
155             $VAR1 = {
156             'H' => 6,
157             'O' => 1,
158             'C' => 2
159             };
160              
161             The C method is called internally by the C method.
162              
163             =head3 Non-integer numbers in formulas
164              
165             The C method can also accept formulas that contain
166             floating-point numbers, such as H1.5N0.5. The numbers must be positive, and
167             numbers smaller than one should include a leading zero (e.g., 0.9, not .9).
168              
169             When formulas with non-integer numbers of atoms are turned into molecule
170             objects as described in the previous section, the number of atoms is always
171             B. For example, H1.5N0.5 will produce a molecule object with two
172             hydrogen atoms and one nitrogen atom.
173              
174             There is currently no way of I formulas with non-integer numbers;
175             perhaps a future version will include an "occupancy" property for atoms that
176             will result in non-integer formulas.
177              
178             =cut
179              
180             sub parse_string {
181 14     14 1 51 my ($self, $string, %opts) = @_;
182 14   50     40 my $mol_class = $opts{mol_class} || "Chemistry::Mol";
183 14   50     81 my $atom_class = $opts{atom_class} || "Chemistry::Atom";
184 14   50     56 my $bond_class = $opts{bond_class} || "Chemistry::Bond";
185              
186 14         50 my $mol = $mol_class->new;
187 14         41 my %formula = $self->parse_formula($string);
188 14         61 for my $sym (sort keys %formula) {
189 31         90 for (my $i = 0; $i < $formula{$sym}; ++$i) {
190 516         1525 $mol->add_atom($atom_class->new(symbol => $sym));
191             }
192             }
193 14         245 return $mol;
194             }
195              
196             sub write_string {
197 16     16 1 50 my ($self, $mol, %opts) = @_;
198 16         31 my @formula_parts;
199              
200 16   100     73 my $format = $opts{formula_format} || "%s%d"; # default format
201 16         49 my $fh = $mol->formula_hash;
202 16         44 $format =~ s/%%/\\%/g; # escape %% with a \
203 16         27 my $joiner = "";
204 16 50       45 $joiner = $1 if $format =~ s/(?
205              
206 16         28 my @symbols;
207 16 100       39 if ($opts{formula_sort}) {
208 1         5 @symbols = $opts{formula_sort}($fh);
209             } else {
210 15         49 @symbols = $self->sort_symbols($fh);
211             }
212              
213 16         39 for my $sym (@symbols) {
214 42         71 my $s = $format;
215 42         74 my $n = $fh->{$sym};
216 42         199 $s =~ s/(?
217 42         82 $s =~ s/(?
218 42 100       70 $s =~ s/(? 1 ? $1 : ''/eg; # %d{}
  3         13  
219 42 100       139 $s =~ s/(? 1 ? $n : ''/eg; # %d
  40         126  
220 42         87 $s =~ s/\\(.)/$1/g; # other \ escapes
221 42         101 push @formula_parts, $s;
222             }
223 16         164 return join($joiner, @formula_parts);
224             }
225              
226             sub sort_symbols {
227 15     15 0 80 my ($self, $formula_hash) = @_;
228 15         49 my @symbols = keys %$formula_hash;
229 15 100       36 if ($formula_hash->{C}) {
230             # C and H first, followed by alphabetical order
231 13         179 s/^([CH])$/\0$1/ for @symbols;
232 13         52 @symbols = sort @symbols;
233 13         85 s/^\0([CH])$/$1/ for @symbols;
234 13         66 return @symbols;
235             } else {
236             # simple alphabetical order
237 2         10 return sort @symbols;
238             }
239             }
240              
241             sub file_is {
242 0     0 1 0 return 0; # no files are identified automatically as having this format
243             }
244              
245             ### Code derived from formula.pl by Brent Gregersen follows
246              
247             my %macros = (
248             Me => '(CH3)',
249             Et => '(CH3CH2)',
250             Bu => '(C4H9)',
251             Bn => '(C6H5CH2)',
252             Cp => '(C5H5)',
253             Ph => '(C6H5)',
254             Bz => '(C6H5CO)',
255             # Ac is an element
256             # Pr is an element
257             );
258              
259              
260             sub parse_formula {
261 16     16 0 1443 my ($self, $formula) = @_;
262 16         29 my (%elements);
263              
264             #check balancing
265 16 50       37 return %elements if (!ParensBalanced($formula));
266              
267             # replace other grouping with normal parens
268 16         42 $formula =~ tr/<>{}[]/()()()/;
269              
270             # get rid of any spaces
271 16         39 $formula =~ s/\s+//g;
272              
273             # perform macro expansion
274 16         60 foreach (keys(%macros)) {
275 112         1017 $formula =~ s/$_/$macros{$_}/g;
276             }
277              
278             # determine initial compound coeficent
279 16 100       96 my $coef = ($formula =~ s/^(\d+\.?\d*)//) ? $1 : 1.0;
280              
281             # recursively process rest of formula
282 16         51 return internal_formula_parser($formula, $coef, %elements);
283             }
284              
285             sub internal_formula_parser {
286 35     35 0 96 my ($formula, $coef, %form) = @_;
287 35         72 my $tmp_coef;
288              
289 35         114 my ($extract, $remainder, $prefix) =
290             extract_bracketed($formula, '()', '[^(]*');
291              
292 35 100 66     4027 if (defined($extract) and $extract ne '') {
293 15         103 $extract =~ s/^\((.*)\)$/$1/;
294 15 100       137 if ($remainder =~ s/^(\d+\.?\d*)(.*)$/$2/) {
295 13         37 $tmp_coef = $1 * $coef;
296             } else {
297 2         5 $tmp_coef = $coef;
298             }
299              
300             # get formula of prefix ( it has no parens)
301 15 100       58 %form = add_formula_strings($prefix, $coef, %form) if ($prefix ne '');
302              
303             # check remainder for more parens
304 15 100       48 %form = internal_formula_parser($remainder, $coef, %form)
305             if ($remainder ne '');
306              
307             # check extract for more parens
308 15         52 %form =
309             internal_formula_parser($extract, $tmp_coef, %form);
310             ## we already know this is ne ''
311             } else { # get formula of complete string
312 20 50       86 %form = add_formula_strings($remainder, $coef, %form)
313             if ($remainder ne '');
314             }
315 35         144 return %form;
316             }
317              
318             sub add_formula_strings {
319 30     30 0 80 my ($formula, $coef, %elements) = @_;
320              
321             # print "Getting Formula of $formula\n";
322 30 50       268 $formula =~ /^(?:([A-Z][a-z]*)(\d+\.?\d*)?)+$/o # XXX new
323             or croak "Invalid Portion of Formula $formula";
324 30         184 while ($formula =~ m/([A-Z][a-z]*)(\d+\.?\d*)?/go) { # XXX new
325 55         158 my ($elm, $count) = ($1, $2);
326 55 100       115 $count = 1 unless defined $count;
327 55 100       121 if (defined $elements{$elm}) {
328 18         63 $elements{$elm} += $count * $coef;
329             } else {
330 37         172 $elements{$elm} = $count * $coef;
331             }
332             }
333 30         177 return %elements;
334             }
335              
336             sub ParensBalanced {
337 16     16 0 32 my ($form) = @_;
338 16         25 my @stack = ();
339 16         91 my %pairs = (
340             '<' => '>',
341             '{' => '}',
342             '[' => ']',
343             '(' => ')'
344             );
345              
346 16         85 while ($form =~ m/([<>(){}\]\[])/go) {
347 24         56 my $current = $1;
348 24 100       72 if ($current =~ /[<({\[]/) {
349 12         38 push(@stack, $current);
350 12         40 next;
351             }
352 12 50       29 return 0 if (scalar(@stack) == 0);
353 12 50       58 return 0 if ($current ne $pairs{ pop @stack});
354             }
355 16 50       74 return @stack ? 0 : 1;
356             }
357              
358             1;
359              
360             =head1 SOURCE CODE REPOSITORY
361              
362             L
363              
364             =head1 SEE ALSO
365              
366             L, L
367              
368             For discussion about Hill order, just search the web for C
369             order">. The original reference is I B<1900>, I<22>,
370             478-494. L.
371              
372             =head1 AUTHOR
373              
374             Ivan Tubert-Brohman .
375              
376             Formula parsing code contributed by Brent Gregersen.
377              
378             Patch for non-integer formulas by Daniel Scott.
379              
380             =head1 COPYRIGHT
381              
382             Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
383             free software; you can redistribute it and/or modify it under the same terms as
384             Perl itself.
385              
386             =cut
387