File Coverage

blib/lib/Chemistry/File/Formula.pm

Criterion	Covered	Total	%
statement	98	99	98.9
branch	31	38	81.5
condition	7	11	63.6
subroutine	12	13	92.3
pod	3	8	37.5
total	151	169	89.3

line	stmt	bran	cond	sub	pod	time	code
1							package Chemistry::File::Formula;
2							$VERSION = '0.37';
3							# $Id: Formula.pm,v 1.16 2009/05/10 19:37:59 itubert Exp $
4
5	3			3		2916	use strict;
	3					7
	3					178
6	3			3		23	use base "Chemistry::File";
	3					7
	3					1599
7	3			3		889	use Chemistry::Mol;
	3					10
	3					159
8	3			3		21	use Carp;
	3					6
	3					293
9	3			3		4077	use Text::Balanced qw(extract_bracketed);
	3					49007
	3					5293
10
11							=head1 NAME
12
13							Chemistry::File::Formula - Molecular formula reader/formatter
14
15							=head1 SYNOPSIS
16
17							use Chemistry::File::Formula;
18
19							my $mol = Chemistry::Mol->parse("H2O");
20							print $mol->print(format => formula);
21							print $mol->formula; # this is a shorthand for the above
22							print $mol->print(format => formula,
23							formula_format => "%s%d{_%d});
24
25							=cut
26
27							Chemistry::Mol->register_format('formula');
28
29							=head1 DESCRIPTION
30
31							This module converts a molecule object to a string with the formula and back.
32							It registers the 'formula' format with Chemistry::Mol. Besides its obvious
33							use, it is included in the Chemistry::Mol distribution because it is a very
34							simple example of a Chemistry::File derived I/O module.
35
36							=head2 Writing formulas
37
38							The format can be specified as a printf-like string with the following control
39							sequences, which are specified with the formula_format parameter to $mol->print
40							or $mol->write.
41
42							=over
43
44							=item %s symbol
45
46							=item %D number of atoms
47
48							=item %d number of atoms, included only when it is greater than one
49
50							=item %d{substr} substr is only included when number of atoms is greater than
51							one
52
53							=item %j{substr} substr is inserted between the formatted string for each
54							element. (The 'j' stands for 'joiner'.) The format should have only one joiner,
55							but its location in the format string doesn't matter.
56
57							=item %% a percent sign
58
59							=back
60
61							If no format is specified, the default is "%s%d". Some examples follow. Let's
62							assume that the formula is C2H6O, as it would be formatted by default.
63
64							=over
65
66							=item C<< %s%D >>
67
68							Like the default, but include explicit indices for all atoms.
69							The formula would be formatted as "C2H6O1"
70
71							=item C<< %s%d{EsubE%dE/subE} >>
72
73							HTML format. The output would be
74							"CEsubE2E/subEHEsubE6E/subEO".
75
76							=item C<< %D %s%j{, } >>
77
78							Use a comma followed by a space as a joiner. The output would be
79							"2 C, 6 H, 1 O".
80
81							=back
82
83							=head3 Symbol Sort Order
84
85							The elements in the formula are sorted by default in the "Hill order", which
86							means that:
87
88							1) if the formula contains carbon, C goes first, followed by H,
89							and the rest of the symbols in alphabetical order. For example, "CH2BrF".
90
91							2) if there is no carbon, all the symbols (including H) are listed
92							alphabetically. For example, "BrH".
93
94							It is possible to supply a custom sorting subroutine with the 'formula_sort'
95							option. It expects a subroutine reference that takes a hash reference
96							describing the formula (similar to what is returned by parse_formula, discussed
97							below), and that returns a list of symbols in the desired order.
98
99							For example, this will sort the symbols in reverse asciibetical order:
100
101							my $formula = $mol->print(
102							format => 'formula',
103							formula_sort => sub {
104							my $formula_hash = shift;
105							return reverse sort keys %$formula_hash;
106							}
107							);
108
109							=head2 Parsing Formulas
110
111							Formulas can also be parsed back into Chemistry::Mol objects.
112							The formula may have parentheses and square or triangular brackets, and
113							it may have the following abbreviations:
114
115							Me => '(CH3)',
116							Et => '(CH3CH2)',
117							Bu => '(C4H9)',
118							Bn => '(C6H5CH2)',
119							Cp => '(C5H5)',
120							Ph => '(C6H5)',
121							Bz => '(C6H5CO)',
122
123							The formula may also be preceded by a number, which multiplies the whole
124							formula. Some examples of valid formulas:
125
126							=over
127
128							Formula Equivalent to
129							--------------------------------------------------------------
130							CH3(CH2)3CH3 C5H12
131							C6H3Me3 C9H12
132							2Cu[NH3]4(NO3)2 Cu2H24N12O12
133							2C(C[C5]4)3 C152
134							2C(C(C(C)5)4)3 C152
135							C 1 0 H 2 2 C10H22 (whitespace is completely ignored)
136
137							=back
138
139							When a formula is parsed, a molecule object is created which consists of
140							the set of the atoms in the formula (no bonds or coordinates, of course).
141							The atoms are created in alphabetical order, so the molecule object for C2H5Br
142							would have the atoms in the following sequence: Br, C, C, H, H, H, H, H.
143
144							If you don't want to create a molecule object, but would rather have a simple
145							hash with the number of atoms for each element, use the C
146							method:
147
148							my %formula = Chemistry::File::Formula->parse_formula("C2H6O");
149							use Data::Dumper;
150							print Dumper \%formula;
151
152							which prints something like
153
154							$VAR1 = {
155							'H' => 6,
156							'O' => 1,
157							'C' => 2
158							};
159
160							The C method is called internally by the C method.
161
162							=head3 Non-integer numbers in formulas
163
164							The C method can also accept formulas that contain
165							floating-point numbers, such as H1.5N0.5. The numbers must be positive, and
166							numbers smaller than one should include a leading zero (e.g., 0.9, not .9).
167
168							When formulas with non-integer numbers of atoms are turned into molecule
169							objects as described in the previous section, the number of atoms is always
170							B. For example, H1.5N0.5 will produce a molecule object with two
171							hydrogen atoms and one nitrogen atom.
172
173							There is currently no way of I formulas with non-integer numbers;
174							perhaps a future version will include an "occupancy" property for atoms that
175							will result in non-integer formulas.
176
177							=cut
178
179							sub parse_string {
180	14			14	1	41	my ($self, $string, %opts) = @_;
181	14		50			41	my $mol_class = $opts{mol_class} \|\| "Chemistry::Mol";
182	14		50			77	my $atom_class = $opts{atom_class} \|\| "Chemistry::Atom";
183	14		50			84	my $bond_class = $opts{bond_class} \|\| "Chemistry::Bond";
184
185	14					53	my $mol = $mol_class->new;
186	14					45	my %formula = $self->parse_formula($string);
187	14					63	for my $sym (sort keys %formula) {
188	31					92	for (my $i = 0; $i < $formula{$sym}; ++$i) {
189	516					1992	$mol->add_atom($atom_class->new(symbol => $sym));
190							}
191							}
192	14					126	return $mol;
193							}
194
195							sub write_string {
196	16			16	1	43	my ($self, $mol, %opts) = @_;
197	16					21	my @formula_parts;
198
199	16		100			75	my $format = $opts{formula_format} \|\| "%s%d"; # default format
200	16					59	my $fh = $mol->formula_hash;
201	16					39	$format =~ s/%%/\\%/g; # escape %% with a \
202	16					27	my $joiner = "";
203	16	50				49	$joiner = $1 if $format =~ s/(?
204
205	16					20	my @symbols;
206	16	100				38	if ($opts{formula_sort}) {
207	1					5	@symbols = $opts{formula_sort}($fh);
208							} else {
209	15					47	@symbols = $self->sort_symbols($fh);
210							}
211
212	16					57	for my $sym (@symbols) {
213	42					56	my $s = $format;
214	42					58	my $n = $fh->{$sym};
215	42					141	$s =~ s/(?
216	42					62	$s =~ s/(?
217	42	100				64	$s =~ s/(? 1 ? $1 : ''/eg; # %d{}
	3					9
218	42	100				131	$s =~ s/(? 1 ? $n : ''/eg; # %d
	40					186
219	42					68	$s =~ s/\\(.)/$1/g; # other \ escapes
220	42					109	push @formula_parts, $s;
221							}
222	16					132	return join($joiner, @formula_parts);
223							}
224
225							sub sort_symbols {
226	15			15	0	22	my ($self, $formula_hash) = @_;
227	15					55	my @symbols = keys %$formula_hash;
228	15	100				41	if ($formula_hash->{C}) {
229							# C and H first, followed by alphabetical order
230	13					517	s/^([CH])$/\0$1/ for @symbols;
231	13					45	@symbols = sort @symbols;
232	13					102	s/^\0([CH])$/$1/ for @symbols;
233	13					69	return @symbols;
234							} else {
235							# simple alphabetical order
236	2					11	return sort @symbols;
237							}
238							}
239
240							sub file_is {
241	0			0	1	0	return 0; # no files are identified automatically as having this format
242							}
243
244							### Code derived from formula.pl by Brent Gregersen follows
245
246							my %macros = (
247							Me => '(CH3)',
248							Et => '(CH3CH2)',
249							Bu => '(C4H9)',
250							Bn => '(C6H5CH2)',
251							Cp => '(C5H5)',
252							Ph => '(C6H5)',
253							Bz => '(C6H5CO)',
254							# Ac is an element
255							# Pr is an element
256							);
257
258
259							sub parse_formula {
260	16			16	0	1876	my ($self, $formula) = @_;
261	16					19	my (%elements);
262
263							#check balancing
264	16	50				42	return %elements if (!ParensBalanced($formula));
265
266							# replace other grouping with normal parens
267	16					39	$formula =~ tr/<>{}[]/()()()/;
268
269							# get rid of any spaces
270	16					40	$formula =~ s/\s+//g;
271
272							# perform macro expansion
273	16					57	foreach (keys(%macros)) {
274	112					1043	$formula =~ s/$_/$macros{$_}/g;
275							}
276
277							# determine initial compound coeficent
278	16	100				87	my $coef = ($formula =~ s/^(\d+\.?\d*)//) ? $1 : 1.0;
279
280							# recursively process rest of formula
281	16					43	return internal_formula_parser($formula, $coef, %elements);
282							}
283
284							sub internal_formula_parser {
285	35			35	0	77	my ($formula, $coef, %form) = @_;
286	35					33	my $tmp_coef;
287
288	35					367	my ($extract, $remainder, $prefix) =
289							extract_bracketed($formula, '()', '[^(]*');
290
291	35	100	66			5374	if (defined($extract) and $extract ne '') {
292	15					68	$extract =~ s/^$(.*)$$/$1/;
293	15	100				73	if ($remainder =~ s/^(\d+\.?\d)(.)$/$2/) {
294	13					31	$tmp_coef = $1 * $coef;
295							} else {
296	2					3	$tmp_coef = $coef;
297							}
298
299							# get formula of prefix ( it has no parens)
300	15	100				54	%form = add_formula_strings($prefix, $coef, %form) if ($prefix ne '');
301
302							# check remainder for more parens
303	15	100				40	%form = internal_formula_parser($remainder, $coef, %form)
304							if ($remainder ne '');
305
306							# check extract for more parens
307	15					116	%form =
308							internal_formula_parser($extract, $tmp_coef, %form);
309							## we already know this is ne ''
310							} else { # get formula of complete string
311	20	50				77	%form = add_formula_strings($remainder, $coef, %form)
312							if ($remainder ne '');
313							}
314	35					167	return %form;
315							}
316
317							sub add_formula_strings {
318	30			30	0	90	my ($formula, $coef, %elements) = @_;
319
320							# print "Getting Formula of $formula\n";
321	30	50				383	$formula =~ /^(?:([A-Z][a-z])(\d+\.?\d)?)+$/o # XXX new
322							or croak "Invalid Portion of Formula $formula";
323	30					130	while ($formula =~ m/([A-Z][a-z])(\d+\.?\d)?/go) { # XXX new
324	55					113	my ($elm, $count) = ($1, $2);
325	55	100				100	$count = 1 unless defined $count;
326	55	100				128	if (defined $elements{$elm}) {
327	18					66	$elements{$elm} += $count * $coef;
328							} else {
329	37					245	$elements{$elm} = $count * $coef;
330							}
331							}
332	30					140	return %elements;
333							}
334
335							sub ParensBalanced {
336	16			16	0	24	my ($form) = @_;
337	16					23	my @stack = ();
338	16					75	my %pairs = (
339							'<' => '>',
340							'{' => '}',
341							'[' => ']',
342							'(' => ')'
343							);
344
345	16					74	while ($form =~ m/([<>(){}\]\[])/go) {
346	24					41	my $current = $1;
347	24	100				698	if ($current =~ /[<({\[]/) {
348	12					19	push(@stack, $current);
349	12					46	next;
350							}
351	12	50				27	return 0 if (scalar(@stack) == 0);
352	12	50				57	return 0 if ($current ne $pairs{ pop @stack});
353							}
354	16	50				90	return @stack ? 0 : 1;
355							}
356
357							1;
358
359							=head1 VERSION
360
361							0.37
362
363							=head1 SEE ALSO
364
365							L, L
366
367							For discussion about Hill order, just search the web for C
368							order">. The original reference is I B<1900>, I<22>,
369							478-494. L.
370
371							The PerlMol website L
372
373							=head1 AUTHOR
374
375							Ivan Tubert-Brohman .
376
377							Formula parsing code contributed by Brent Gregersen.
378
379							Patch for non-integer formulas by Daniel Scott.
380
381							=head1 COPYRIGHT
382
383							Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
384							free software; you can redistribute it and/or modify it under the same terms as
385							Perl itself.
386
387							=cut
388