File Coverage

lib/Chemistry/File/InChI/Parser.yp

Criterion	Covered	Total	%
statement	139	160	86.8
branch	28	34	82.3
condition	2	3	66.6
subroutine	46	59	77.9
pod	0	2	0.0
total	215	258	83.3

line	stmt	bran	cond	sub	pod	time	code
1							# Header section
2
3							%{
4
5	3			3		22	use strict;
	3					9
	3					126
6	3			3		19	use warnings;
	3					8
	3					230
7
8	3			3		1521	use Chemistry::Atom;
	3					80137
	3					246
9	3			3		2143	use Chemistry::Mol;
	3					62698
	3					237
10
11	3			3		32	use List::Util qw( first );
	3					8
	3					14114
12
13							my @LAYER_ORDER = qw( PREFIX FORMULA CONNECTIONS H_ATOMS CHARGE TETRAHEDRAL STEREOCHEMISTRY );
14
15							%}
16
17							%%
18	30			30	0	83
19	30	50				119	# Rules section
20
21							# The grammar is taken from https://github.com/metamolecular/inchi-grammar/blob/master/grammar.ebnf, commit 74a8858, MIT license
22
23							string: prefix
24	30			30		1530	{ 'PREFIX' }
25							\| string layer
26							{
27	90			90		4520	my $layer1_id = first { $LAYER_ORDER[$_] eq $_[1] } 0..$#LAYER_ORDER;
	182					446
28	90					677	my $layer2_id = first { $LAYER_ORDER[$_] eq $_[2] } 0..$#LAYER_ORDER;
	274					492
29	90	50				428	die "unknown layer $_[2]\n" unless defined $layer2_id;
30	90	50				223	die "duplicated layer $_[1]\n" if $layer1_id == $layer2_id;
31	90	50				215	die "incorrect layer order, $_[2] must appear before $_[1]\n" if $layer1_id > $layer2_id;
32	90					246	$_[2];
33							}
34							;
35
36							layer: formula
37	30			30		829	{ 'FORMULA' }
38							\| connections
39	30			30		1342	{ 'CONNECTIONS' }
40							\| h_atoms
41	28			28		1228	{ 'H_ATOMS' }
42							\| charge
43	0			0		0	{ 'CHARGE' }
44							\| tetrahedral
45	2			2		156	{ 'TETRAHEDRAL' }
46							\| stereochemistry
47	0			0		0	{ 'STEREOCHEMISTRY' }
48							;
49
50							formula: formula_first
51	30			30		3170	{ [ $_[1] ] }
52							\| formula formula_continuation
53	3			3		149	{ push @{$_[1]}, $_[2] }
	3					14
54							;
55
56							connections: '/' 'c'
57	0			0		0	{ $_[0]->{USER}{CURSOR}++ }
58							\| '/' 'c' graph
59	30			30		2833	{ $_[0]->{USER}{CURSOR}++ }
60							\| connections ';'
61	2			2		60	{ $_[0]->{USER}{CURSOR}++ }
62							\| connections ';' graph
63	1			1		94	{ $_[0]->{USER}{CURSOR}++ }
64							;
65
66							h_atoms: '/' 'h'
67	1			1		31	{ $_[0]->{USER}{CURSOR}++ }
68							\| '/' 'h' hydrogens
69	27			27		5634	{ $_[0]->{USER}{CURSOR}++ }
70							\| h_atoms ';'
71	1			1		31	{ $_[0]->{USER}{CURSOR}++ }
72							\| h_atoms ';' hydrogens
73	2			2		586	{ $_[0]->{USER}{CURSOR}++ }
74							;
75
76							tetrahedral: '/' 't'
77							\| '/' 't' tetrahedral_centers
78							\| tetrahedral ';'
79							\| tetrahedral ';' tetrahedral_centers
80							;
81
82							stereochemistry: '/' 's' '1'
83	0			0		0	{ $_[0]->{USER}{MOL}->attr( 'inchi/stereochemistry', $_[3] ) }
84							\| '/' 's' '2'
85	0			0		0	{ $_[0]->{USER}{MOL}->attr( 'inchi/stereochemistry', $_[3] ) }
86							;
87
88							# Production 'tail' is merged to 'graph' and 'body' for simplicity.
89
90							graph: chain
91							\| count '*' chain
92							;
93
94							# Return: The last atom in a chain
95							chain: index '-' index
96	31			31		2180	{ $_[0]->_add_bonds( $_[1], $_[3] ); return $_[3] }
	31					5302
97							\| index branches index
98	0			0		0	{ $_[0]->_add_bonds( $_[1], [ @{$_[2]}, $_[3] ] ); return $_[3] }
	0					0
	0					0
99							\| chain '-' index
100	95			95		6449	{ $_[0]->_add_bonds( $_[1], $_[3] ); return $_[3] }
	95					13458
101							\| chain branches index
102	64			64		4234	{ $_[0]->_add_bonds( $_[1], [ @{$_[2]}, $_[3] ] ); return $_[3] }
	64					303
	64					8663
103							;
104
105	74			74		4831	branch: '(' branch_body ')' { $_[2]->{bonds} } ;
106
107							# Return: Array with all atoms connected to the parent one
108							branch_body: index
109	74			74		5082	{ { bonds => [ $_[1] ], last => $_[1] } }
110							\| branch_body ',' index
111							{
112	3			3		246	push @{$_[1]->{bonds}}, $_[3];
	3					13
113	3					7	$_[1]->{last} = $_[3];
114	3					8	return $_[1];
115							}
116							\| branch_body '-' index
117							{
118	20			20		1499	$_[0]->_add_bonds( $_[1]->{last}, $_[3] );
119	20					2978	$_[1]->{last} = $_[3];
120	20					55	return $_[1];
121							}
122							\| branch_body branches index
123							{
124	10			10		699	$_[0]->_add_bonds( $_[1]->{last}, [ @{$_[2]}, $_[3] ] );
	10					58
125	10					1394	$_[1]->{last} = $_[3];
126	10					32	return $_[1];
127							}
128							;
129
130							branches: branch
131	74			74		3199	{ $_[1] }
132							\| branches branch
133	0			0		0	{ [ @{$_[1]}, @{$_[2]} ] }
	0					0
	0					0
134							;
135
136							hydrogens: vh_count
137							\| vh_count mh_counts
138							\| count '*' vh_count
139							\| count '*' vh_count mh_counts
140							\| mh_count
141							;
142
143							vh_count: virtual_hydrogens
144							\| vh_count ',' virtual_hydrogens
145							;
146
147							mh_count: mobile_hydrogens
148							\| mh_count mobile_hydrogens
149							;
150
151							mh_counts: ',' mh_count
152							\| mh_counts ',' mh_count
153							;
154
155							virtual_hydrogens: virtual_hydrogen_receivers 'H'
156	23			23		719	{ $_[0]->_add_hydrogens( $_[1] ) }
157							\| virtual_hydrogen_receivers 'H' count
158	28			28		931	{ $_[0]->_add_hydrogens( $_[1], $_[3] ) }
159							;
160
161							virtual_hydrogen_receivers: index
162	51			51		3369	{ [ $_[1] ] }
163							\| virtual_hydrogen_receivers ',' index
164							{
165	16			16		1066	push @{$_[1]}, $_[3];
	16					56
166	16					40	return $_[1];
167							}
168							\| virtual_hydrogen_receivers '-' index
169							{
170	38			38		2563	push @{$_[1]}, ($_[1]->[-1] + 1)..$_[3];
	38					302
171	38					126	return $_[1]
172							}
173							;
174
175							mobile_hydrogens: '(' 'H' comma_separated_indexes ')'
176							\| '(' 'H' '-' comma_separated_indexes ')'
177							\| '(' 'H' count '-' comma_separated_indexes ')'
178							\| '(' 'H' count comma_separated_indexes ')'
179							;
180
181							comma_separated_indexes: ',' index
182	13			13		886	{ [ $_[2] ] }
183							\| comma_separated_indexes ',' index
184							{
185	27			27		1772	push @{$_[1]}, $_[3];
	27					113
186	27					66	return $_[1];
187							}
188							;
189
190							tetrahedral_center: index '+'
191	2			2		210	{ $_[0]->_get_atom( $_[1] )->attr( 'inchi/chirality', $_[2] ) }
192							\| index '-'
193	5			5		406	{ $_[0]->_get_atom( $_[1] )->attr( 'inchi/chirality', $_[2] ) }
194							\| index '?'
195							;
196
197							tetrahedral_centers: tetrahedral_center
198							\| tetrahedral_centers ',' tetrahedral_center
199							;
200
201							charge: '/' 'q'
202	0			0		0	{ $_[0]->{USER}{MOL}->attr( 'inchi/charges', [] ) }
203							\| charge '+' natural
204	0			0		0	{ $_[0]->{USER}{MOL}->attr( 'inchi/charges' )->[$_[0]->{USER}{CURSOR}] = int( $_[3] ) }
205							\| charge '-' natural
206	0			0		0	{ $_[0]->{USER}{MOL}->attr( 'inchi/charges' )->[$_[0]->{USER}{CURSOR}] = int( -$_[3] ) }
207							\| charge ';'
208	0			0		0	{ $_[0]->{USER}{CURSOR}++ }
209							;
210
211							index: '1' \| count ;
212
213							count: '1' digit
214	118			118		12884	{ join '', @_[1..2] }
215							\| count digit
216	45			45		4817	{ join '', @_[1..2] }
217							\| twoplus
218							;
219
220							natural: '1'
221							\| twoplus
222							\| natural digit
223	0			0		0	{ join '', @_[1..2] }
224	30					15571	;
225
226							digit: '0' \| '1' \| twoplus ;
227
228							twoplus: '2' \| '3' \| '4' \| '5' \| '6' \| '7' \| '8' \| '9' ;
229
230							%%
231
232							# Footer section
233
234							sub parse
235							{
236	30			30	0	84	my( $self, $string ) = @_;
237
238	30					109	$self->YYData->{INPUT} = $string;
239	30					496	$self->{USER}{MOL} = Chemistry::Mol->new;
240	30					787	$self->{USER}{CURSOR} = 0;
241	30					179	$self->YYParse( yylex => \&_Lexer, yyerror => \&_Error, yydebug => 0 );
242
243	30					10213	return $self->{USER}{MOL};
244							}
245
246							sub _Lexer
247							{
248	1442			1442		81422	my( $self ) = @_;
249
250							# If the line is empty and the input is originating from the file,
251							# another line is read.
252	1442	50	66			3653	if( !$self->YYData->{INPUT} && $self->{USER}{FILEIN} ) {
253	0					0	my $filein = $self->{USER}{FILEIN};
254	0					0	$self->YYData->{INPUT} = <$filein>;
255	0					0	$self->{USER}{CHARNO} = 0;
256							}
257
258							# Prefix
259	1442	100				11768	if( $self->YYData->{INPUT} =~ s/^(InChI=1S?)// ) {
260	30					3399	return ( 'prefix', $1 );
	30					623
261							}
262
263							# Formula parts
264							# TODO: Check Hill order, require counts > 1
265	1412	100				11232	if( $self->YYData->{INPUT} =~ s/^([\/\.])([2-9]\|[1-9][0-9]+)?(([A-Z][a-z]?\d*)+)// ) {
266	33					651	my( $sep, $count, $formula ) = ( $1, $2, $3 );
267	33	100				116	$count = 1 unless $count;
268	33					59	my %atom_map;
269
270	33					215	while( $formula =~ /([A-Z][a-z]?)(\d*)/g ) {
271	103					392	my( $element, $count ) = ( $1, $2 );
272	103	100				343	next if $element eq 'H'; # H atoms will be added later
273	73	100				190	$count = 1 unless $count;
274	73					321	for (1..$count) {
275	294					891	my $atom = Chemistry::Atom->new( symbol => $element );
276	294					14201	$self->{USER}{MOL}->add_atom( $atom );
277	294					9365	$atom_map{scalar( keys %atom_map ) + 1} = $atom;
278							}
279							}
280
281	33	100				96	if( $sep eq '/') {
282	30					105	$self->{USER}{ATOM_MAPS} = [ \%atom_map ];
283	30					272	$self->{USER}{MOL}->attr( 'inchi/counts', [ $count ] );
284	30					726	return ( 'formula_first', $formula );
285							} else {
286	3					8	push @{$self->{USER}{ATOM_MAPS}}, \%atom_map;
	3					13
287	3					7	push @{$self->{USER}{MOL}->attr( 'inchi/counts' )}, $count;
	3					13
288	3					46	return ( 'formula_continuation', $formula );
289							}
290							}
291
292							# Reset cursor on 'h', 'q' or 't'
293	1379	100				11638	if( $self->YYData->{INPUT} =~ s/^([hqt])// ) {
294	30					312	$self->{USER}{CURSOR} = 0;
295	30					190	return ( $1, $1 );
296							}
297
298							# Remove unsupported layers
299	1349					10153	$self->YYData->{INPUT} =~ s/^(\/[pbmsifo][^\/]*)+//;
300
301							# Any other character
302	1349	100				9028	if( $self->YYData->{INPUT} =~ s/^(.)// ) {
303	1319					17789	return ( $1, $1 );
304							}
305
306	30					251	return ( '', '' );
307							}
308
309							sub _Error
310							{
311	0			0		0	my( $self ) = @_;
312	0					0	die 'ERROR: ', $self->YYData->{INPUT}, "\n";
313							}
314
315							sub _add_bonds
316							{
317	220			220		556	my( $self, $a, $b ) = @_;
318
319	220	100				819	my @bonds = ref $b eq 'ARRAY' ? @$b : $b;
320	220					528	for (@bonds) {
321	297					12156	$self->{USER}{MOL}->new_bond( atoms => [ $self->_get_atom( $a ),
322							$self->_get_atom( $_ ) ] );
323							}
324							}
325
326							sub _add_hydrogens
327							{
328	51			51		132	my( $self, $atoms, $count ) = @_;
329
330	51					149	my $atom_map = $self->{USER}{ATOM_MAPS}[$self->{USER}{CURSOR}];
331	51	50				149	my @atoms = map { $atom_map->{$_} }
	147					376
332							ref $atoms ? @$atoms : ( $atoms );
333	51	100				172	$count = 1 unless $count;
334	51					162	for my $atom (@atoms) {
335	147					12558	for (1..$count) {
336	252					14589	my $H = Chemistry::Atom->new( symbol => 'H' );
337	252					13216	$self->{USER}{MOL}->add_atom( $H );
338	252					7960	$self->{USER}{MOL}->new_bond( atoms => [ $atom, $H ] );
339							}
340							}
341							}
342
343							sub _get_atom
344							{
345	601			601		1130	my( $self, $atom ) = @_;
346
347	601					1272	my $atom_map = $self->{USER}{ATOM_MAPS}[$self->{USER}{CURSOR}];
348
349	601	100				1440	if( !exists $atom_map->{$atom} ) {
350							# If an atom with given index does not exist, this is probably a hydrogen.
351							# FIXME: Check if there are unused hydrogen atoms.
352
353	2					10	my $H = Chemistry::Atom->new( symbol => 'H' );
354	2					109	$self->{USER}{MOL}->add_atom( $H );
355	2					95	$atom_map->{$atom} = $H;
356							}
357
358	601					2107	return $atom_map->{$atom};
359							}
360
361							1;