File Coverage

lib/OMOP/CSV/Validator.pm

Criterion	Covered	Total	%
statement	70	98	71.4
branch	21	34	61.7
condition	3	11	27.2
subroutine	14	16	87.5
pod	0	5	0.0
total	108	164	65.8

line	stmt	bran	cond	sub	pod	time	code
1							package OMOP::CSV::Validator;
2
3	1			1		964225	use strict;
	1					3
	1					44
4	1			1		6	use warnings;
	1					3
	1					58
5	1			1		6	use utf8;
	1					2
	1					6
6	1			1		32	use JSON::XS;
	1					2
	1					76
7	1			1		6	use JSON::Validator;
	1					1
	1					6
8	1			1		56	use Text::CSV_XS;
	1					2
	1					72
9	1			1		14	use Scalar::Util qw(looks_like_number);
	1					1
	1					91
10	1			1		8	use Path::Tiny;
	1					2
	1					70
11	1			1		10	use Data::Dumper;
	1					2
	1					2194
12
13							our $VERSION = '0.03';
14
15							=head1 NAME
16
17							OMOP::CSV::Validator - Validates OMOP CDM CSV files against their expected data types
18
19							=head1 SYNOPSIS
20
21							use OMOP::CSV::Validator;
22
23							my $validator = OMOP::CSV::Validator->new();
24
25							# Load schemas from DDL
26							my $schemas = $validator->load_schemas_from_ddl($ddl_text);
27
28							# Retrieve specific table schema for a CSV file
29							my $schema = $validator->get_schema_from_csv_filename($csv_file, $schemas);
30
31							# Validate CSV file
32							my $errors = $validator->validate_csv_file($csv_file, $schema);
33							if (@$errors) {
34							print "Validation errors found:\n";
35							for my $err_info (@$errors) {
36							print "Row $err_info->{row}:\n";
37							for my $e (@{ $err_info->{errors} }) {
38							print " $e\n";
39							}
40							}
41							} else {
42							print "CSV is valid.\n";
43							}
44
45							=head1 DESCRIPTION
46
47							OMOP::CSV::Validator is a CLI tool and Perl module designed to validate OMOP Common Data Model (CDM) CSV files. It auto-generates JSON schemas from PostgreSQL DDL files and then validates CSV rows against those schemas.
48
49							=head1 METHODS
50
51							=cut
52
53							##########################################################################
54							# Constructor: new()
55							##########################################################################
56							sub new {
57	1			1	0	338752	my ( $class, %args ) = @_;
58	1					5	my $self = bless {}, $class;
59	1					4	return $self;
60							}
61
62							##########################################################################
63							# load_schemas_from_ddl($ddl_text)
64							#
65							# Parses all CREATE TABLE definitions from a PostgreSQL OMOP DDL
66							# and returns a hashref of JSON schemas keyed by table name (lowercase).
67							##########################################################################
68							sub load_schemas_from_ddl {
69	1			1	0	8	my ( $self, $ddl_text ) = @_;
70	1					7	return $self->_ddl_to_json_schemas($ddl_text);
71							}
72
73							##########################################################################
74							# _ddl_to_json_schemas($ddl_text) - private
75							#
76							# Internal subroutine that iterates over all CREATE TABLE blocks.
77							##########################################################################
78							sub _ddl_to_json_schemas {
79	1			1		3	my ( $self, $ddl_text ) = @_;
80	1					2	my %schemas;
81	1					28	while (
82							$ddl_text =~ /
83							CREATE\s+TABLE\s+\S+\.(\w+)\s*\( # capture table name (after schema qualifier)
84							(.*?) # capture everything inside parentheses
85							\)\s*; # until the closing parenthesis and semicolon
86							/gisx
87							)
88							{
89	2					11	my ( $table, $cols_block ) = ( lc $1, $2 );
90	2					10	$schemas{$table} = $self->_build_schema( $table, $cols_block );
91							}
92	1					6	return \%schemas;
93							}
94
95							##########################################################################
96							# _build_schema($table_name, $cols_block) - private
97							#
98							# Builds a JSON schema for one table from the column definitions.
99							##########################################################################
100							sub _build_schema {
101	2			2		6	my ( $self, $table_name, $cols_block ) = @_;
102	2					18	my $schema = {
103							'$schema' => 'http://json-schema.org/draft-07/schema#',
104							title => $table_name,
105							type => 'object',
106							properties => {},
107							required => [],
108							additionalProperties => 0,
109							};
110
111	2					40	for my $line ( grep /\S/, split /\n/, $cols_block ) {
112	24					311	$line =~ s/^\s+\|\s+$//g;
113	24					88	$line =~ s/,$//;
114	24	50				65	next if $line =~ /^--/; # Skip comment lines
115
116	24	50				130	if ( $line =~
117							/^(\w+)\s+([A-Za-z]+)(?:$(\d+(?:,\d+)?)$)?(?:\s+(NOT NULL))?/i )
118							{
119	24					134	my ( $col, $type, $length, $notnull ) =
120							( lc $1, lc $2, $3, defined $4 );
121	24					38	my $prop = {};
122
123	24	100				104	if ( $type =~ /int/ ) {
		100
		100
		100
		50
124	15					47	$prop->{type} = 'integer';
125	15					26	$prop->{_coerce} = 1;
126							}
127							elsif ( $type =~ /numeric\|real\|double/ ) {
128	1					2	$prop->{type} = 'number';
129	1					3	$prop->{_coerce} = 1;
130
131							}
132							elsif ( $type eq 'date' ) {
133	1					4	$prop->{type} = 'string';
134	1					3	$prop->{format} = 'date';
135							}
136							elsif ( $type =~ /timestamp/ ) {
137	2					4	$prop->{type} = 'string';
138	2					7	$prop->{format} = 'date-time';
139							}
140							elsif ( $type eq 'varchar' ) {
141	5					23	$prop->{type} = 'string';
142	5	50				11	if ( defined $length ) {
143
144							# Capture only the first number if a comma is present (e.g., varchar(10,2))
145	5	50				17	if ( $length =~ /^(\d+)/ ) {
146	5					16	$prop->{maxLength} = int($1);
147							}
148							}
149							}
150							else {
151	0					0	$prop->{type} = 'string';
152							}
153
154							# If the column is not marked as NOT NULL, allow null values
155	24	100				49	unless ($notnull) {
156	16					87	$prop->{type} = [ $prop->{type}, 'null' ];
157							}
158
159	24					64	$schema->{properties}{$col} = $prop;
160	24	100				59	push @{ $schema->{required} }, $col if $notnull;
	8					22
161							}
162							}
163	2					28	return $schema;
164							}
165
166							##########################################################################
167							# get_schema_from_csv_filename($csv_filename, $schemas)
168							#
169							# Derives the table name from the CSV file's basename (e.g. PERSON.csv → person)
170							# and returns the corresponding schema from the provided hashref.
171							##########################################################################
172							sub get_schema_from_csv_filename {
173	0			0	0	0	my ( $self, $csv_filename, $schemas ) = @_;
174	0					0	( my $table = lc $csv_filename ) =~ s{^.*/}{}; # remove any path
175	0					0	$table =~ s/\.csv$//i; # remove .csv extension
176	0					0	return $schemas->{$table};
177							}
178
179							##########################################################################
180							# validate_csv_file($csv_file, $schema, $sep)
181							#
182							# Reads the CSV file, coerces numeric fields, and validates each row against
183							# the provided JSON schema. Returns an arrayref of error info (each entry is a
184							# hashref with keys 'row' and 'errors').
185							##########################################################################
186							sub validate_csv_file {
187	0			0	0	0	my ( $self, $csv_file, $schema, $sep ) = @_;
188	0		0			0	$sep //= ',';
189
190	0					0	my $csv_handle = path($csv_file)->openr_utf8;
191	0	0				0	my $csv =
192							Text::CSV_XS->new(
193							{ binary => 1, sep_char => $sep, auto_diag => 1, blank_is_undef => 1 } )
194							or die "Cannot use CSV: " . Text::CSV_XS->error_diag();
195
196	0					0	my $header = $csv->getline($csv_handle);
197	0					0	$csv->column_names(@$header);
198
199	0					0	my $records = $csv->getline_hr_all($csv_handle);
200	0					0	$csv_handle->close;
201
202	0					0	my @errors;
203	0					0	my $validator = JSON::Validator->new;
204	0					0	$validator->schema($schema);
205
206	0					0	for my $i ( 0 .. $#$records ) {
207	0					0	my $record = $records->[$i];
208
209							# Coerce numeric fields according to the schema.
210	0					0	for my $col ( keys %{ $schema->{properties} } ) {
	0					0
211	0	0				0	if ( exists $record->{$col} ) {
212	0					0	my $prop = $schema->{properties}->{$col};
213	0	0	0			0	if ( defined $prop->{_coerce} && $prop->{_coerce} ) {
214							$record->{$col} =
215	0					0	$self->dotify_and_coerce_number( $record->{$col} );
216							}
217							}
218							}
219
220							# Validate
221	0					0	my $errs = [ $validator->validate($record) ];
222	0	0				0	if (@$errs) {
223
224							# row number excludes header → row index + 1
225	0					0	push @errors, { row => $i + 1, errors => $errs };
226							}
227							}
228	0					0	return \@errors;
229							}
230
231							##########################################################################
232							# dotify_and_coerce_number($val)
233							#
234							# Converts a CSV string value to a number if it looks numeric.
235							# Returns undef if the value is empty or "\N".
236							##########################################################################
237							sub dotify_and_coerce_number {
238	48			48	0	45368	my ( $self, $val ) = @_;
239	48	100	33			261	return undef unless ( defined $val && $val ne '' && $val ne '\\N' );
			66
240	39					76	( my $tr_val = $val ) =~ tr/,/./;
241	39	100				189	return looks_like_number($tr_val) ? 0 + $tr_val : $val;
242							}
243
244							=head1 AUTHOR
245
246							Written by Manuel Rueda, PhD. Info about CNAG can be found at L.
247
248							=head1 LICENSE
249
250							This module is free software; you may redistribute it and/or modify it under the same terms as Perl itself.
251
252							=cut
253
254							1;