File Coverage

blib/lib/Bio/Gonzales/Seq/Validate/fasta.pm

Criterion	Covered	Total	%
statement	75	87	86.2
branch	32	46	69.5
condition	14	18	77.7
subroutine	8	8	100.0
pod	0	1	0.0
total	129	160	80.6

line	stmt	bran	cond	sub	pod	time	code
1							#Copyright (c) 2010 Joachim Bargsten <code at bargsten dot org>. All rights reserved.
2
3							package Bio::Gonzales::Seq::Validate::fasta;
4
5	1			1		118211	use Mouse;
	1					30363
	1					6
6
7	1			1		382	use warnings;
	1					2
	1					25
8	1			1		6	use strict;
	1					2
	1					24
9	1			1		18	use Carp;
	1					4
	1					1614
10
11							our $VERSION = '0.083'; # VERSION
12
13							#no use yet
14							our %alphabets = (
15							'dna' => [qw(A C G T R Y M K S W H B V D X N)],
16							'rna' => [qw(A C G U R Y M K S W H B V D X N)],
17							'protein' => [
18							qw(A R N D C Q E G H I L K M F U
19							P S T W X Y V B Z J O *)
20							],
21							);
22
23							#no use yet
24							our %alphabets_strict = (
25							'dna' => [qw( A C G T )],
26							'rna' => [qw( A C G U )],
27							'protein' => [
28							qw(A R N D C Q E G H I L K M F U
29							P S T W Y V O)
30							],
31							);
32
33							our @protein_alphabet = qw/A B C D E F G H I K L M N P Q R S T V W X Y Z * \-/;
34
35							our @nuc_alphabet = qw/A C G T U M R W S Y K V H D B X N \-/;
36
37							has max_seq_len_out => ( is => 'ro', default => 21 );
38							has fh => ( is => 'rw', required => 1 );
39							has prot_regex => (
40							is => 'ro',
41							default => sub {
42							my $r = join "", (@protein_alphabet);
43							return "[$r]+";
44							}
45							);
46							has nuc_regex => (
47							is => 'ro',
48							default => sub {
49							my $r = join "", (@nuc_alphabet);
50							return "[$r]+";
51							}
52							);
53							has seq_regex => (
54							is => 'ro',
55							default => sub {
56							my $r = join "", ( @protein_alphabet, @nuc_alphabet );
57							return "[$r]+";
58							}
59							);
60
61							has _error_cache => ( is => 'rw', default => sub { {} } );
62
63							sub validate {
64	1			1	0	9	my ($self) = @_;
65
66							#clear cache
67	1					6	$self->_error_cache( {} );
68
69	1					4	my $fh = $self->fh;
70	1					4	my $pre = $self->prot_regex;
71	1					10	my $nre = $self->nuc_regex;
72	1					3	my $sre = $self->seq_regex;
73
74	1					9	my $header;
75							my $seq_type;
76	1					0	my $is_dos;
77	1					0	my $probably_seq_in_header;
78	1					0	my %seen;
79
80	1					715	while (<$fh>) {
81	371	100	66			788	if ( !$is_dos && /\r\n/ ) {
82	1					3	$is_dos = 1;
83	1					6	$self->_add_error( 0, "File seems to be in DOS format" );
84							}
85
86							s/\r\n/\n/
87	371	50				1329	if ($is_dos);
88
89	371					683	chomp;
90
91	371	100	66			1190	if (/^>/) {
		100
		50
		50
92	182	100	100			435	if ( $header && $probably_seq_in_header ) {
		100
93	4					19	$self->_add_error(
94							( $. - 1 ),
95							"Wrong header format, seems to be sequence in the header: >>"
96							. $self->_shorten_seq($probably_seq_in_header) . "<<"
97							);
98	4					8	undef $probably_seq_in_header;
99							} elsif ($header) {
100	1					3	$self->_add_error( $. - 1, "No sequence after header." );
101							}
102							#header
103	182					296	for my $re ( $pre, $nre ) {
104	364	100	100			12444	$probably_seq_in_header = $1
			66
105							if ( ( (/\s+($re)\s$/) \|\| (/\s+($re)\s$/i) ) && length($1) > 3 );
106							}
107	182					435	$header = 1;
108
109	182	50				526	if (/^>([^\s]+)/) {
		0
110	182	50				504	$self->_add_error( $., "ID is very long: " . $self->_shorten_seq($1) )
111							if ( length($1) > 50 );
112							$self->_add_error( $., "ID is ambiguous. $1" )
113	182	100				437	if ( exists $seen{$1} );
114	182					485	$seen{$1} = 1;
115							} elsif (/^>\s*$/) {
116	0					0	$self->_add_error( $., "Empty header" );
117							} else {
118	0					0	$self->_add_error( $., "No ID in header." );
119							}
120	182					900	undef $seq_type;
121							} elsif (/[^>]+>/) {
122	4					15	$self->_add_error( $., "Wrong header format, '>' not in the beginning." );
123
124	4	100				34	$self->_add_error( $. - 1, "No sequence after header." )
125							if ($header);
126	4					11	$header = 1;
127	4					12	undef $seq_type;
128							} elsif (/^\s*$/) {
129	0					0	$self->_add_error( $., "Empty line." );
130							} elsif ( $header \|\| $seq_type ) {
131							#now sequence should be there
132	185	50	66			435	if ( $seq_type && !/^$seq_type$/i ) {
133	0					0	my @unknown = $self->_find_unknown_character( $_, $sre );
134	0	0				0	if ( @unknown > 1 ) {
135	0					0	$self->_add_error( $., "Found unknown characters: >>" . join( "", @unknown ) . "<<" );
136							} else {
137	0					0	$self->_add_error( $., "Sequence type switch (from nucleotide to protein or vice versa)" );
138							}
139							} else {
140	185	100				845	if (/^$pre$/i) {
		50
141	180					356	$seq_type = $pre;
142							} elsif (/^$nre$/i) {
143	0					0	$seq_type = $nre;
144							} else {
145	5					23	my @unknown = $self->_find_unknown_character( $_, $sre );
146	5	50				13	if ( @unknown > 0 ) {
147	5					24	$self->_add_error( $., "Found unknown characters: >>" . join( "", @unknown ) . "<<" );
148							} else {
149	0					0	$self->_add_error( $., "Found mixed protein and DNA/RNA sequence." );
150							}
151	5					17	$seq_type = $sre;
152							}
153							}
154	185					653	undef $header;
155							} else {
156	0					0	$self->_add_error( $., " ERROR IN THE SCRIPT" );
157							}
158							}
159	1	50				4	if ( keys %{ $self->_error_cache } > 0 ) {
	1					10
160	1					23	return $self->_error_cache;
161							} else {
162	0					0	return;
163							}
164							}
165
166							sub _add_error {
167	17			17		48	my ( $self, $line, $msg ) = @_;
168
169	17	100				95	$self->_error_cache->{$line} = [] unless defined $self->_error_cache->{$line};
170	17					27	push @{ $self->_error_cache->{$line} }, $msg;
	17					56
171							}
172
173							sub _find_unknown_character {
174	5			5		15	my ( $self, $seq, @re ) = @_;
175
176	5					10	my @unknown;
177	5					34	my @seq = split //, $seq;
178	5					11	for my $re (@re) {
179	5					9	for my $c (@seq) {
180	176	100				488	push @unknown, $c
181							unless ( $c =~ /$re/i );
182							}
183							}
184	5					50	return @unknown;
185							}
186
187							sub _shorten_seq {
188	4			4		11	my ( $self, $seq ) = @_;
189
190	4					14	my $len = $self->max_seq_len_out;
191
192	4					5	my $short_seq;
193	4	50				12	if ( length($seq) >= $len ) {
194	4					10	$short_seq = substr $seq, 0, 9;
195	4					9	$short_seq .= " ... ";
196	4					6	$short_seq .= substr $seq, -9;
197
198	4					20	return $short_seq;
199							} else {
200	0						return $seq;
201							}
202							}
203
204							1;