File Coverage

blib/lib/BioUtil/Seq.pm

Criterion	Covered	Total	%
statement	74	188	39.3
branch	19	82	23.1
condition	2	6	33.3
subroutine	14	24	58.3
pod	15	17	88.2
total	124	317	39.1

line	stmt	bran	cond	sub	pod	time	code
1							package BioUtil::Seq;
2
3							require Exporter;
4							@ISA = (Exporter);
5							@EXPORT = qw(
6							FastaReader
7							read_sequence_from_fasta_file
8							write_sequence_to_fasta_file
9							format_seq
10
11							validate_sequence
12							complement
13							revcom
14							base_content
15							degenerate_seq_to_regexp
16							degenerate_seq_match_sites
17							dna2peptide
18							codon2aa
19							generate_random_seqence
20
21							shuffle_sequences
22							rename_fasta_header
23							clean_fasta_header
24							);
25
26	1			1		20180	use vars qw($VERSION);
	1					3
	1					66
27
28	1			1		25	use 5.010_000;
	1					3
	1					41
29	1			1		11	use strict;
	1					6
	1					44
30	1			1		6	use warnings FATAL => 'all';
	1					1
	1					64
31
32	1			1		6	use List::Util qw(shuffle);
	1					2
	1					3427
33
34							=head1 NAME
35
36							BioUtil::Seq - Utilities for sequence
37
38							Some great modules like BioPerl provide many robust solutions.
39							However, it is not easy to install for someone in some platforms.
40							And for some simple task scripts, a lite module may be a good choice.
41							So I reinvented some wheels and added some useful utilities into this module,
42							hoping it would be helpful.
43
44							=head1 VERSION
45
46							Version 2015.0309
47
48							=cut
49
50							our $VERSION = 2015.0309;
51
52							=head1 EXPORT
53
54							FastaReader
55							read_sequence_from_fasta_file
56							write_sequence_to_fasta_file
57							format_seq
58
59							validate_sequence
60							complement
61							revcom
62							base_content
63							degenerate_seq_to_regexp
64							degenerate_seq_match_sites
65							dna2peptide
66							codon2aa
67							generate_random_seqence
68
69							shuffle_sequences
70							rename_fasta_header
71							clean_fasta_header
72
73							=head1 SYNOPSIS
74
75							use BioUtil::Seq;
76
77
78							=head1 SUBROUTINES/METHODS
79
80
81							=head2 FastaReader
82
83							FastaReader is a fasta file parser using closure.
84							FastaReader returns an anonymous subroutine, when called, it
85							return a fasta record which is reference of an array
86							containing fasta header and sequence.
87
88							FastaReader could also read from STDIN when the file name is "STDIN" or "stdin".
89
90							A boolean argument is optional. If set as "true", spaces including blank, tab,
91							"return" ("\r") and "new line" ("\n") symbols in sequence will not be trimed.
92
93							FastaReader speeds up by utilizing the special Perl variable $/ (set to "\n>"),
94							with kind help of Mario Roy, author of MCE
95							(https://code.google.com/p/many-core-engine-perl/). A lot of optimizations were
96							also done by him.
97
98							Example:
99
100							# do not trim the spaces and \n
101							# $not_trim = 1;
102							# my $next_seq = FastaReader("test.fa", $not_trim);
103
104							# read from STDIN
105							# my $next_seq = FastaReader('STDIN');
106
107							# read from file
108							my $next_seq = FastaReader("test.fa");
109
110							while ( my $fa = &$next_seq() ) {
111							my ( $header, $seq ) = @$fa;
112
113							print ">$header\n$seq\n";
114							}
115
116							=cut
117
118							sub FastaReader {
119	2			2	1	2	my ( $file, $not_trim ) = @_;
120
121	2					3	my ( $open_flg, $finished ) = ( 0, 0 );
122	2					5	my ( $fh, $pos, $head ) = (undef) x 3;
123
124	2	50	33			13	if ( $file =~ /^STDIN$/i ) { # from stdin
		50
125	0					0	$fh = *STDIN;
126							}
127							elsif ( ref $file eq '' or ref $file eq 'SCALAR' ) { # from file
128	2	50				191	open $fh, '<', $file or die "fail to open file: $file!\n";
129	2					4	$open_flg = 1;
130							}
131							else { # glob, i.e. given file handler
132	0					0	$fh = $file;
133							}
134
135	2					10	local $/ = \1; ## read one byte
136	2					18	while (<$fh>) { ## until reaching ">"
137	2	50				6	last if $_ eq '>';
138							}
139							return sub {
140	6	50		6		10	return if $finished;
141
142	6					10	local $/ = "\n>"; ## set input record separator
143	6					23	while (<$fh>) {
144							## trim trailing ">", part of $/. faster than s/\r?\n>$//
145	4	100				10	substr( $_, -1, 1, '' ) if substr( $_, -1, 1 ) eq '>';
146
147							## extract header and sequence
148							# faster than ( $head, $seq ) = split( /\n/, $_, 2 );
149	4					8	$pos = index( $_, "\n" ) + 1;
150	4					8	$head = substr( $_, 0, $pos - 1 );
151
152							# $_ becomes sequence, to save memory
153							# $seq = substr( $_, $pos );
154	4					5	substr( $_, 0, $pos, '' );
155
156							## trim trailing "\r" in header
157	4	50				9	chop $head if substr( $head, -1, 1 ) eq "\r";
158
159	4	50				9	if ( length $head > 0 ) {
160
161							# faster than $seq =~ s/\s//g unless $not_trim;
162							# $seq =~ tr/\t\r\n //d unless $not_trim;
163	4	50				9	$_ =~ tr/\t\r\n //d unless $not_trim;
164	4					16	return [ $head, $_ ];
165							}
166							}
167
168	2	50				16	close $fh if $open_flg;
169	2					3	$finished = 1;
170	2					6	return;
171	2					28	};
172							}
173
174							sub FastaReader_old {
175	0			0	0	0	my ( $file, $not_trim ) = @_;
176
177	0					0	my ( $last_header, $seq_buffer ) = ( '', '' ); # buffer for header and seq
178	0					0	my ( $header, $seq ) = ( '', '' ); # current header and seq
179	0					0	my $finished = 0;
180
181	0					0	my ( $fh, $is_stdin ) = ( undef, 0 );
182	0	0				0	if ( $file =~ /^STDIN$/i ) {
183	0					0	( $fh, $is_stdin ) = ( *STDIN, 1 );
184							}
185							else {
186	0	0				0	open $fh, "<", $file or die "fail to open file: $file!\n";
187							}
188
189							return sub {
190	0	0		0		0	return undef if $finished; # end of file
191
192	0					0	while (<$fh>) {
193	0					0	s/^\s+//; # remove the space at the front of line
194
195	0	0				0	if (/^>(.*)/) { # header line
196	0					0	( $header, $last_header ) = ( $last_header, $1 );
197	0					0	( $seq, $seq_buffer ) = ( $seq_buffer, '' );
198
199							# only output fasta records with non-blank header
200	0	0				0	if ( $header ne '' ) {
201	0	0				0	$seq =~ s/\s+//g unless $not_trim;
202	0					0	return [ $header, $seq ];
203							}
204							}
205							else {
206	0					0	$seq_buffer .= $_; # append seq
207							}
208							}
209	0	0				0	close $fh unless $is_stdin;
210	0					0	$finished = 1;
211
212							# last record. only output fasta records with non-blank header
213	0	0				0	if ( $last_header ne '' ) {
214	0	0				0	$seq_buffer =~ s/\s+//g unless $not_trim;
215	0					0	return [ $last_header, $seq_buffer ];
216							}
217	0					0	};
218							}
219
220							=head2 read_sequence_from_fasta_file
221
222							Read all sequences from fasta file.
223
224							Example:
225
226							my $seqs = read_sequence_from_fasta_file($file);
227							for my $header (keys %$seqs) {
228							my $seq = $$seqs{$header};
229							print ">$header\n$seq\n";
230							}
231
232							=cut
233
234							sub read_sequence_from_fasta_file {
235	2			2	1	13	my ( $file, $not_trim ) = @_;
236	2					3	my $seqs = {};
237
238	2					7	my $next_seq = FastaReader( $file, $not_trim );
239	2					5	while ( my $fa = &$next_seq() ) {
240							# my ( $header, $seq ) = @$fa;
241							# $$seqs{$header} = $seq;
242	4					13	$$seqs{ $fa->[0] } = $fa->[1];
243							}
244
245	2					16	return $seqs;
246							}
247
248							=head2 write_sequence_to_fasta_file
249
250							Example:
251
252							my $seq = {"seq1" => "acgagaggag"};
253							write_sequence_to_fasta_file($seq, "seq.fa");
254
255							=cut
256
257							sub write_sequence_to_fasta_file {
258	1			1	1	385	my ( $seqs, $file, $n ) = @_;
259	1	50				4	unless ( ref $seqs eq 'HASH' ) {
260	0					0	warn "seqs should be reference of hash\n";
261	0					0	return 0;
262							}
263	1	50				3	$n = 70 unless defined $n;
264
265	1	50				92	open my $fh2, ">$file" or die "failed to write to $file\n";
266	1					6	for ( keys %$seqs ) {
267	2					12	print $fh2 ">$_\n", format_seq( $$seqs{$_}, $n ), "\n";
268							}
269	1					46	close $fh2;
270							}
271
272							=head2 format_seq
273
274							Format sequence to readable text
275
276							Example:
277
278							printf ">%s\n%s", $head, format_seq($seq, 60);
279
280							=cut
281
282							sub format_seq {
283	2			2	1	4	my ( $s, $n ) = @_;
284	2	50				6	$n = 70 unless defined $n;
285	2	50	33			22	unless ( $n =~ /^\d+$/ and $n > 0 ) {
286	0					0	warn "n should be positive integer\n";
287	0					0	return $s;
288							}
289
290	2					3	my $s2 = '';
291	2					1	my ( $j, $int );
292	2					7	$int = int( ( length $s ) / $n );
293	2					4	for ( $j = 0; $j <= $int; $j++ ) {
294	2					9	$s2 .= substr( $s, $j * $n, $n ) . "\n";
295							}
296	2					43	return $s2;
297							}
298
299							=head2 validate_sequence
300
301							Validate a sequence.
302
303							Legale symbols:
304
305							DNA: ACGTRYSWKMBDHV
306							RNA: ACGURYSWKMBDHV
307							Protein: ACDEFGHIKLMNPQRSTVWY
308							gap and space: - *.
309
310							Example:
311
312							if (validate_sequence($seq)) {
313							# do some thing
314							}
315
316							=cut
317
318							sub validate_sequence {
319	2			2	1	212	my ($seq) = @_;
320	2	100				13	return 0 if $seq =~ /[^\.\-\s_*ABCDEFGHIKLMNPQRSTUVWY]/i;
321	1					2	return 1;
322							}
323
324							=head2 complement
325
326							Complement sequence
327
328							IUPAC nucleotide code: ACGTURYSWKMBDHVN
329
330							http://droog.gs.washington.edu/parc/images/iupac.html
331
332							code base Complement
333							A A T
334							C C G
335							G G C
336							T/U T A
337
338							R A/G Y
339							Y C/T R
340							S C/G S
341							W A/T W
342							K G/T M
343							M A/C K
344
345							B C/G/T V
346							D A/G/T H
347							H A/C/T D
348							V A/C/G B
349
350							X/N A/C/G/T X
351							. not A/C/G/T
352							or- gap
353
354							my $comp = complement($seq);
355
356							=cut
357
358							sub complement {
359	1			1	1	12	my ($s) = @_;
360	1					3	$s
361							=~ tr/ACGTURYMKSWBDHVNacgturymkswbdhvn/TGCAAYRKMSWVHDBNtgcaayrkmswvhdbn/;
362	1					3	return $s;
363							}
364
365							=head2 revcom
366
367							Reverse complement sequence
368
369							my $recom = revcom($seq);
370
371							=cut
372
373							sub revcom {
374	1			1	1	4	my $rc = reverse complement( $_[0] );
375	1					4	return $rc;
376							}
377
378							=head2 base_content
379
380							Example:
381
382							my $gc_cotent = base_content('gc', $seq);
383
384							=cut
385
386							sub base_content {
387	1			1	1	4	my ( $bases, $seq ) = @_;
388	1	50				7	if ( $seq eq '' ) {
389	0					0	return 0;
390							}
391
392	1					2	my $sum = 0;
393	1					42	$sum += $seq =~ s/$_/$_/ig for split "", $bases;
394	1					29	return sprintf "%.4f", $sum / length $seq;
395							}
396
397							=head2 degenerate_seq_to_regexp
398
399							Translate degenerate sequence to regular expression
400
401							=cut
402
403							sub degenerate_seq_to_regexp {
404	0			0	1		my ($seq) = @_;
405	0						my %bases = (
406							'A' => 'A',
407							'T' => 'T',
408							'U' => 'U',
409							'C' => 'C',
410							'G' => 'G',
411							'R' => '[AG]',
412							'Y' => '[CT]',
413							'M' => '[AC]',
414							'K' => '[GT]',
415							'S' => '[CG]',
416							'W' => '[AT]',
417							'H' => '[ACT]',
418							'B' => '[CGT]',
419							'V' => '[ACG]',
420							'D' => '[AGT]',
421							'N' => '[ACGT]',
422							);
423	0						return join '', map { $bases{$_} } split //, $seq;
	0
424							}
425
426							=head2 degenerate_seq_match_sites
427
428							Find all sites matching degenerat subseq.
429
430							See https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl
431
432							=cut
433
434							sub degenerate_seq_match_sites {
435	0			0	1		my ( $r, $s ) = @_;
436
437							# original regexp length
438	0						my $r2 = $r;
439	0						$r2 =~ s/\[[^\[\]]+?\]/_/g;
440	0						my $len = length $r2;
441
442	0						my @sites = ();
443	0						my $pos = -1;
444	0						while ( $s =~ /($r)/ig ) {
445	0						$pos = pos $s;
446	0						push @sites, [ $pos - $len + 1, $pos, $1];
447	0						pos $s = $pos - $len + 1;
448							}
449	0						return \@sites;
450							}
451
452							=head2 dna2peptide
453
454							Translate DNA sequence into a peptide
455
456							=cut
457
458							sub dna2peptide {
459	0			0	1		my ($dna) = @_;
460	0						my $protein = '';
461
462							# Translate each three-base codon to an amino acid, and append to a protein
463	0						for ( my $i = 0; $i < ( length($dna) - 2 ); $i += 3 ) {
464	0						$protein .= codon2aa( substr( $dna, $i, 3 ) );
465							}
466	0						return $protein;
467							}
468
469							=head2 codon2aa
470
471							Translate a DNA 3-character codon to an amino acid
472
473							=cut
474
475							sub codon2aa {
476	0			0	1		my ($codon) = @_;
477	0						$codon = uc $codon;
478	0						my %genetic_code = (
479							'TCA' => 'S', # Serine
480							'TCC' => 'S', # Serine
481							'TCG' => 'S', # Serine
482							'TCT' => 'S', # Serine
483							'TTC' => 'F', # Phenylalanine
484							'TTT' => 'F', # Phenylalanine
485							'TTA' => 'L', # Leucine
486							'TTG' => 'L', # Leucine
487							'TAC' => 'Y', # Tyrosine
488							'TAT' => 'Y', # Tyrosine
489							'TAA' => '_', # Stop
490							'TAG' => '_', # Stop
491							'TGC' => 'C', # Cysteine
492							'TGT' => 'C', # Cysteine
493							'TGA' => '_', # Stop
494							'TGG' => 'W', # Tryptophan
495							'CTA' => 'L', # Leucine
496							'CTC' => 'L', # Leucine
497							'CTG' => 'L', # Leucine
498							'CTT' => 'L', # Leucine
499							'CCA' => 'P', # Proline
500							'CCC' => 'P', # Proline
501							'CCG' => 'P', # Proline
502							'CCT' => 'P', # Proline
503							'CAC' => 'H', # Histidine
504							'CAT' => 'H', # Histidine
505							'CAA' => 'Q', # Glutamine
506							'CAG' => 'Q', # Glutamine
507							'CGA' => 'R', # Arginine
508							'CGC' => 'R', # Arginine
509							'CGG' => 'R', # Arginine
510							'CGT' => 'R', # Arginine
511							'ATA' => 'I', # Isoleucine
512							'ATC' => 'I', # Isoleucine
513							'ATT' => 'I', # Isoleucine
514							'ATG' => 'M', # Methionine
515							'ACA' => 'T', # Threonine
516							'ACC' => 'T', # Threonine
517							'ACG' => 'T', # Threonine
518							'ACT' => 'T', # Threonine
519							'AAC' => 'N', # Asparagine
520							'AAT' => 'N', # Asparagine
521							'AAA' => 'K', # Lysine
522							'AAG' => 'K', # Lysine
523							'AGC' => 'S', # Serine
524							'AGT' => 'S', # Serine
525							'AGA' => 'R', # Arginine
526							'AGG' => 'R', # Arginine
527							'GTA' => 'V', # Valine
528							'GTC' => 'V', # Valine
529							'GTG' => 'V', # Valine
530							'GTT' => 'V', # Valine
531							'GCA' => 'A', # Alanine
532							'GCC' => 'A', # Alanine
533							'GCG' => 'A', # Alanine
534							'GCT' => 'A', # Alanine
535							'GAC' => 'D', # Aspartic Acid
536							'GAT' => 'D', # Aspartic Acid
537							'GAA' => 'E', # Glutamic Acid
538							'GAG' => 'E', # Glutamic Acid
539							'GGA' => 'G', # Glycine
540							'GGC' => 'G', # Glycine
541							'GGG' => 'G', # Glycine
542							'GGT' => 'G', # Glycine
543							);
544
545	0	0					if ( exists $genetic_code{$codon} ) {
546	0						return $genetic_code{$codon};
547							}
548							else {
549	0						print STDERR "Bad codon \"$codon\"!!\n";
550	0						exit;
551							}
552							}
553
554							=head2 generate_random_seqence
555
556							Example:
557
558							my @alphabet = qw/a c g t/;
559							my $seq = generate_random_seqence( \@alphabet, 50 );
560
561							=cut
562
563							sub generate_random_seqence {
564	0			0	1		my ( $alphabet, $length ) = @_;
565	0	0					unless ( ref $alphabet eq 'ARRAY' ) {
566	0						warn "alphabet should be ref of array\n";
567	0						return 0;
568							}
569
570	0						my $n = @$alphabet;
571	0						my $seq;
572	0						$seq .= $$alphabet[ int rand($n) ] for ( 1 .. $length );
573	0						return $seq;
574							}
575
576							=head2 shuffle sequences
577
578							Example:
579
580							shuffle_sequences($file, "$file.shuf.fa");
581
582							=cut
583
584							sub shuffle_sequences {
585	0			0	0		my ( $file, $file_out, $not_trim ) = @_;
586	0						my $seqs = read_sequence_from_fasta_file( $file, $not_trim );
587	0						my @keys = shuffle( keys %$seqs );
588
589	0	0					$file_out = "$file.shuffled.fa" unless defined $file_out;
590	0	0					open my $fh2, ">$file_out" or die "fail to write file $file_out\n";
591	0						print $fh2 ">$_\n$$seqs{$_}\n" for @keys;
592	0						close $fh2;
593
594	0						return $file_out;
595							}
596
597							=head2 rename_fasta_header
598
599							Rename fasta header with regexp.
600
601							Example:
602
603							# delete some symbols
604							my $n = rename_fasta_header('[^a-z\d\s\-\_\[\]\\|]', '', $file, "$file.rename.fa");
605							print "$n records renamed\n";
606
607							=cut
608
609							sub rename_fasta_header {
610	0			0	1		my ( $regex, $repalcement, $file, $outfile ) = @_;
611
612	0	0					open my $fh, "<", $file or die "fail to open file: $file\n";
613	0	0					open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n";
614
615	0						my $head = '';
616	0						my $n = 0;
617	0						while (<$fh>) {
618	0	0					if (/^\s>(.)\r?\n/) {
619	0						$head = $1;
620	0	0					if ( $head =~ /$regex/ ) {
621	0						$head =~ s/$regex/$repalcement/g;
622	0						$n++;
623							}
624	0						print $fh2 ">$head\n";
625							}
626							else {
627	0						print $fh2 $_;
628							}
629							}
630	0						close $fh;
631	0						close $fh2;
632
633	0						return $n;
634							}
635
636							=head2 clean_fasta_header
637
638							Rename given symbols to repalcement string.
639							Because, some symbols in fasta header will cause unexpected result.
640
641							Example:
642
643							my $file = "test.fa";
644							my $n = clean_fasta_header($file, "$file.rename.fa");
645							# replace any symbol in (\/:*?"<>\|) with '', i.e. deleting.
646							# my $n = clean_fasta_header($file, "$file.rename.fa", '', '\/:*?"<>\|');
647							print "$n records renamed\n";
648
649							=cut
650
651							sub clean_fasta_header {
652	0			0	1		my ( $file, $outfile, $replacement, $symbols ) = @_;
653	0	0					$replacement = "_" unless defined $replacement;
654
655	0						my @default = split //, '\/:*?"<>\|';
656	0	0					$symbols = \@default unless defined $symbols;
657	0	0					unless ( ref $symbols eq 'ARRAY' ) {
658	0						warn "symbols should be ref of array\n";
659	0						return 0;
660							}
661	0						my $re = join '', map { quotemeta $_ } @$symbols;
	0
662	0	0					open my $fh, "<", $file or die "fail to open file: $file\n";
663	0	0					open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n";
664
665	0						my $head = '';
666	0						my $n = 0;
667	0						while (<$fh>) {
668	0	0					if (/^\s>(.)\r?\n/) {
669	0						$head = $1;
670	0	0					if ( $head =~ /[$re]/ ) {
671	0						$head =~ s/[$re]/$replacement/g;
672	0						$n++;
673							}
674	0						print $fh2 ">$head\n";
675							}
676							else {
677	0						print $fh2 $_;
678							}
679							}
680	0						close $fh;
681	0						close $fh2;
682
683	0						return $n;
684							}
685
686							1;