File Coverage

blib/lib/CracTools/Utils.pm

Criterion	Covered	Total	%
statement	186	270	68.8
branch	40	96	41.6
condition	12	27	44.4
subroutine	27	40	67.5
pod	27	27	100.0
total	292	460	63.4

line	stmt	bran	cond	sub	pod	time	code
1							package CracTools::Utils;
2							{
3							$CracTools::Utils::DIST = 'CracTools';
4							}
5							# ABSTRACT: A set of useful functions
6							$CracTools::Utils::VERSION = '1.25';
7	8			8		13658	use strict;
	8					11
	8					173
8	8			8		24	use warnings;
	8					8
	8					150
9
10	8			8		22	use Carp;
	8					13
	8					395
11	8			8		27	use Fcntl qw( SEEK_SET );
	8					9
	8					21024
12
13
14							sub reverseComplement($) {
15	1			1	1	6	my $dna = shift;
16
17							# reverse the DNA sequence
18	1					2	my $revcomp = reverse $dna;
19
20							# complement the reversed DNA sequence
21	1					3	$revcomp =~ tr/ACGTacgt/TGCAtgca/;
22	1					5	return $revcomp;
23							}
24
25
26							sub reverse_tab($) {
27	1			1	1	2	my $string = shift;
28	1					5	my @tab = split(/,/,$string);
29	1					1	my $newString;
30	1	50				4	if(@tab > 0) {
31	1					4	for (my $i=$#tab ; $i > 0 ; $i--){
32	3					3	$newString .= $tab[$i];
33	3					6	$newString .= ",";
34							}
35	1					2	$newString .= $tab[0];
36							}
37	1					4	return $newString;
38							}
39
40
41							sub isVersionGreaterOrEqual($$) {
42	0			0	1	0	my ($v1,$v2) = @_;
43	0					0	my @v1_nums = split(/\./,$v1);
44	0					0	my @v2_nums = split(/\./,$v2);
45	0					0	for(my $i = 0; $i < @v1_nums; $i++) {
46	0	0				0	if($v1_nums[$i] >= $v2_nums[$i]) {
47	0					0	return 1;
48							}else {
49	0					0	return 0;
50							}
51							}
52	0	0				0	if(scalar @v2_nums > @v1_nums) {
53	0					0	return 0;
54							} else {
55	0					0	return 1;
56							}
57							}
58
59
60							my %conversion_hash = ( '+' => 1, '-' => '-1', '1' => '+', '-1' => '-');
61							sub convertStrand($) {
62	50			50	1	43	my $strand = shift;
63	50	50				143	return defined $strand? $conversion_hash{$strand} : undef;
64							}
65
66							sub removeChrPrefix($) {
67	0			0	1	0	my $string = shift;
68	0					0	$string =~ s/^chr//i;
69	0					0	return $string;
70							}
71
72							sub addChrPrefix($) {
73	0			0	1	0	my $string = shift;
74	0					0	return "chr".removeChrPrefix($string);
75							}
76
77
78							our $Base64_BITNESS = 6;
79							our @Base64_ENCODING = qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9 + /);
80
81							# Encode error list into base64
82							sub encodePosListToBase64 {
83	1			1	1	2533	my @pos_list = @_;
84	1					1	my @bitList;
85							my $encoded_str;
86
87	1	50				7	return "" if @pos_list == 0;
88
89							# Compute the appropriate length for the bitList
90	1					2	my $length = $pos_list[-1] + 1;
91	1					4	my $bitList_length = int($length / $Base64_BITNESS);
92	1	50				4	if ($length % $Base64_BITNESS > 0) {
93	1					2	$bitList_length++;
94							}
95
96							# Put 0 at all position
97	1					3	for (my $i = 0; $i < $bitList_length; $i++) {
98	6					10	$bitList[$i] = 0;
99							}
100
101							# Create bit vector in Base64_ENCODING
102	1					3	foreach my $j (@pos_list) {
103	6					9	$bitList[int($j / $Base64_BITNESS)] \|= (1 << ($j % $Base64_BITNESS));
104							}
105
106							# Convert bitList into string_Base64_ENCODING
107	1					4	for(my $k=0; $k < $bitList_length; $k++) {
108	6					10	$encoded_str .= scalar($Base64_ENCODING[$bitList[$k]]);
109							}
110	1					5	return $encoded_str;
111							}
112
113
114							# Decode base 64 error list
115							sub decodePosListInBase64 {
116	1			1	1	4	my $encoded_str = shift;
117
118	1					4	my @encoded_list = split "", $encoded_str;
119	1					1	my (@index,@decoded_list);
120
121							#Looking for index of each char
122	1					2	foreach my $j (@encoded_list) {
123	6					12	my @ind = grep { $Base64_ENCODING[$_] eq $j } 0 .. $#Base64_ENCODING;
	384					279
124	6					13	push(@index,$ind[0]);
125							}
126
127							# Convert into list position
128	1					33	for (my $e = 0; $e < (0+@encoded_list);$e++) {
129	6					10	for(my $i=0; $i<$Base64_BITNESS; $i++) {
130	36	100				58	if ($index[$e] & (1 << $i)) {
131	6					6	my $pos = (int($i+$Base64_BITNESS*$e));
132	6					10	push(@decoded_list,$pos);
133							}
134							}
135							}
136
137	1					4	return @decoded_list;
138							}
139
140
141
142
143							sub seqFileIterator {
144	5			5	1	2723	my ($file,$format) = @_;
145
146	5	50				12	croak "Missing file in argument of seqFileIterator" if !defined $file;
147
148							# Get file handle for $file
149	5					9	my $fh = getReadingFileHandle($file);
150
151							# Automatic file extension detection
152	5	50				9	if(!defined $format) {
153	5	100				8	if($file =~ /\.(fasta\|fa)(\.\|$)/) {
		50
154	2					30	$format = 'fasta';
155							} elsif($file =~ /\.(fastq\|fq)(\.\|$)/) {
156	3					42	$format = 'fastq';
157							} else {
158	0					0	croak "Undefined file extension";
159							}
160							} else {
161	0					0	$format = lc $format;
162							}
163
164							# FASTA ITERATOR
165	5	100				12	if ($format eq 'fasta') {
		50
166							# Read prev line for FASTA because we dont know the number
167							# of line used for the sequence
168	2					15	my $prev_line = <$fh>;
169	2					3	chomp $prev_line;
170							return sub {
171	7			7		3711	my ($name,$seq,$qual);
172	7	100				11	if(defined $prev_line) {
173	5					15	($name) = $prev_line =~ />(.*)$/;
174	5					10	$prev_line = <$fh>;
175							# Until we find a new sequence identifier ">", we
176							# concatenate the lines corresponding to the sequence
177	5		100			24	while(defined $prev_line && $prev_line !~ /^>/) {
178	10					7	chomp $prev_line;
179	10					12	$seq .= $prev_line;
180	10					36	$prev_line = <$fh>;
181							}
182	5					19	return {name => $name, seq => $seq, qual => $qual};
183							} else {
184	2					4	return undef;
185							}
186	2					9	};
187							# FASTQ ITERATOR
188							} elsif ($format eq 'fastq') {
189							return sub {
190	7			7		1398	my ($name,$seq,$qual);
191	7					36	my $line = <$fh>;
192	7	100				28	($name) = $line =~ /@(.*)$/ if defined $line;
193	7	100				12	if(defined $name) {
194	6					10	$seq = <$fh>;
195	6					6	chomp $seq;
196	6					7	<$fh>; # skip second seq name (useless line)
197	6					7	$qual = <$fh>;
198	6					4	chomp $qual;
199	6					20	return {name => $name, seq => $seq, qual => $qual};
200							} else {
201	1					2	return undef;
202							}
203	3					13	};
204							} else {
205	0					0	croak "Undefined file format";
206							}
207							}
208
209
210							sub pairedEndSeqFileIterator {
211	1			1	1	1355	my($file1,$file2,$format) = @_;
212
213	1					3	my $it1 = seqFileIterator($file1,$format);
214	1					5	my $it2 = seqFileIterator($file2,$format);
215
216							return sub {
217	2			2		1491	my $entry1 = $it1->();
218	2					4	my $entry2 = $it2->();
219	2	50	33			9	if(defined $entry1 && defined $entry2) {
220	2					8	return { read1 => $entry1, read2 => $entry2 };
221							} else {
222	0					0	return undef;
223							}
224	1					4	};
225							}
226
227
228							sub writeSeq {
229	0			0	1	0	my ($fh,$format,$name,$seq,$qual) = @_;
230	0	0				0	if($format eq 'fasta') {
		0
231	0					0	print $fh ">$name\n$seq\n";
232							} elsif($format eq 'fastq') {
233	0					0	print $fh '@'."$name\n$seq\n+\n$qual\n";
234							} else {
235	0					0	die "Unknown file format. Must be either a FASTA or a FASTQ";
236							}
237							}
238
239
240							sub bedFileIterator {
241	1			1	1	3017	return getFileIterator(file => shift, type => 'bed');
242							}
243
244
245							sub gffFileIterator {
246	1			1	1	2413	my $file = shift;
247	1					2	my $type = shift;
248	1					5	return getFileIterator(file => $file, type => $type);
249							}
250
251
252							sub vcfFileIterator {
253	1			1	1	4179	my $file = shift;
254	1					4	return getFileIterator(file => $file, type => 'vcf');
255							}
256
257
258							sub chimCTFileIterator {
259	0			0	1	0	return getFileIterator(file => shift, type => 'chimCT');
260							}
261
262
263							sub bamFileIterator {
264	0			0	1	0	my ($file,$region) = @_;
265	0	0				0	$region = "" if !defined $region;
266
267	0					0	my $fh;
268	0	0				0	if($file =~ /\.bam$/) {
269	0	0				0	open($fh, "-\|", "samtools view $file $region" )or die "Cannot open $file, check if samtools are installed.";
270							} else {
271	0					0	$fh = getReadingFileHandle($file);
272							}
273
274							return sub {
275	0			0		0	return <$fh>;
276							}
277
278	0					0	}
279
280
281							sub getSeqFromIndexedRef {
282	0			0	1	0	my ($ref_file,$chr,$pos,$length,$format) = @_;
283	0	0				0	$format = 'fasta' if !defined $format;
284	0					0	$pos++; # +1 because samtools faidx is 1-based
285							# First we try without the "chr" prefix
286	0					0	my $region = removeChrPrefix($chr).":$pos-".($pos+$length-1);
287	0					0	my $fasta_seq = `samtools faidx $ref_file "$region"`;
288							# If it has not worked, we try without
289	0	0				0	if($fasta_seq !~ /^>\S+\n\S+/) {
290	0					0	$region = addChrPrefix($chr).":$pos-".($pos+$length-1);
291	0					0	$fasta_seq = `samtools faidx $ref_file "$region"`;
292							}
293	0	0				0	if($format eq 'raw') {
294	0					0	my ($seq) = $fasta_seq =~ /^>\S+\n(\S+)/;
295	0					0	return $seq;
296							}
297	0					0	return $fasta_seq;
298							}
299
300
301							sub parseBedLine {
302	1			1	1	2	my $line = shift;
303	1					3	my %args = @_;
304	1					5	my($chr,$start,$end,$name,$score,$strand,$thick_start,$thick_end,$rgb,$block_count,$block_size,$block_starts) = split("\t",$line);
305	1					2	my @blocks;
306
307							# Manage blocks if we have a bed12
308	1	50				3	if(defined $block_starts) {
309	1					3	my @block_size = split(",",$block_size);
310	1					3	my @block_starts = split(",",$block_starts);
311	1					2	my $cumulated_block_size = 0;
312	1					3	for(my $i = 0; $i < $block_count; $i++) {
313	2					12	push(@blocks,{size => $block_size[$i],
314							start => $block_starts[$i],
315							end => $block_starts[$i] + $block_size[$i],
316							block_start => $cumulated_block_size,
317							block_end => $cumulated_block_size + $block_size[$i],
318							ref_start => $block_starts[$i] + $start,
319							ref_end => $block_starts[$i] + $start + $block_size[$i],
320							});
321	2					4	$cumulated_block_size += $block_size[$i];
322							}
323							}
324
325	1					8	return { chr => $chr,
326							start => $start,
327							end => $end,
328							name => $name,
329							score => $score,
330							strand => $strand,
331							thick_start => $thick_start,
332							thick_end => $thick_end,
333							rgb => $rgb,
334							blocks => \@blocks,
335							};
336							}
337
338
339							sub parseGFFLine {
340	1			1	1	2	my $line = shift;
341	1					2	my $type = shift;
342	1					1	my $attribute_split;
343	1	50	0			3	if($type =~ /gff3/i) {
		0
344	1			4		3	$attribute_split = sub {my $attr = shift; return $attr =~ /(\S+)=(.*)/;};
	4					2
	4					17
345							} elsif ($type eq 'gtf' \|\| $type eq 'gff2') {
346	0			0		0	$attribute_split = sub {my $attr = shift; return $attr =~ /(\S+)\s+"(.*)"/;};
	0					0
	0					0
347							} else {
348	0					0	croak "Undefined GFF format (must be either gff3,gtf, or gff2)";
349							}
350	1					5	my($chr,$source,$feature,$start,$end,$score,$strand,$frame,$attributes) = split("\t",$line);
351	1					4	my @attributes_tab = split(";",$attributes);
352	1					1	my %attributes_hash;
353	1					2	foreach my $attr (@attributes_tab) {
354	4					6	my ($k,$v) = $attribute_split->($attr);
355	4					8	$attributes_hash{$k} = $v;
356							}
357	1					10	return { chr => $chr,
358							source => $source,
359							feature => $feature,
360							start => $start,
361							end => $end,
362							score => $score,
363							strand => $strand,
364							frame => $frame,
365							attributes => \%attributes_hash,
366							};
367							}
368
369
370							sub parseVCFLine {
371	1			1	1	3	my $line = shift;
372	1					6	my($chr,$pos,$id,$ref,$alt,$qual,$filter,$info) = split("\t",$line);
373
374	1	50				6	my @alts = defined $alt? split(",",$alt) : ();
375							my %infos = defined $info?
376	1	50				4	map { my ($k,$v) = split '='; $k => $v } split ';', $info : ();
	5					8
	5					10
377
378	1					7	return { chr => $chr,
379							pos => $pos,
380							id => $id,
381							ref => $ref,
382							alt => \@alts,
383							qual => $qual,
384							filter => $filter,
385							info => \%infos,
386							};
387							}
388
389
390							sub parseChimCTLine {
391	0			0	1	0	my $line = shift;
392	0					0	my($id,$name,$chr1,$pos1,$strand1,$chr2,$pos2,$strand2,$chim_value,$spanning_junction,$spanning_PE,$class,$comments,@others) = split("\t",$line);
393	0					0	my($sample,$chim_key) = split(":",$id);
394	0					0	my @comments = split(",",$comments);
395	0					0	my %comments;
396	0					0	foreach my $com (@comments){
397	0					0	my ($key,$val) = split("=",$com);
398	0					0	$comments{$key} = $val;
399							}
400	0					0	my %extend_fields;
401	0					0	foreach my $field (@others) {
402	0					0	my ($key,$val) = split("=",$field);
403	0	0	0			0	if(defined $key && defined $val) {
404	0					0	$extend_fields{$key} = $val;
405							}
406							}
407							return {
408	0					0	sample => $sample,
409							chim_key => $chim_key,
410							name => $name,
411							chr1 => $chr1,
412							pos1 => $pos1,
413							strand1 => $strand1,
414							chr2 => $chr2,
415							pos2 => $pos2,
416							strand2 => $strand2,
417							chim_value => $chim_value,
418							spanning_junction => $spanning_junction,
419							spanning_PE => $spanning_PE,
420							class => $class,
421							comments => \%comments,
422							extended_fields => \%extend_fields,
423							};
424							}
425
426
427							sub parseSAMLineLite {
428	1			1	1	2803	my $line = shift;
429	1					6	my ($qname,$flag,$rname,$pos,$mapq,$cigar,$rnext,$pnext,$tlen,$seq,$qual,@others) = split("\t",$line);
430	1					7	my @cigar_hash = map { { op => substr($_,-1), nb => substr($_,0,length($_)-1)} } $cigar =~ /(\d+\D)/g;
	3					11
431							return {
432	1					13	qname => $qname,
433							flag => $flag,
434							rname => $rname,
435							pos => $pos,
436							mapq => $mapq,
437							cigar => \@cigar_hash,
438							original_cigar => $cigar,
439							rnext => $rnext,
440							pnext => $pnext,
441							tlen => $tlen,
442							seq => $seq,
443							qual => $qual,
444							extended_fields => \@others,
445							};
446							}
447
448
449							sub parseCigarChain {
450	1			1	1	8	my $cigar_string = shift;
451	1					1	my @cigar;
452	1					4	while($cigar_string =~ /(\d+)(\S)/g) {
453	5					17	push @cigar, { op => $2, nb => $1 };
454							}
455	1					3	return \@cigar;
456							}
457
458
459
460							sub getFileIterator {
461	4			4	1	12	my %args = @_;
462	4					7	my $file = $args{file};
463	4					6	my $type = $args{type};
464	4					4	my $skip = $args{skip};
465	4					6	my $header_regex = $args{header_regex};
466	4					6	my $parsing_method = $args{parsing_method};
467	4					6	my @parsing_arguments = ();
468
469
470	4	50				13	croak "Missing arguments in getFileIterator" if !defined $file;
471
472	4	100	66			19	if(!defined $parsing_method && defined $type) {
473	3	100	66			31	if($type =~ /gff3/i \|\| $type eq 'gtf' \|\| $type eq 'gff2') {
		100	66
		50	0
		0
		0
474	1					2	$header_regex = '^#';
475	1			1		4	$parsing_method = sub { return parseGFFLine(@_,$type) };
	1					4
476	1					3	push (@parsing_arguments,$type);
477							} elsif($type =~ /bed/i) {
478	1					1	$header_regex = '(^track\s)\|(^#)';
479	1					3	$parsing_method = \&parseBedLine;
480							} elsif ($type =~ /vcf/i) {
481	1					3	$header_regex = '^#';
482	1					2	$parsing_method = \&parseVCFLine;
483							} elsif ($type =~ /chimCT/i) {
484	0					0	$header_regex = '^#';
485	0					0	$parsing_method = \&parseChimCTLine;
486							} elsif ($type =~ /SAM/i \|\| $type =~ /BAM/i) {
487	0					0	$header_regex = '^@';
488	0					0	$parsing_method = \&parseSAMLineLite;
489							} else {
490	0					0	croak "Undefined format type";
491							}
492							}
493
494							# set defaults
495							#$header_regex = '^#' if !defined $header_regex;
496	4	50		0		11	$parsing_method = sub{ return { line => shift } } if !defined $parsing_method;
	0					0
497
498							# We get a filehandle on the file
499	4					11	my $fh = getReadingFileHandle($file);
500
501							# $curr_pos will hold the SEEK_POS of the current_line
502	4					18	my $curr_pos = tell($fh);
503							# $line will hold the content of the current_line
504	4					37	my $line = <$fh>;
505
506
507							# if we need to skip lines
508	4	50				10	if(defined $skip) {
509	0					0	for(my $i = 0; $i < $skip; $i++) {
510	0					0	$curr_pos = tell($fh);
511	0					0	$line = <$fh>;
512							}
513							}
514
515							# Skip line that match a specific regex
516	4	50				10	if(defined $header_regex) {
517	4		66			63	while($line && $line =~ /$header_regex/) {
518	22					16	$curr_pos = tell($fh);
519	22					77	$line = <$fh>;
520							}
521							}
522
523							# The iterator itself (an unnamed subroutine
524							return sub {
525	35	100		35		55	if (defined $line) {
526	34					25	chomp $line;
527							# We parse the line with the appropriate methd
528	34					54	my $parsed_line = $parsing_method->($line,@parsing_arguments);
529							# Add seek pos to the output hashref
530	34					34	$parsed_line->{seek_pos} = $curr_pos;
531	34					34	$parsed_line->{_original_line} = $line;
532	34					36	$curr_pos = tell($fh);
533	34					52	$line = <$fh>; # Get next line
534	34					84	return $parsed_line;
535							} else {
536	1					18	return undef;
537							}
538	4					25	};
539							}
540
541
542							sub getReadingFileHandle {
543	10			10	1	10	my $file = shift;
544	10					10	my $fh;
545	10	50				28	if($file =~ /\.gz$/) {
546	0	0				0	open($fh,"gunzip -c $file \|") or die ("Cannot open $file");
547							} else {
548	10	50				120	open($fh,"< $file") or die ("Cannot open $file");
549							}
550	10					273	return $fh;
551							}
552
553
554							sub getWritingFileHandle {
555	0			0	1		my $file = shift;
556	0						my $fh;
557	0	0					if($file =~ /\.gz$/) {
558	0	0					open($fh,"\| gzip > $file") or die ("Cannot open $file");
559							} else {
560	0	0					open($fh,"> $file") or die ("Cannot open $file");
561							}
562	0						return $fh;
563							}
564
565
566							sub getLineFromSeekPos {
567	0			0	1		my($fh,$seek_pos) = @_;
568	0						seek($fh,$seek_pos,SEEK_SET);
569	0						my $line = <$fh>;
570	0						chomp($line);
571	0						return $line;
572							}
573
574							1;
575
576							__END__