File Coverage

blib/lib/CracTools/Utils.pm

Criterion	Covered	Total	%
statement	155	239	64.8
branch	33	84	39.2
condition	12	27	44.4
subroutine	24	37	64.8
pod	24	24	100.0
total	248	411	60.3

line	stmt	bran	cond	sub	pod	time	code
1							package CracTools::Utils;
2							{
3							$CracTools::Utils::DIST = 'CracTools';
4							}
5							# ABSTRACT: A set of useful functions
6							$CracTools::Utils::VERSION = '1.22';
7	10			10		20678	use strict;
	10					16
	10					266
8	10			10		49	use warnings;
	10					17
	10					249
9
10	10			10		45	use Carp;
	10					23
	10					594
11	10			10		48	use Fcntl qw( SEEK_SET );
	10					36
	10					33600
12
13
14							sub reverseComplement($) {
15	1			1	1	8	my $dna = shift;
16
17							# reverse the DNA sequence
18	1					5	my $revcomp = reverse $dna;
19
20							# complement the reversed DNA sequence
21	1					3	$revcomp =~ tr/ACGTacgt/TGCAtgca/;
22	1					8	return $revcomp;
23							}
24
25
26							sub reverse_tab($) {
27	1			1	1	4	my $string = shift;
28	1					8	my @tab = split(/,/,$string);
29	1					2	my $newString;
30	1	50				4	if(@tab > 0) {
31	1					6	for (my $i=$#tab ; $i > 0 ; $i--){
32	3					5	$newString .= $tab[$i];
33	3					11	$newString .= ",";
34							}
35	1					3	$newString .= $tab[0];
36							}
37	1					5	return $newString;
38							}
39
40
41							sub isVersionGreaterOrEqual($$) {
42	0			0	1	0	my ($v1,$v2) = @_;
43	0					0	my @v1_nums = split(/\./,$v1);
44	0					0	my @v2_nums = split(/\./,$v2);
45	0					0	for(my $i = 0; $i < @v1_nums; $i++) {
46	0	0				0	if($v1_nums[$i] >= $v2_nums[$i]) {
47	0					0	return 1;
48							}else {
49	0					0	return 0;
50							}
51							}
52	0	0				0	if(scalar @v2_nums > @v1_nums) {
53	0					0	return 0;
54							} else {
55	0					0	return 1;
56							}
57							}
58
59
60							my %conversion_hash = ( '+' => 1, '-' => '-1', '1' => '+', '-1' => '-');
61							sub convertStrand($) {
62	53			53	1	75	my $strand = shift;
63	53					254	return $conversion_hash{$strand};
64							}
65
66							sub removeChrPrefix($) {
67	0			0	1	0	my $string = shift;
68	0					0	$string =~ s/^chr//;
69	0					0	return $string;
70							}
71
72							sub addChrPrefix($) {
73	0			0	1	0	my $string = shift;
74	0					0	return "chr".removeChrPrefix($string);
75							}
76
77
78							sub seqFileIterator {
79	5			5	1	4778	my ($file,$format) = @_;
80
81	5	50				19	croak "Missing file in argument of seqFileIterator" if !defined $file;
82
83							# Get file handle for $file
84	5					16	my $fh = getReadingFileHandle($file);
85
86							# Automatic file extension detection
87	5	50				17	if(!defined $format) {
88	5	100				16	if($file =~ /\.(fasta\|fa)(\.\|$)/) {
		50
89	2					28	$format = 'fasta';
90							} elsif($file =~ /\.(fastq\|fq)(\.\|$)/) {
91	3					75	$format = 'fastq';
92							} else {
93	0					0	croak "Undefined file extension";
94							}
95							} else {
96	0					0	$format = lc $format;
97							}
98
99							# FASTA ITERATOR
100	5	100				23	if ($format eq 'fasta') {
		50
101							# Read prev line for FASTA because we dont know the number
102							# of line used for the sequence
103	2					25	my $prev_line = <$fh>;
104	2					21	chomp $prev_line;
105							return sub {
106	7			7		6397	my ($name,$seq,$qual);
107	7	100				22	if(defined $prev_line) {
108	5					28	($name) = $prev_line =~ />(.*)$/;
109	5					15	$prev_line = <$fh>;
110							# Until we find a new sequence identifier ">", we
111							# concatenate the lines corresponding to the sequence
112	5		100			36	while(defined $prev_line && $prev_line !~ /^>/) {
113	10					14	chomp $prev_line;
114	10					23	$seq .= $prev_line;
115	10					65	$prev_line = <$fh>;
116							}
117	5					39	return {name => $name, seq => $seq, qual => $qual};
118							} else {
119	2					6	return undef;
120							}
121	2					17	};
122							# FASTQ ITERATOR
123							} elsif ($format eq 'fastq') {
124							return sub {
125	7			7		2773	my ($name,$seq,$qual);
126	7					55	my $line = <$fh>;
127	7	100				45	($name) = $line =~ /@(.*)$/ if defined $line;
128	7	100				18	if(defined $name) {
129	6					16	$seq = <$fh>;
130	6					11	chomp $seq;
131	6					14	<$fh>; # skip second seq name (useless line)
132	6					16	$qual = <$fh>;
133	6					8	chomp $qual;
134	6					32	return {name => $name, seq => $seq, qual => $qual};
135							} else {
136	1					5	return undef;
137							}
138	3					20	};
139							} else {
140	0					0	croak "Undefined file format";
141							}
142							}
143
144
145							sub pairedEndSeqFileIterator {
146	1			1	1	2546	my($file1,$file2,$format) = @_;
147
148	1					5	my $it1 = seqFileIterator($file1,$format);
149	1					7	my $it2 = seqFileIterator($file2,$format);
150
151							return sub {
152	2			2		2769	my $entry1 = $it1->();
153	2					7	my $entry2 = $it2->();
154	2	50	33			16	if(defined $entry1 && defined $entry2) {
155	2					9	return { read1 => $entry1, read2 => $entry2 };
156							} else {
157	0					0	return undef;
158							}
159	1					7	};
160							}
161
162
163							sub writeSeq {
164	0			0	1	0	my ($fh,$format,$name,$seq,$qual) = @_;
165	0	0				0	if($format eq 'fasta') {
		0
166	0					0	print $fh ">$name\n$seq\n";
167							} elsif($format eq 'fastq') {
168	0					0	print $fh '@'."$name\n$seq\n+\n$qual\n";
169							} else {
170	0					0	die "Unknown file format. Must be either a FASTA or a FASTQ";
171							}
172							}
173
174
175							sub bedFileIterator {
176	1			1	1	5556	return getFileIterator(file => shift, type => 'bed');
177							}
178
179
180							sub gffFileIterator {
181	1			1	1	4382	my $file = shift;
182	1					3	my $type = shift;
183	1					5	return getFileIterator(file => $file, type => $type);
184							}
185
186
187							sub vcfFileIterator {
188	1			1	1	7964	my $file = shift;
189	1					5	return getFileIterator(file => $file, type => 'vcf');
190							}
191
192
193							sub chimCTFileIterator {
194	0			0	1	0	return getFileIterator(file => shift, type => 'chimCT');
195							}
196
197
198							sub bamFileIterator {
199	0			0	1	0	my ($file,$region) = @_;
200	0	0				0	$region = "" if !defined $region;
201
202	0					0	my $fh;
203	0	0				0	if($file =~ /\.bam$/) {
204	0	0				0	open($fh, "-\|", "samtools view $file $region" )or die "Cannot open $file, check if samtools are installed.";
205							} else {
206	0					0	$fh = getReadingFileHandle($file);
207							}
208
209							return sub {
210	0			0		0	return <$fh>;
211							}
212
213	0					0	}
214
215
216							sub getSeqFromIndexedRef {
217	0			0	1	0	my ($ref_file,$chr,$pos,$length,$format) = @_;
218	0	0				0	$format = 'fasta' if !defined $format;
219	0					0	$pos++; # +1 because samtools faidx is 1-based
220							# First we try without the "chr" prefix
221	0					0	my $region = removeChrPrefix($chr).":$pos-".($pos+$length-1);
222	0					0	my $fasta_seq = `samtools faidx $ref_file "$region"`;
223							# If it has not worked, we try without
224	0	0				0	if($fasta_seq !~ /^>\S+\n\S+/) {
225	0					0	$region = addChrPrefix($chr).":$pos-".($pos+$length-1);
226	0					0	$fasta_seq = `samtools faidx $ref_file "$region"`;
227							}
228	0	0				0	if($format eq 'raw') {
229	0					0	my ($seq) = $fasta_seq =~ /^>\S+\n(\S+)/;
230	0					0	return $seq;
231							}
232	0					0	return $fasta_seq;
233							}
234
235
236							sub parseBedLine {
237	1			1	1	3	my $line = shift;
238	1					4	my %args = @_;
239	1					8	my($chr,$start,$end,$name,$score,$strand,$thick_start,$thick_end,$rgb,$block_count,$block_size,$block_starts) = split("\t",$line);
240	1					4	my @blocks;
241
242							# Manage blocks if we have a bed12
243	1	50				5	if(defined $block_starts) {
244	1					4	my @block_size = split(",",$block_size);
245	1					4	my @block_starts = split(",",$block_starts);
246	1					2	my $cumulated_block_size = 0;
247	1					5	for(my $i = 0; $i < $block_count; $i++) {
248	2					17	push(@blocks,{size => $block_size[$i],
249							start => $block_starts[$i],
250							end => $block_starts[$i] + $block_size[$i],
251							block_start => $cumulated_block_size,
252							block_end => $cumulated_block_size + $block_size[$i],
253							ref_start => $block_starts[$i] + $start,
254							ref_end => $block_starts[$i] + $start + $block_size[$i],
255							});
256	2					9	$cumulated_block_size += $block_size[$i];
257							}
258							}
259
260	1					12	return { chr => $chr,
261							start => $start,
262							end => $end,
263							name => $name,
264							score => $score,
265							strand => $strand,
266							thick_start => $thick_start,
267							thick_end => $thick_end,
268							rgb => $rgb,
269							blocks => \@blocks,
270							};
271							}
272
273
274							sub parseGFFLine {
275	1			1	1	3	my $line = shift;
276	1					2	my $type = shift;
277	1					2	my $attribute_split;
278	1	50	0			7	if($type =~ /gff3/i) {
		0
279	1			4		6	$attribute_split = sub {my $attr = shift; return $attr =~ /(\S+)=(.*)/;};
	4					8
	4					29
280							} elsif ($type eq 'gtf' \|\| $type eq 'gff2') {
281	0			0		0	$attribute_split = sub {my $attr = shift; return $attr =~ /(\S+)\s+"(.*)"/;};
	0					0
	0					0
282							} else {
283	0					0	croak "Undefined GFF format (must be either gff3,gtf, or gff2)";
284							}
285	1					9	my($chr,$source,$feature,$start,$end,$score,$strand,$frame,$attributes) = split("\t",$line);
286	1					5	my @attributes_tab = split(";",$attributes);
287	1					2	my %attributes_hash;
288	1					3	foreach my $attr (@attributes_tab) {
289	4					10	my ($k,$v) = $attribute_split->($attr);
290	4					94	$attributes_hash{$k} = $v;
291							}
292	1					18	return { chr => $chr,
293							source => $source,
294							feature => $feature,
295							start => $start,
296							end => $end,
297							score => $score,
298							strand => $strand,
299							frame => $frame,
300							attributes => \%attributes_hash,
301							};
302							}
303
304
305							sub parseVCFLine {
306	1			1	1	4	my $line = shift;
307	1					8	my($chr,$pos,$id,$ref,$alt,$qual,$filter,$info) = split("\t",$line);
308	1					5	my @alts = split(",",$alt);
309	1					2	my %infos;
310	1					6	my @infos = split(";",$info);
311	1					4	foreach (@infos) {
312	5					16	my($k,$v) = split("=",$_);
313	5					16	$infos{$k} = $v;
314							}
315
316	1					14	return { chr => $chr,
317							pos => $pos,
318							id => $id,
319							ref => $ref,
320							alt => \@alts,
321							qual => $qual,
322							filter => $filter,
323							info => \%infos,
324							};
325							}
326
327
328							sub parseChimCTLine {
329	0			0	1	0	my $line = shift;
330	0					0	my($id,$name,$chr1,$pos1,$strand1,$chr2,$pos2,$strand2,$chim_value,$spanning_junction,$spanning_PE,$class,$comments,@others) = split("\t",$line);
331	0					0	my($sample,$chim_key) = split(":",$id);
332	0					0	my @comments = split(",",$comments);
333	0					0	my %comments;
334	0					0	foreach my $com (@comments){
335	0					0	my ($key,$val) = split("=",$com);
336	0					0	$comments{$key} = $val;
337							}
338	0					0	my %extend_fields;
339	0					0	foreach my $field (@others) {
340	0					0	my ($key,$val) = split("=",$field);
341	0	0	0			0	if(defined $key && defined $val) {
342	0					0	$extend_fields{$key} = $val;
343							}
344							}
345							return {
346	0					0	sample => $sample,
347							chim_key => $chim_key,
348							chr1 => $chr1,
349							pos1 => $pos1,
350							strand1 => $strand1,
351							chr2 => $chr2,
352							pos2 => $pos2,
353							strand2 => $strand2,
354							chim_value => $chim_value,
355							spanning_junction => $spanning_junction,
356							spanning_PE => $spanning_PE,
357							class => $class,
358							comments => \%comments,
359							extended_fields => \%extend_fields,
360							};
361							}
362
363
364							sub parseSAMLineLite {
365	1			1	1	5298	my $line = shift;
366	1					10	my ($qname,$flag,$rname,$pos,$mapq,$cigar,$rnext,$pnext,$tlen,$seq,$qual,@others) = split("\t",$line);
367	1					10	my @cigar_hash = map { { op => substr($_,-1), nb => substr($_,0,length($_)-1)} } $cigar =~ /(\d+\D)/g;
	3					18
368							return {
369	1					72	qname => $qname,
370							flag => $flag,
371							rname => $rname,
372							pos => $pos,
373							mapq => $mapq,
374							cigar => \@cigar_hash,
375							original_cigar => $cigar,
376							rnext => $rnext,
377							pnext => $pnext,
378							tlen => $tlen,
379							seq => $seq,
380							qual => $qual,
381							extended_fields => \@others,
382							};
383							}
384
385
386
387							sub getFileIterator {
388	5			5	1	22	my %args = @_;
389	5					12	my $file = $args{file};
390	5					14	my $type = $args{type};
391	5					11	my $skip = $args{skip};
392	5					48	my $header_regex = $args{header_regex};
393	5					10	my $parsing_method = $args{parsing_method};
394	5					12	my @parsing_arguments = ();
395
396
397	5	50				22	croak "Missing arguments in getFileIterator" if !defined $file;
398
399	5	100	66			33	if(!defined $parsing_method && defined $type) {
400	3	100	66			47	if($type =~ /gff3/i \|\| $type eq 'gtf' \|\| $type eq 'gff2') {
		100	66
		50	0
		0
		0
401	1					3	$header_regex = '^#';
402	1			1		5	$parsing_method = sub { return parseGFFLine(@_,$type) };
	1					5
403	1					3	push (@parsing_arguments,$type);
404							} elsif($type =~ /bed/i) {
405	1					4	$header_regex = '^track\s';
406	1					3	$parsing_method = \&parseBedLine;
407							} elsif ($type =~ /vcf/i) {
408	1					3	$header_regex = '^#';
409	1					5	$parsing_method = \&parseVCFLine;
410							} elsif ($type =~ /chimCT/i) {
411	0					0	$header_regex = '^#';
412	0					0	$parsing_method = \&parseChimCTLine;
413							} elsif ($type =~ /SAM/i \|\| $type =~ /BAM/i) {
414	0					0	$header_regex = '^@';
415	0					0	$parsing_method = \&parseSAMLineLite;
416							} else {
417	0					0	croak "Undefined format type";
418							}
419							}
420
421							# set defaults
422							#$header_regex = '^#' if !defined $header_regex;
423	5	50		0		19	$parsing_method = sub{ return { line => shift } } if !defined $parsing_method;
	0					0
424
425							# We get a filehandle on the file
426	5					17	my $fh = getReadingFileHandle($file);
427
428							# $curr_pos will hold the SEEK_POS of the current_line
429	5					27	my $curr_pos = tell($fh);
430							# $line will hold the content of the current_line
431	5					75	my $line = <$fh>;
432
433
434							# if we need to skip lines
435	5	50				20	if(defined $skip) {
436	0					0	for(my $i = 0; $i < $skip; $i++) {
437	0					0	$curr_pos = tell($fh);
438	0					0	$line = <$fh>;
439							}
440							}
441
442							# Skip line that match a specific regex
443	5	50				18	if(defined $header_regex) {
444	5		66			96	while($line && $line =~ /$header_regex/) {
445	22					36	$curr_pos = tell($fh);
446	22					162	$line = <$fh>;
447							}
448							}
449
450							# The iterator itself (an unnamed subroutine
451							return sub {
452	39	100		39		89	if (defined $line) {
453	37					51	chomp $line;
454							# We parse the line with the appropriate methd
455	37					98	my $parsed_line = $parsing_method->($line,@parsing_arguments);
456							# Add seek pos to the output hashref
457	37					72	$parsed_line->{seek_pos} = $curr_pos;
458	37					89	$curr_pos = tell($fh);
459	37					81	$line = <$fh>; # Get next line
460	37					134	return $parsed_line;
461							} else {
462	2					33	return undef;
463							}
464	5					44	};
465							}
466
467
468							sub getReadingFileHandle {
469	11			11	1	20	my $file = shift;
470	11					18	my $fh;
471	11	50				48	if($file =~ /\.gz$/) {
472	0	0				0	open($fh,"gunzip -c $file \|") or die ("Cannot open $file");
473							} else {
474	11	50				183	open($fh,"< $file") or die ("Cannot open $file");
475							}
476	11					516	return $fh;
477							}
478
479
480							sub getWritingFileHandle {
481	0			0	1		my $file = shift;
482	0						my $fh;
483	0	0					if($file =~ /\.gz$/) {
484	0	0					open($fh,"\| gzip > $file") or die ("Cannot open $file");
485							} else {
486	0	0					open($fh,"> $file") or die ("Cannot open $file");
487							}
488	0						return $fh;
489							}
490
491
492							sub getLineFromSeekPos {
493	0			0	1		my($fh,$seek_pos) = @_;
494	0						seek($fh,$seek_pos,SEEK_SET);
495	0						my $line = <$fh>;
496	0						chomp($line);
497	0						return $line;
498							}
499
500							1;
501
502							__END__