File Coverage

blib/lib/Bio/ToolBox/utility.pm
Criterion Covered Total %
statement 110 141 78.0
branch 37 52 71.1
condition 1 8 12.5
subroutine 7 8 87.5
pod 5 5 100.0
total 160 214 74.7


line stmt bran cond sub pod time code
1             package Bio::ToolBox::utility;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::utility - common utility functions for Bio::ToolBox
7              
8             =head1 DESCRIPTION
9              
10             These are general subroutines that don't fit in with the other modules.
11              
12             =head1 REGULAR SUBROUTINES
13              
14             The following subroutines are automatically exported when you use this module.
15              
16             =over 4
17              
18             =item parse_list
19              
20             my $index_request = '1,2,5-7';
21             my @indices = parse_list($index_request); # returns [1,2,5,6,7]
22              
23             This subroutine parses a scalar value into a list of values. The scalar is
24             a text string of numbers (usually column or dataset indices) delimited by
25             commas and/or including a range. For example, a string "1,2,5-7" would become
26             an array of [1,2,5,6,7].
27              
28             Pass the module the scalar string.
29              
30             It will return the array of numbers.
31              
32             =item format_with_commas
33              
34             my $count = '4327908475';
35             printf " The final count was %s\n", format_with_commas($count);
36              
37             This subroutine processes a large number (e.g. 4327908475) into a human-friendly
38             version with commas delimiting the thousands (4,327,908,475).
39              
40             Pass the module a scalar string with a number value.
41              
42             It will return a scalar value containing the formatted number.
43              
44             =item ask_user_for_index
45              
46             my @answers = ask_user_for_index($Data, 'Please enter 2 or more columns ');
47              
48             This subroutine will present the list of column names from a L<Bio::ToolBox::Data>
49             structure along with their numeric indexes to the user and prompt for one
50             or more to be selected and entered. The function is smart enough to only print
51             the list once (if it hasn't changed) so as not to annoy the user with repeated
52             lists of header names when used more than once. A text prompt should be provided,
53             or a generic one is used. The list of indices is validated, and a warning is printed
54             for invalid responses. The responses are then returned as a single value or array,
55             depending on context.
56              
57             =item simplify_dataset_name
58              
59             my $simple_name = simplify_dataset_name($dataset);
60              
61             This subroutine will take a dataset name and simplify it. Dataset names may
62             often be file names of data files, such as Bam and bigWig files. These may
63             include a C<file:>, C<http:>, or C<ftp:> prefix, one or more directory paths,
64             and one or more file name extensions. Additionally, more than one dataset
65             may be combined, for example two stranded bigWig files, with an ampersand.
66             This function will safely remove the prefix, directories, and everything after
67             the first period.
68              
69             =item sane_chromo_sort
70              
71             my @chromo = $db->seq_ids;
72             my @sorted = sane_chromo_sort(@chromo);
73              
74             This subroutine will take a list of chromosome or sequence identifiers and sort
75             them into a reasonably sane order: standard numeric identifiers first (numeric
76             order), sex chromosomes (alphabetical), mitochondrial, names with text and
77             numbers (text first alphabetically, then numbers numerically) for contigs and
78             such, and finally anything else (asciibetically). Any 'chr' prefix is ignored.
79             Roman numerals are properly handled numerically.
80              
81             The provided list may be a list of SCALAR values (chromosome names) or ARRAY
82             references, with the first element assumed to be the name, e.g.
83             C<[$name, $length]>.
84              
85             =back
86              
87             =cut
88              
89 4     4   658 use strict;
  4         7  
  4         94  
90 4     4   14 use Carp;
  4         7  
  4         148  
91 4     4   23 use File::Spec;
  4         5  
  4         5568  
92             require Exporter;
93              
94              
95             ### Variables
96             # Export
97             our @ISA = qw(Exporter);
98             our @EXPORT = qw(
99             parse_list
100             format_with_commas
101             ask_user_for_index
102             simplify_dataset_name
103             sane_chromo_sort
104             );
105             our $DATA_COLNAMES = undef;
106             our $DATA_FILENAME = undef;
107              
108             ### The True Statement
109             1;
110              
111              
112              
113             ################# The Subroutines ###################
114              
115             ### Parse string into list
116             sub parse_list {
117             # this subroutine will parse a string into an array
118             # it is designed for a string of numbers delimited by commas
119             # a range of numbers may be specified using a dash
120             # hence 1,2,5-7 would become an array of 1,2,5,6,7
121            
122 2     2 1 984 my $string = shift;
123 2 50       7 return unless defined $string;
124 2 50       6 if ($string =~ /[^\d,\-\s\&]/) {
125 0         0 carp " the string contains characters that can't be parsed\n";
126 0         0 return;
127             }
128 2         3 my @list;
129 2         9 foreach (split /[,\s+]/, $string) {
130             # check for a range
131 5 100       10 if (/\-/) {
132 2         5 my ($start, $stop) = split /\-/;
133             # add each item in the range to the list
134 2         6 for (my $i = $start; $i <= $stop; $i++) {
135 13         19 push @list, $i;
136             }
137 2         3 next;
138             }
139             else {
140             # either an ordinary number or an "&"ed list of numbers
141 3         4 push @list, $_;
142             }
143             }
144 2         8 return @list;
145             }
146              
147              
148              
149             ### Format a number into readable comma-delimited by thousands number
150             sub format_with_commas {
151             # for formatting a number with commas
152 3     3 1 1459 my $number = shift;
153            
154             # check number
155 3         4 my ($integers, $decimals, $sign);
156 3 100       28 if ($number =~ /^(\-)?(\d+)\.(\d+)$/) {
    50          
157 2         5 $sign = $1;
158 2         3 $integers = $2;
159 2         4 $decimals = $3;
160             }
161             elsif ($number =~ /^(\-)?(\d+)$/) {
162 1         3 $sign = $1;
163 1         2 $integers = $2;
164             }
165             else {
166 0         0 carp " the string contains characters that can't be parsed\n";
167 0         0 return $number;
168             }
169            
170             # format
171 3         10 my @digits = split //, $integers;
172 3         3 my @formatted;
173 3         7 while (@digits) {
174 10 100       15 if (@digits > 3) {
175 7         12 unshift @formatted, pop @digits;
176 7         10 unshift @formatted, pop @digits;
177 7         9 unshift @formatted, pop @digits;
178 7         10 unshift @formatted, ',';
179             }
180             else {
181 3         6 while (@digits) {
182 4         9 unshift @formatted, pop @digits;
183             }
184             }
185             }
186            
187             # finished
188 3 100       4 my $final = $sign ? $sign : '';
189 3         7 $final .= join("", @formatted);
190 3 100       7 $final .= '.' . $decimals if defined $decimals;
191 3         15 return $final;
192             }
193              
194              
195             sub ask_user_for_index {
196 0     0 1 0 my $Data = shift;
197 0   0     0 my $line = shift || ' Enter the desired column index ';
198 0 0       0 unless (ref($Data) =~ /Bio::ToolBox::Data/) {
199 0         0 carp "Must pass a Bio::ToolBox::Data object!\n";
200 0         0 return;
201             }
202            
203             # print column header names only if we have not done so before
204 0 0 0     0 unless (
205             # we use filename and column number as indicators
206             $Data->filename eq $DATA_FILENAME and
207             join(";", $Data->list_columns) eq $DATA_COLNAMES
208             ) {
209 0         0 print " These are the columns in the file\n";
210 0         0 my $i = 0;
211 0         0 foreach ($Data->list_columns) {
212 0         0 print " $i\t$_\n";
213 0         0 $i++;
214             }
215             # remember for next time
216 0         0 $DATA_FILENAME = $Data->filename;
217 0         0 $DATA_COLNAMES = join(";", $Data->list_columns);
218             }
219 0         0 print $line;
220            
221             # get response
222 0         0 my $response = <STDIN>;
223 0         0 chomp $response;
224 0         0 my @indices = parse_list($response);
225            
226             # verify
227 0         0 my @good;
228 0         0 foreach (@indices) {
229 0 0       0 if ($Data->name($_)) {
230 0         0 push @good, $_;
231             }
232             else {
233 0         0 print " $_ is not a valid index!\n";
234             }
235             }
236 0 0       0 return wantarray ? @good : $good[0];
237             }
238              
239              
240             sub simplify_dataset_name {
241 14     14 1 4665 my $dataset = shift;
242 14         17 my $new_name;
243            
244             # strip any file prefix
245 14         39 $dataset =~ s/^(?:file|http|ftp):\/*//;
246            
247 14 100       30 if ($dataset =~ /&/) {
248             # a combination dataset
249 2         5 foreach (split /&/, $dataset) {
250 4         9 my $n = simplify_dataset_name($_);
251 4 100       10 if ($new_name) {
252 2         4 $new_name .= '&' . $n;
253             }
254             else {
255 2         3 $new_name = $n;
256             }
257             }
258             }
259             else {
260             # a single dataset
261             # this could be either a file name or an entry in a BioPerl or BigWigSet database
262             # remove any possible paths
263 12         95 (undef, undef, $new_name) = File::Spec->splitpath($dataset);
264            
265             # remove any known file extensions
266 12         44 $new_name =~ s/\.(?:bw|bam|bb|useq|bigwig|bigbed|g[tf]f3?|cram|wig|bdg|bedgraph)(?:\.gz)?$//i;
267            
268             # remove common non-useful stuff
269             # trying to imagine all sorts of possible things
270 12         27 $new_name =~ s/[_\.\-](?:sort|sorted|dedup|dedupe|deduplicated|rmdup|mkdup|markdup|dup|unique|filt|filtered)\b//gi;
271 12         23 $new_name =~ s/[_\.\-](?:coverage|rpm|ext\d*|extend\d*|log2fe|log\d+|qvalue|fragment|count|lambda_control|fe|fold.?enrichment|ratio|log\d*ratio)\b//gi;
272             }
273 14         24 return $new_name;
274             }
275              
276              
277             sub sane_chromo_sort {
278 4     4 1 1496 my @chroms = @_;
279 4 50       11 return unless scalar @chroms;
280            
281             # let's try and sort in some kind of rational order
282 4         17 my @numeric;
283             my @romanic;
284 4         0 my @mixed;
285 4         0 my @alphic;
286 4         0 my @sex;
287 4         0 my @mito;
288 4         7 foreach my $c (@chroms) {
289            
290 42         43 my $name;
291 42 50       51 if (ref($c) eq 'ARRAY') {
292 0         0 $name = $c->[0];
293             }
294             else {
295 42         39 $name = $c;
296             }
297            
298             # identify the type of chromosome name to sort
299 42 100       164 if ($name =~ /^(?:chr)?([wxyz])$/i) {
    100          
    100          
    100          
    100          
300             # sex chromosomes
301 4         10 push @sex, [$1, $c];
302             }
303             elsif ($name =~ /^(?:chr)?(?:m|mt|mito)(?:dna)?$/i) {
304             # mitochondrial
305 3         6 push @mito, [$name, $c];
306             }
307             elsif ($name =~ /^(?:chr)?(\d+)$/i) {
308             # standard numeric chromosome
309 22         46 push @numeric, [$1, $c];
310             }
311             elsif ($name =~ /^(?:chr)?([IVX]+)$/) {
312             # Roman numerals - silly Saccharomyces cerevisiae
313 7         19 push @romanic, [$1, $c];
314             }
315             elsif ($name =~ /^([a-zA-Z_\-\.]+)(\d+)/) {
316             # presumed contigs and such?
317 5         12 push @mixed, [$1, $2, $name, $c];
318             }
319             else {
320             # everything else
321 1         4 push @alphic, [$name, $c];
322             }
323             }
324            
325             # check romanic
326 4 100       9 if (scalar @romanic) {
327             # looks like we have romanic chromosomes
328 2 100       6 if (scalar @sex) {
329             # probably caught up chrX, unlikely WYZ
330 1         3 my @x = grep { $sex[$_]->[0] =~ m/^X$/ } (0 .. $#sex);
  2         6  
331 1         3 foreach (reverse @x) {
332             # I'm assuming and hoping there's only one chrX found
333             # but reverse the list, just in case - assuming grep returns in order
334 1         3 push @romanic, ( splice(@sex, $_, 1) );
335             }
336             }
337 2 50       5 if (scalar @numeric) {
338             # well, shoot, this is weird, mix of both numeric and romanic chromosomes?
339             # just merge romanic with alphic and hope for the best
340 0         0 push @alphic, @romanic;
341             }
342             else {
343             # convert the romanic to numeric
344 2         5 while (@romanic) {
345 8         15 my $r = shift @romanic;
346 8         11 my $c = $r->[0];
347 8         12 $c =~ s/IV/4/;
348 8         13 $c =~ s/IX/9/;
349 8         12 $c =~ s/V/5/;
350 8         12 $c =~ s/I/1/g;
351 8         12 my $n = 0;
352 8         15 foreach (split q(), $c) {
353 15 100       22 if ($_ eq 'X') {
354 2         3 $n += 10;
355             }
356             else {
357 13         16 $n += $_;
358             }
359             }
360 8         44 push @numeric, [$n, $r->[1]];
361             }
362             }
363             }
364            
365             # sort
366 4         5 my @sorted;
367 4         16 push @sorted, map { $_->[1] } sort {$a->[0] <=> $b->[0]} @numeric;
  30         40  
  66         68  
368 4         9 push @sorted, map { $_->[1] } sort {$a->[0] cmp $b->[0]} @sex;
  3         5  
  1         3  
369 4         6 push @sorted, map { $_->[1] } sort {$a->[0] cmp $b->[0]} @mito;
  3         7  
  0         0  
370 5         7 push @sorted, map { $_->[3] } sort {
371 4 50 33     7 $a->[0] cmp $b->[0] or
  4         18  
372             $a->[1] <=> $b->[1] or
373             $a->[2] cmp $b->[2]
374             } @mixed;
375 4         7 push @sorted, map { $_->[1] } sort { $a->[0] cmp $b->[0] } @alphic;
  1         1  
  0         0  
376            
377 4         22 return @sorted;
378             }
379              
380              
381             __END__