File Coverage

blib/lib/Bio/ToolBox/utility.pm

Criterion	Covered	Total	%
statement	12	89	13.4
branch	0	28	0.0
condition	0	5	0.0
subroutine	4	11	36.3
pod	7	7	100.0
total	23	140	16.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::utility;
2							our $VERSION = '1.68';
3
4							=head1 NAME
5
6							Bio::ToolBox::utility - common utility functions for Bio::ToolBox
7
8							=head1 DESCRIPTION
9
10							These are general subroutines that don't fit in with the other modules.
11
12							=head1 REGULAR SUBROUTINES
13
14							The following subroutines are automatically exported when you use this module.
15
16							=over 4
17
18							=item parse_list
19
20							my $index_request = '1,2,5-7';
21							my @indices = parse_list($index_request); # returns [1,2,5,6,7]
22
23							This subroutine parses a scalar value into a list of values. The scalar is
24							a text string of numbers (usually column or dataset indices) delimited by
25							commas and/or including a range. For example, a string "1,2,5-7" would become
26							an array of [1,2,5,6,7].
27
28							Pass the module the scalar string.
29
30							It will return the array of numbers.
31
32							=item format_with_commas
33
34							my $count = '4327908475';
35							printf " The final count was %s\n", format_with_commas($count);
36
37							This subroutine process a large number (e.g. 4327908475) into a human-friendly
38							version with commas delimiting the thousands (4,327,908,475).
39
40							Pass the module a scalar string with a number value.
41
42							It will return a scalar value containing the formatted number.
43
44							=item ask_user_for_index
45
46							my @answers = ask_user_for_index($Data, 'Please enter 2 or more columns ');
47
48							This subroutine will present the list of column names from a L
49							structure along with their numeric indexes to the user and prompt for one
50							or more to be selected and entered. The function is smart enough to only print
51							the list once (if it hasn't changed) so as not to annoy the user with repeated
52							lists of header names when used more than once. A text prompt should be provided,
53							or a generic one is used. The list of indices are validated, and a warning printed
54							for invalid responses. The responses are then returned as a single value or array,
55							depending on context.
56
57							=item simplify_dataset_name
58
59							my $simple_name = simplify_dataset_name($dataset);
60
61							This subroutine will take a dataset name and simplify it. Dataset names may
62							often be file names of data files, such as Bam and bigWig files. These may
63							include a C, C, or C prefix, one or more directory paths,
64							and one or more file name extensions. Additionally, more than one dataset
65							may be combined, for example two stranded bigWig files, with an ampersand.
66							This function will safely remove the prefix, directories, and everything after
67							the first period.
68
69							=back
70
71							=head1 LEGACY SUBROUTINES
72
73							These are additional functions that can be optionally exported. These provide
74							accessibility to the L functions that might be needed
75							for old scripts that do not implement L objects. You normally
76							should not need these. If you import these, be sure to import the ones above
77							if you need those too.
78
79							=over 4
80
81							=item open_to_read_fh
82
83							Wrapper around L.
84							Opens a file as an L read only object. Transparently handles gzip
85							and bzip2 compression.
86
87							=item open_to_write_fh
88
89							my $fh = open_to_write_fh($file, $gz, $append);
90
91							Wrapper around L.
92							Opens a file as an L write only object. Pass the file name as the option.
93							Optionally provide a boolean value if you want the file to be written as a compressed
94							gzip file. Pass another boolean value if you want to append to an existing file;
95							otherwise an existing file with the same name will be overwritten!
96
97							=item check_file
98
99							Wrapper around the L method.
100							Checks to see if a file exists. If not, some common missing extensions are appended
101							and then existence is re-checked. If a file is found, the name is returned so that
102							it could be opened. Useful, for example, if you forget the F<.txt> or F<.gz> extensions.
103
104							=back
105
106							=cut
107
108	3			3		22	use strict;
	3					5
	3					127
109	3			3		19	use Carp;
	3					6
	3					163
110	3			3		19	use File::Spec;
	3					13
	3					86
111							require Exporter;
112	3			3		15	use Bio::ToolBox::Data::file;
	3					6
	3					4102
113
114
115							### Variables
116							# Export
117							our @ISA = qw(Exporter);
118							our @EXPORT = qw(
119							parse_list
120							format_with_commas
121							ask_user_for_index
122							simplify_dataset_name
123							);
124							our @EXPORT_OK = qw(
125							open_to_read_fh
126							open_to_write_fh
127							check_file
128							);
129							our $DATA_COLNAMES = undef;
130							our $DATA_FILENAME = undef;
131
132							### The True Statement
133							1;
134
135
136
137							################# The Subroutines ###################
138
139							### Parse string into list
140							sub parse_list {
141							# this subroutine will parse a string into an array
142							# it is designed for a string of numbers delimited by commas
143							# a range of numbers may be specified using a dash
144							# hence 1,2,5-7 would become an array of 1,2,5,6,7
145
146	0			0	1		my $string = shift;
147	0	0					return unless defined $string;
148	0	0					if ($string =~ /[^\d,\-\s\&]/) {
149	0						carp " the string contains characters that can't be parsed\n";
150	0						return;
151							}
152	0						my @list;
153	0						foreach (split /[,\s+]/, $string) {
154							# check for a range
155	0	0					if (/\-/) {
156	0						my ($start, $stop) = split /\-/;
157							# add each item in the range to the list
158	0						for (my $i = $start; $i <= $stop; $i++) {
159	0						push @list, $i;
160							}
161	0						next;
162							}
163							else {
164							# either an ordinary number or an "&"ed list of numbers
165	0						push @list, $_;
166							}
167							}
168	0						return @list;
169							}
170
171
172
173							### Format a number into readable comma-delimited by thousands number
174							sub format_with_commas {
175							# for formatting a number with commas
176	0			0	1		my $number = shift;
177	0	0					if ($number =~ /[^\d,\-\.]/) {
178	0						carp " the string contains characters that can't be parsed\n";
179	0						return $number;
180							}
181
182							# check for decimals
183	0						my ($integers, $decimals);
184	0	0					if ($number =~ /^\-?(\d+)\.(\d+)$/) {
185	0						$integers = $1;
186	0						$decimals = $2;
187							}
188							else {
189	0						$integers = $number;
190							}
191
192							# format
193	0						my @digits = split //, $integers;
194	0						my @formatted;
195	0						while (@digits) {
196	0	0					if (@digits > 3) {
197	0						unshift @formatted, pop @digits;
198	0						unshift @formatted, pop @digits;
199	0						unshift @formatted, pop @digits;
200	0						unshift @formatted, ',';
201							}
202							else {
203	0						while (@digits) {
204	0						unshift @formatted, pop @digits;
205							}
206							}
207							}
208
209							# finished
210	0						my $final = join("", @formatted);
211	0	0					$final .= $decimals if defined $decimals;
212	0						return $final;
213							}
214
215
216							sub ask_user_for_index {
217	0			0	1		my $Data = shift;
218	0		0				my $line = shift \|\| ' Enter the desired column index ';
219	0	0					unless (ref($Data) =~ /Bio::ToolBox::Data/) {
220	0						carp "Must pass a Bio::ToolBox::Data object!\n";
221	0						return;
222							}
223
224							# print column header names only if we have not done so before
225	0	0	0				unless (
226							# we use filename and column number as indicators
227							$Data->filename eq $DATA_FILENAME and
228							join(";", $Data->list_columns) eq $DATA_COLNAMES
229							) {
230	0						print " These are the columns in the file\n";
231	0						my $i = 0;
232	0						foreach ($Data->list_columns) {
233	0						print " $i\t$_\n";
234	0						$i++;
235							}
236							# remember for next time
237	0						$DATA_FILENAME = $Data->filename;
238	0						$DATA_COLNAMES = join(";", $Data->list_columns);
239							}
240	0						print $line;
241
242							# get response
243	0						my $response = ;
244	0						chomp $response;
245	0						my @indices = parse_list($response);
246
247							# verify
248	0						my @good;
249	0						foreach (@indices) {
250	0	0					if ($Data->name($_)) {
251	0						push @good, $_;
252							}
253							else {
254	0						print " $_ is not a valid index!\n";
255							}
256							}
257	0	0					return wantarray ? @good : $good[0];
258							}
259
260
261							sub simplify_dataset_name {
262	0			0	1		my $dataset = shift;
263	0						my $new_name;
264
265							# strip any file prefix
266	0						$dataset =~ s/^(?:file\|http\|ftp):\/*//;
267
268	0	0					if ($dataset =~ /&/) {
		0
269							# a combination dataset
270	0						foreach (split /&/, $dataset) {
271	0						my $n = simplify_dataset_name($_);
272	0	0					if ($new_name) {
273	0						$new_name .= '&' . $n;
274							}
275							else {
276	0						$new_name = $n;
277							}
278							}
279							}
280							elsif ($dataset =~ m\|/\|) {
281							# appears to have paths
282	0						my (undef, undef, $file_name) = File::Spec->splitpath($dataset);
283	0						$file_name =~ s/^([\w\d\-\_]+)\..+$/$1/i; # take everything up to first .
284	0						$new_name = $file_name;
285							}
286							else {
287							# strip everything after first period, like above
288	0						$dataset =~ s/^([\w\d\-\_]+)\..+$/$1/i; # take everything up to first .
289	0						$new_name = $dataset;
290							}
291	0						return $new_name;
292							}
293
294
295							sub open_to_read_fh {
296	0			0	1		return Bio::ToolBox::Data::file->open_to_read_fh(@_);
297							}
298
299
300							sub open_to_write_fh {
301	0			0	1		return Bio::ToolBox::Data::file->open_to_write_fh(@_);
302							}
303
304							sub check_file {
305	0			0	1		return Bio::ToolBox::Data::file->check_file(@_);
306							}
307
308
309							__END__