File Coverage

blib/lib/RandomJungle/Jungle.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package RandomJungle::Jungle;
2              
3             =head1 NAME
4              
5             RandomJungle::Jungle - Consolidated interface for Random Jungle input and output data
6              
7             =cut
8              
9 4     4   113412 use strict;
  4         10  
  4         138  
10 4     4   20 use warnings;
  4         7  
  4         104  
11              
12 4     4   21 use Carp;
  4         7  
  4         256  
13 4     4   3930 use Devel::StackTrace;
  4         21958  
  4         134  
14              
15 4     4   2968 use RandomJungle::File::DB;
  0            
  0            
16             use RandomJungle::Tree;
17              
18             =head1 VERSION
19              
20             Version 0.05
21              
22             =cut
23              
24             our $VERSION = 0.05;
25             our $ERROR; # used if new() fails
26              
27             =head1 SYNOPSIS
28              
29             RandomJungle::Jungle provides a simplified interface to access Random Jungle input and output data.
30             See RandomJungle::Tree for methods relating to the classification trees produced by Random Jungle,
31             and RandomJungle::File::DB for lower-level methods that are wrapped by this module.
32              
33             use RandomJungle::Jungle;
34              
35             my $rj = RandomJungle::Jungle->new( db_file => $file ) || die $RandomJungle::Jungle::ERROR;
36             $rj->store( xml_file => $file, oob_file => $file, raw_file => $file ) || die $rj->err_str;
37             my $href = $rj->summary_data(); # for loaded data
38              
39             my $href = $rj->get_filenames; # filenames specified in store()
40             my $href = $rj->get_rj_input_params; # input params that were used when RJ was run
41             my $aref = $rj->get_variable_labels; # (expected: SEX PHENOTYPE var1 ...)
42             my $aref = $rj->get_sample_labels; # from the IID column of the RAW file
43              
44             # Returns data for the specified sample, where $label is the IID from the RAW file
45             my $href = $rj->get_sample_data_by_label( label => $label ) || warn $rj->err_str;
46              
47             my $aref = $rj->get_tree_ids;
48             my $tree = $rj->get_tree_by_id( $id ) || warn $rj->err_str; # RJ::Tree object
49              
50             # Returns hash of arefs containing lists of tree IDs, by OOB state for the sample
51             my $href = $rj->get_oob_for_sample( $label ) || warn $rj->err_str;
52              
53             # Returns the OOB state for a given sample and tree ID
54             my $val = $rj->get_oob_state( sample => $label, tree_id => $id ) || warn $rj->err_str;
55              
56             # Returns a hash of arefs containing lists of sample labels, by OOB for the tree
57             my $href = $rj->get_oob_for_tree( $tree_id ) || warn $rj->err_str;
58              
59             # Error handling
60             $rj->set_err( 'Something went boom' );
61             my $msg = $rj->err_str;
62             my $trace = $rj->err_trace;
63              
64             =cut
65              
66             #*********************************************************************
67             # Public Methods
68             #*********************************************************************
69              
70             =head1 METHODS
71              
72             =head2 new()
73              
74             Creates and returns a new RandomJungle::Jungle object:
75              
76             my $rj = RandomJungle::Jungle->new( db_file => $file ) || die $RandomJungle::Jungle::ERROR;
77              
78             The 'db_file' parameter is required.
79             Returns undef and sets $ERROR on failure.
80              
81             =cut
82              
83             sub new
84             {
85             # carps and returns undef if $args{db_file} is not defined
86             my ( $class, %args ) = @_;
87              
88             my $obj = {};
89             bless $obj, $class;
90             $obj->_init( %args ) || return; # $ERROR is set by _init()
91              
92             return $obj;
93             }
94              
95             =head2 store()
96              
97             This method loads data into the RandomJungle::File::DB database. All parameters are optional,
98             so files can be loaded in a single call or in multiple calls. Each type of file can only be
99             loaded once; subsequent calls to this method for a given file type will overwrite the
100             previously-loaded data.
101              
102             $rj->store( xml_file => $file, oob_file => $file, raw_file => $file ) || die $rj->err_str;
103              
104             Returns true on success. Sets err_str and returns false if an error occurred.
105              
106             =cut
107              
108             sub store
109             {
110             # Stores files specified by [xml_file oob_file raw_file] into the db
111             # Returns true on success, returns false and sets err_str on failure
112             my ( $self, %args ) = @_;
113              
114             my $rjdb = $self->{rjdb};
115             if( ! defined $rjdb )
116             {
117             $self->set_err( "Cannot store data without valid db connection" );
118             return;
119             }
120              
121             my %params;
122             foreach my $filetype qw( xml_file oob_file raw_file )
123             {
124             if( defined $args{$filetype} )
125             {
126             $params{$filetype} = $args{$filetype};
127             }
128             }
129              
130             $rjdb->store_data( %params ) ||
131             do
132             {
133             # need to preserve the error from the original class
134             my $err = join( "\n", $rjdb->err_str, $rjdb->err_trace );
135             $self->set_err( $err );
136             return;
137             };
138              
139             return 1;
140             }
141              
142             =head2 get_filenames()
143              
144             Returns a hash reference containing the names of the files specified in store():
145              
146             my $href = $rj->get_filenames;
147              
148             Keys in the href are db, xml, oob, and raw.
149              
150             =cut
151              
152             sub get_filenames
153             {
154             # Returns an href containing the names of the data source files in the db
155             my ( $self ) = @_;
156              
157             my $rjdb = $self->{rjdb};
158              
159             my %data = ( db => $rjdb->get_db_filename,
160             xml => $rjdb->get_xml_filename,
161             oob => $rjdb->get_oob_filename,
162             raw => $rjdb->get_raw_filename, );
163              
164             return \%data;
165             }
166              
167             =head2 get_rj_input_params()
168              
169             Returns a href of the input parameters used when Random Jungle was run:
170              
171             my $href = $rj->get_rj_input_params; # $href->{$param_name} = $param_value;
172              
173             =cut
174              
175             sub get_rj_input_params
176             {
177             # Returns a href of the input params that were used for RJ
178             my ( $self ) = @_;
179              
180             my $rjdb = $self->{rjdb};
181             my $href = $rjdb->get_rj_params;
182              
183             return $href;
184             }
185              
186             =head2 get_variable_labels()
187              
188             Returns a reference to an array that contains the variable labels from the RAW file:
189              
190             my $aref = $rj->get_variable_labels; # (expected: SEX PHENOTYPE var1 ...)
191              
192             =cut
193              
194             sub get_variable_labels
195             {
196             # Returns an aref of the variable labels from the RAW file (expected: SEX PHENOTYPE var1 ...)
197             my ( $self ) = @_;
198              
199             my $rjdb = $self->{rjdb};
200             my $aref = $rjdb->get_variable_labels;
201              
202             return $aref;
203             }
204              
205             =head2 get_sample_labels()
206              
207             Returns a reference to an array that contains the sample labels from the IID column of the RAW file:
208              
209             my $aref = $rj->get_sample_labels;
210              
211             =cut
212              
213             sub get_sample_labels
214             {
215             # Returns an aref of sample labels from the IID column of the RAW file
216             my ( $self ) = @_;
217              
218             my $rjdb = $self->{rjdb};
219             my $aref = $rjdb->get_sample_labels;
220              
221             return $aref;
222             }
223              
224             =head2 get_sample_data_by_label()
225              
226             Returns a hash ref containing data for the sample specified by label => $label, where label is
227             the IID from the RAW file. Sets err_str and returns undef if label is not specified or is invalid.
228              
229             my $href = $rj->get_sample_data_by_label( label => $label ) || warn $rj->err_str;
230              
231             $href has the following structure:
232             SEX => $val,
233             PHENOTYPE => $val,
234             orig_data => $line, (unsplit, unspliced)
235             index => $i, (index in aref from get_sample_labels(), can be used to index into OOB matrix)
236             classification_data => $aref, (can be passed to RandomJungle::Tree->classify_data)
237              
238             =cut
239              
240             sub get_sample_data_by_label
241             {
242             # Returns sample data specified by label => $label, where label is the IID from the RAW file
243             # Sets err_str and returns undef if label is not specified or is invalid
244             my ( $self, %args ) = @_;
245              
246             my $rjdb = $self->{rjdb};
247             my $href = $rjdb->get_sample_data( %args );
248              
249             if( ! defined $href )
250             {
251             # need to preserve the error from the original class
252             my $err = join( "\n", $rjdb->err_str, $rjdb->err_trace );
253             $self->set_err( $err );
254             return;
255             }
256              
257             return $href;
258             }
259              
260             =head2 get_tree_ids()
261              
262             Returns an array ref of tree IDs (sorted numerically):
263              
264             my $aref = $rj->get_tree_ids;
265              
266             =cut
267              
268             sub get_tree_ids
269             {
270             # Returns an aref of tree IDs
271             my ( $self ) = @_;
272              
273             my $rjdb = $self->{rjdb};
274             my $aref = $rjdb->get_tree_ids;
275              
276             return $aref;
277             }
278              
279             =head2 get_tree_by_id()
280              
281             Returns a RandomJungle::Tree object for the specified tree.
282              
283             my $tree = $rj->get_tree_by_id( $id ) || warn $rj->err_str;
284              
285             Sets err_str and returns undef if tree ID is undef or invalid, or if an internal error occurred.
286              
287             =cut
288              
289             sub get_tree_by_id
290             {
291             # Returns RJ::Tree object on success, sets err_str and returns undef on failure
292             # Sets err_str and returns undef if tree ID is undef or invalid, or if missing
293             # required params to ::Tree->new()
294             my ( $self, $id ) = @_;
295              
296             if( ! defined $id )
297             {
298             $self->set_err( "Tree ID is required to retrieve tree" );
299             return;
300             }
301              
302             my $rjdb = $self->{rjdb};
303              
304             my $href = $rjdb->get_tree_data( $id ); # Returns an empty href if tree ID is invalid
305              
306             if( ! exists $href->{$id} )
307             {
308             $self->set_err( "Error retrieving data for tree [$id] (may be invalid ID) - cannot create object" );
309             return;
310             }
311              
312             # get variable labels so can translate from indices to labels if desired
313             my $aref = $rjdb->get_variable_labels; # incl. sex, phenotype, and all genotypes
314              
315             # include variable labels as an optional param when creating the tree; returns undef if fails
316             my $tree = RandomJungle::Tree->new( %{ $href->{$id} }, variable_labels => $aref );
317              
318             if( ! defined $tree )
319             {
320             $self->set_err( $RandomJungle::Tree::ERROR );
321             return;
322             }
323              
324             return $tree;
325             }
326              
327             =head2 get_oob_for_sample()
328              
329             Returns lists of tree IDs, by OOB state, for the specified sample label.
330              
331             my $href = $rj->get_oob_for_sample( $label ) || warn $rj->err_str;
332              
333             The href contains the following keys, each of which point to an array reference containing tree IDs:
334             sample_used_to_construct_trees => [],
335             sample_not_used_to_construct_trees => [],
336              
337             Sets err_str and returns undef if the specified sample cannot be found (invalid label) or on error.
338              
339             =cut
340              
341             sub get_oob_for_sample
342             {
343             # Takes a sample label as single param, where label is the sample label (IID) from the RAW file.
344             # Returns an href with keys sample_used_to_construct_trees and sample_not_used_to_construct_trees,
345             # each of which are an aref of tree IDs.
346             # Sets err_str and returns undef if the specified sample cannot be found (invalid label) or on error.
347             my ( $self, $label ) = @_;
348              
349             if( ! defined $label )
350             {
351             $self->set_err( "Sample label is undefined" );
352             return;
353             }
354              
355             my $rjdb = $self->{rjdb};
356             my $tree_ids = $rjdb->get_tree_ids;
357              
358             my $line = $rjdb->get_oob_by_sample( label => $label );
359              
360             if( ! defined $line )
361             {
362             # need to preserve the error from the original class
363             my $err = join( "\n", $rjdb->err_str, $rjdb->err_trace );
364             $self->set_err( $err );
365             return;
366             }
367              
368             my @states = split( "\t", $line );
369              
370             my $oob_results = $self->_classify_oob_states( \@states, $tree_ids ) || return;
371              
372             my %results = ( sample_used_to_construct_trees => $oob_results->{in_bag},
373             sample_not_used_to_construct_trees => $oob_results->{oob} );
374              
375             return \%results;
376             }
377              
378             =head2 get_oob_state()
379              
380             Returns the OOB state for a given sample label and tree ID:
381              
382             my $val = $rj->get_oob_state( sample => $label, tree_id => $id ) || warn $rj->err_str;
383              
384             Expected values are 0 (the sample is "in bag" for the tree) or 1 (the sample is "out of bag" for the tree).
385              
386             Sets err_str and returns undef if sample or tree_id are not defined, or if sample label is invalid.
387              
388             =cut
389              
390             sub get_oob_state
391             {
392             # Returns the OOB state (expect 0 or 1) for a given sample and tree ID, specified as params
393             # using sample => $label, tree_id => $id, where label is from the IID column of the RAW file.
394             # Sets err_str and returns undef if sample or tree_id are not defined, or if sample label is invalid.
395             my ( $self, %params ) = @_;
396              
397             if( ! defined $params{sample} || ! defined $params{tree_id} )
398             {
399             $self->set_err( "sample and tree_id are required parameters to determine OOB state" );
400             return;
401             }
402              
403             my $rjdb = $self->{rjdb};
404              
405             # find tree index given ID (shouldn't assume they are equal) - removing for efficiency
406             #my @tree_ids = @{ $rjdb->get_tree_ids };
407             #my ( $tree_i ) = grep { $tree_ids[$_] == $params{tree_id} ? 1 : 0 } ( 0 .. $#tree_ids );
408             my $tree_i = $params{tree_id};
409              
410             my $line = $rjdb->get_oob_by_sample( label => $params{sample} );
411              
412             if( ! defined $line )
413             {
414             # need to preserve the error from the original class
415             my $err = join( "\n", $rjdb->err_str, $rjdb->err_trace );
416             $self->set_err( $err );
417             return;
418             }
419              
420             my $oob_state = ( split( "\t", $line ) )[ $tree_i ];
421              
422             return $oob_state;
423             }
424              
425             =head2 get_oob_for_tree()
426              
427             Returns lists of sample labels, by OOB state, for the specified tree ID.
428              
429             my $href = $rj->get_oob_for_tree( $tree_id ) || warn $rj->err_str;
430              
431             The href contains the following keys, each of which point to an array reference containing sample labels:
432             in_bag_samples => [],
433             oob_samples => [],
434              
435             Sets err_str and returns undef if the specified tree ID cannot be found (invalid) or on error.
436              
437             =cut
438              
439             sub get_oob_for_tree
440             {
441             # Takes a tree ID as single param.
442             # Returns an href with keys in_bag_samples and oob_samples
443             # each of which are an aref of sample labels (where label is from the IID column of the RAW file).
444             # Sets err_str and returns undef if the specified tree ID cannot be found (invalid) or on error.
445             my ( $self, $tree_id ) = @_;
446              
447             if( ! defined $tree_id )
448             {
449             $self->set_err( "Tree ID is undefined" );
450             return;
451             }
452              
453             my $rjdb = $self->{rjdb};
454              
455             # Validate $tree_id
456             my $href = $rjdb->get_tree_data( $tree_id );
457              
458             if( scalar keys %$href == 0 )
459             {
460             $self->set_err( "Invalid tree ID [$tree_id]" );
461             return;
462             }
463              
464             my $samples = $rjdb->get_sample_labels;
465             my @states;
466              
467             foreach my $sample ( @$samples )
468             {
469             my $state = $self->get_oob_state( sample => $sample, tree_id => $tree_id );
470             return if( ! defined $state );
471             push( @states, $state );
472             }
473              
474             my $oob_results = $self->_classify_oob_states( \@states, $samples );
475              
476             return if( ! defined $oob_results );
477              
478             my %results = ( in_bag_samples => $oob_results->{in_bag},
479             oob_samples => $oob_results->{oob} );
480              
481             return \%results;
482             }
483              
484             =head2 summary_data()
485              
486             Returns an href containing a summary of the data that is loaded into the db:
487              
488             my $href = $rj->summary_data();
489              
490             $href contains the output of other methods in this class, and it has the following structure:
491              
492             filenames => get_filenames(),
493             rj_params => get_rj_input_params(),
494             variable_labels => get_variable_labels() and see below,
495             sample_labels => get_sample_labels() and see below,
496             tree_ids => get_tree_ids() and see below,
497              
498             The keys variable_labels, sample_labels, and tree_ids all point to hrefs.
499             Each href has the following structure:
500              
501             all_labels => $aref, (for variable_labels and sample_labels)
502             all_ids => $aref, (for tree_ids only)
503             first => $val, (the first element of the all* aref)
504             last => $val, (the last element of the all* aref)
505             count => $val, (the size of the all* aref)
506              
507             =cut
508              
509             sub summary_data
510             {
511             # Returns an href containing a summary of the data that is loaded into the db
512             my ( $self ) = @_;
513              
514             my %data;
515              
516             $data{filenames} = $self->get_filenames;
517             $data{rj_params} = $self->get_rj_input_params;
518              
519             my $vars = $self->get_variable_labels;
520             $data{variable_labels}{all_labels} = $vars;
521             $data{variable_labels}{count} = scalar @$vars;
522             $data{variable_labels}{first} = $vars->[0];
523             $data{variable_labels}{last} = $vars->[-1];
524              
525             my $samples = $self->get_sample_labels;
526             $data{sample_labels}{all_labels} = $samples;
527             $data{sample_labels}{count} = scalar @$samples;
528             $data{sample_labels}{first} = $samples->[0];
529             $data{sample_labels}{last} = $samples->[-1];
530              
531             my $trees = $self->get_tree_ids;
532             $data{tree_ids}{all_ids} = $trees;
533             $data{tree_ids}{count} = scalar @$trees;
534             $data{tree_ids}{first} = $trees->[0];
535             $data{tree_ids}{last} = $trees->[-1];
536              
537             return \%data;
538             }
539              
540             =head2 set_err()
541              
542             Sets the error message (provided as a parameter) and creates a stack trace:
543              
544             $rj->set_err( 'Something went boom' );
545              
546             =cut
547              
548             sub set_err
549             {
550             my ( $self, $errstr ) = @_;
551              
552             $self->{err_str} = $errstr || '';
553             $self->{err_trace} = Devel::StackTrace->new;
554             }
555              
556             =head2 err_str()
557              
558             Returns the last error message that was set:
559              
560             my $msg = $rj->err_str;
561              
562             =cut
563              
564             sub err_str
565             {
566             my ( $self ) = @_;
567              
568             return $self->{err_str};
569             }
570              
571             =head2 err_trace()
572              
573             Returns a backtrace for the last error that was encountered:
574              
575             my $trace = $rj->err_trace;
576              
577             =cut
578              
579             sub err_trace
580             {
581             my ( $self ) = @_;
582              
583             return $self->{err_trace}->as_string;
584             }
585              
586             #*********************************************************************
587             # Private Methods and Routines
588             #*********************************************************************
589              
590             sub _init
591             {
592             # Sets $ERROR and returns undef if $args{db_file} is not defined
593             my ( $self, %args ) = @_;
594              
595             @{ $self->{params} }{ keys %args } = values %args;
596              
597             if( ! defined $args{db_file} )
598             {
599             $ERROR = "Cannot create new object - db_file is a required parameter";
600             return;
601             }
602              
603             my $rjdb = RandomJungle::File::DB->new( db_file => $args{db_file} );
604              
605             if( ! defined $rjdb )
606             {
607             $self->set_err( $RandomJungle::File::DB::ERROR );
608             return;
609             }
610              
611             $self->{rjdb} = $rjdb;
612             }
613              
614             sub _classify_oob_states
615             {
616             # Takes aref to OOB states and aref to labels (e.g., tree IDs or sample labels)
617             # Translates the OOB state at each position into 'in_bag' and 'oob'.
618             # Returns an href with those categories as keys, which point to arefs containing the
619             # labels in each category. This can be used to find, for a given sample, which trees
620             # the sample was used to construct. It can also be used to find which samples were
621             # in/out of bag for a given tree.
622             # Sets err_str and returns undef if @states and @labels are different sizes, or
623             # if a state is invalid.
624             my ( $self, $states, $labels ) = @_;
625              
626             if( scalar @$states != scalar @$labels )
627             {
628             $self->set_err( "Warning: number of OOB states does not equal the number of labels provided" );
629             return;
630             }
631              
632             my ( @used, @notused );
633              
634             foreach my $i ( 0 .. scalar @$states - 1 )
635             {
636             if( $states->[$i] eq '0' )
637             {
638             push( @used, $labels->[$i] ); # in bag
639             }
640             elsif( $states->[$i] eq '1' )
641             {
642             push( @notused, $labels->[$i] ); # OOB
643             }
644             else
645             {
646             my $val = $states->[$i];
647             $self->set_err( "Warning: unrecognized OOB state [$val] for label $labels->[$i]" );
648             return;
649             }
650             }
651              
652             my $results = { in_bag => \@used, oob => \@notused };
653              
654             return $results;
655             }
656              
657             =head1 SEE ALSO
658              
659             RandomJungle::Jungle, RandomJungle::Tree, RandomJungle::Tree::Node,
660             RandomJungle::XML, RandomJungle::OOB, RandomJungle::RAW,
661             RandomJungle::DB, RandomJungle::Classification_DB
662              
663             =head1 AUTHOR
664              
665             Robert R. Freimuth
666              
667             =head1 COPYRIGHT
668              
669             Copyright (c) 2011 Mayo Foundation for Medical Education and Research. All rights reserved.
670              
671             This program is free software; you can redistribute it and/or modify
672             it under the same terms as Perl itself.
673              
674             The full text of the license can be found in the
675             LICENSE file included with this module.
676              
677             =cut
678              
679             #*********************************************************************
680             # Guts
681             #*********************************************************************
682              
683             =begin guts
684              
685             $self
686             params
687             db_file => $filename
688             => $val
689             rjdb => $RJ_DB object
690             err_str => $errstr
691             err_trace => Devel::StackTrace object
692              
693             =cut
694              
695             1;
696