File Coverage

blib/lib/Microarray/CdtDataset.pm

Criterion	Covered	Total	%
statement	4	6	66.6
branch			n/a
condition			n/a
subroutine	2	2	100.0
pod			n/a
total	6	8	75.0

line	stmt	sub	time	code
1				package Microarray::CdtDataset;
2
3				# License information (the MIT license)
4
5				# Copyright (c) 2003 Christian Rees, Janos Demeter, John Matese, Gavin
6				# Sherlock; Stanford University
7
8				# Permission is hereby granted, free of charge, to any person
9				# obtaining a copy of this software and associated documentation files
10				# (the "Software"), to deal in the Software without restriction,
11				# including without limitation the rights to use, copy, modify, merge,
12				# publish, distribute, sublicense, and/or sell copies of the Software,
13				# and to permit persons to whom the Software is furnished to do so,
14				# subject to the following conditions:
15
16				# The above copyright notice and this permission notice shall be
17				# included in all copies or substantial portions of the Software.
18
19				# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20				# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21				# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22				# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23				# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24				# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25				# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26				# SOFTWARE.
27
28
29				# Author : Gavin Sherlock (based on Christian Rees' dataset object)
30				# Date : 14th August 2002
31
32				# Re-visited by John Matese, under auspices of the GMOD project
33				# Date : 28th May 2003
34
35				# Description: This package implements an object that serves as an
36				# abstraction to a cdtDataset. It is different than the
37				# Microarray::DataMatrix::CdtFile abstraction, because it deals with
38				# the cdtFile in the context of gtr and/or atr files. It also
39				# provides methods by which the geneXplorer program can interact with
40				# a cdtDataset.
41				# The essential purpose of CdtDataset's initialization functions is to
42				# de-construct the .cdt file into its constituent data parts of the
43				# dataset:
44				# 1) the data matrix (.data_matrix)
45				# 2) the bioassay names or slidenames (.expt_info)
46				# 3) the annotations of the spotted features/reporters/sequences
47				# (.feature_info)
48				# 4) any additional meta information about the set (.meta)
49				# 5) additionally, it computes or creates the following:
50				# a) a binary file containing a list of feature-feature
51				# correlations (.binCor)
52				# b) a 2-color image representation of the data matrix
53				# (.data_matrix.png)
54				# c) a image representation of the expt_info file
55				# (.expt_info.png)
56				#
57				# Known Issues: There are good reasons to add additional meta data to
58				# a dataset, including possibly the organism of the set or the
59				# location of the default display configuration file to display the
60				# .feature_info. These would probably have to be called in the constructor.
61				#
62				#
63				# Note: there are many vestiges of code left over from previous
64				# developers. These could be viewed as either an emergent/embryonic
65				# API or alternatively as vestigial limbs needing to be lopped off.
66				# i.e. the need for these methods/accessors existed at some time in
67				# the past, and may indeed be resurrected in the future. Currently,
68				# they are not supported.
69				#
70				#
71				# Future Plans: Currently, only the .cdt file of a clustered dataset
72				# is utilized. In the future, the other data files detailing the
73				# clustering [gene tree(.gtr) and array tree(.atr)] should be
74				# utilized, and DatasetImageMaker should export suitable image
75				# representations for these files. Furthermore, It would be great to
76				# pull general dataset methods from this class into a future class,
77				# Microarray::Dataset. That way, you could make a MageMLDataset class
78				# as well, and still keep many of the general class attributes/methods
79				# in the same locations. Microarray:Dataset would inherit constructor
80				# methods (i.e. knowledge of the file structure) from either
81				# CdtDataset orMageMLDataset at initialization (perhaps a run-time ISA
82				# declaration within the constructor). Otherwise, I don't see a huge
83				# advantage to having these specialized (and somewhat misnamed)
84				# classes, in the sense that Dataset only need to know how to parse
85				# the initialization file while convertind a new dataset
86
87
88
89	1	1	8277	use strict;
	1		2
	1		35
90	1	1	1741	use GD;
	0
	0
91				use File::Basename;
92
93				use Microarray::Config;
94				use Microarray::DatasetImageMaker;
95				use Microarray::Utilities::Filesystem qw(DirectoryIsValid EnsureTrailingSlash);
96				use Microarray::DataMatrix::CdtFile;
97				use Microarray::DataMatrix::PclFile;
98
99
100				my $dbg = 0;
101
102				my $VERSION = "0.1";
103
104				my $PACKAGE = 'Microarray::CdtDataset';
105
106				my $kCdtBase = $PACKAGE.'::__cdtBase';
107				my $kCdtPath = $PACKAGE.'::__cdtPath';
108
109
110				my $kName = $PACKAGE.'::__name'; # the full qualified name of the dataset
111				my $kFileBaseName = $PACKAGE.'::__fileBaseName'; # the stem of the file names (for dataset)
112
113				my $kDataPath = $PACKAGE.'::__dataPath';
114				my $kImagePath = $PACKAGE.'::__imagePath';
115				my $kContrast = $PACKAGE.'::__contrast';
116				my $kShouldInitialize = $PACKAGE.'::__shouldInitialize';
117				my $kCdtFileObject = $PACKAGE.'::__cdtFileObject';
118				my $kCdtFileName = $PACKAGE.'::__cdtFileName';
119				my $kHeight = $PACKAGE.'::__height';
120				my $kWidth = $PACKAGE.'::__width';
121				my $kColorScheme = $PACKAGE.'::__colorscheme';
122				my $kConfig = $PACKAGE.'::__config';
123				my $kCorrCutoff = $PACKAGE.'::__corrCutoff';
124
125				my $kDefaultContrast = 4;
126				my $kDefaultInitialization = 0;
127				my $kDefaultCorrCutoff = 0.5;
128				my $kDefaultColorScheme = 'yb'; # yellow/blue
129
130				my $kMinCorrCutoff = 0.2;
131
132				my $kImgType = Microarray::Config->ImageType;
133
134				my @metaColumns = ($kWidth, $kHeight, $kContrast, $kColorScheme, $kCorrCutoff);
135
136				my %kColorSchemeTranslation = Microarray::Config->ColorSchemeTranslationHash;
137
138				my $kCdtSuffix = '.cdt';
139				my $kMetaSuffix = '.meta';
140				my $kLockSuffix = '.lock';
141				my $kFeatureSuffix = '.feature_info';
142				my $kMatrixSuffix = '.data_matrix';
143				my $kExptInfoSuffix = '.expt_info';
144				my $kPclSuffix = '.pcl';
145				my $kStdCorrSuffix = '.stdCor';
146				my $kBinCorrSuffix = '.binCor';
147
148				my $kInfoGifSuffix = $kExptInfoSuffix.'.'.$kImgType;
149				my $kMatrixGifSuffix = $kMatrixSuffix.'.'.$kImgType;
150
151				my @kRequiredFileSuffixes = (
152				# cdt not required, unless we start copying it there
153				# $kCdtSuffix,
154				$kMetaSuffix,
155				$kFeatureSuffix,
156				$kMatrixSuffix,
157				$kExptInfoSuffix,
158				$kBinCorrSuffix
159				# images may reside in a different directory than the data, should fix this
160				# $kInfoGifSuffix,
161				# $kMatrixGifSuffix
162				);
163
164
165				####################################################################
166				sub new {
167				####################################################################
168				# This is the constructor. There are two modes in which the
169				# constructor can be used. In one mode, it will create various files
170				# which support the dataset, using the cdt, (and hopefully in the
171				# future, gtr and atr files). In the second mode, it will assume that
172				# these files already exist and just return the constructed objevt.
173				# Thus when a dataset is first created, there will be the overhead of
174				# creating the additional files, but subsequent creation of a
175				# cdtDataset object will not have that overhead. The constructor
176				# takes the following arguments:
177				#
178				# name : The fully qualified name of the dataset (slash/delimited),
179				# which encodes the location and stem of the files,
180				# without any extensions, and with no path
181				# information. If the 'initialize' argument is set
182				# (see below), a directory tructure of the same name
183				# will also be created to contain the exported data
184				# files.
185				#
186				# datapath : This required path prefix is where any newly created data
187				# files should be placed (or read from).
188				#
189				# imagepath : An optional path prefix where any newly created image files
190				# should be placed (or read from). Will default to
191				# datapath if none is specified.
192				#
193				# contrast : If a dataset is being instantiated for the first
194				# time, then a contrast is needed for image
195				# generation. If no contrast is provided, then a
196				# default value of 4 will be used. As the data are
197				# expected to be in log base 2, this corresponds to a
198				# 16-fold change as the maximum color in any image.
199				#
200				# colorscheme : Can either be 'red/green' (the default if none is
201				# specified) or 'yellow/blue'
202				#
203				# corrcutoff : If a dataset is initiated for the first time, correlation
204				# values are generated for each feature-pair and values
205				# above the cutoff are saved in a binary .binCor file
206				#
207				# initialize : A filepath of the originating .cdt file indicate
208				# whether to initialize all the required supporting
209				# files that a cdtDataset needs. This defaults to 0
210				# (assumes that the necessary supporting files already
211				# exist. If it is a filepath, then the dataset is
212				# initialized using it
213				#
214				#
215				# Note that if you supply a contrast, you must set initialize to 1, as
216				# a contrast is useless in the absence of initialization. Both the
217				# 'dataset' and 'path' arguments are absolutely required.
218				#
219				# Usage, eg if you have a file:
220				#
221				# my $ds = Microarray::CdtDataset->new(name=>dataset/name, # name of the dataset
222				# datapath=>$dir, # prefix path where dataset files will be written
223				# contrast=>2, # image contrast
224				# initialize=>/path/to/file.cdt);
225
226
227				my $class = shift;
228				my $self = { };
229
230				bless ($self, $class);
231
232				eval {
233				$self->__init(@_);
234				};
235
236				if ($@) {
237				die "The following error occurred: $@\n";
238				};
239
240				return $self;
241				}
242
243				############################################################################
244				sub __init{
245				############################################################################
246				# This method takes care of all of the initialization of the
247				# attributes of the cdtDataset
248
249				my $self = shift;
250
251				$self->__checkAndSetConstructorArguments(@_);
252
253				if ($self->__shouldInitialize){
254
255				$self->__initializeDataset;
256
257				$self->__setShouldInitialize(0); # so we know it's done
258
259				}else{
260
261				$self->__checkRequiredFilesExist;
262
263				# we need to load some meta information instead
264				$self->_load_meta;
265
266				}
267
268				# now load all the required feature and experiment info into
269				# memory
270
271				$self->__loadExptInfo;
272				$self->__loadFeatureInfo;
273
274				}
275
276				############################################################################
277				sub __checkAndSetConstructorArguments{
278				############################################################################
279				# This private method checks that the constructor arguments pass all
280				# sanity checks, and that files that should exist do exist.
281
282				my ($self, %args) = @_;
283
284				$self->__checkAndSetInitializationState(%args);
285				$self->__checkAndSetConfig(%args);
286				$self->__checkAndSetDataPath(%args);
287				$self->__checkAndSetImagePath(%args);
288				$self->__checkAndSetDatasetName(%args);
289				$self->__checkAndSetContrast(%args);
290				$self->__checkAndSetColorScheme(%args);
291				$self->__checkAndSetCorrCutoff(%args);
292
293				}
294
295
296				############################################################################
297				sub __checkAndSetInitializationState{
298				############################################################################
299				# This method checks and sets whether the object needs full
300				# initialization. There are meant to be 2 initilization requests.
301				# The first (initialization=>) would request that the dataset be
302				# created de novo from an initial file, and the second
303				# (initialization=>1) would just remake the images with a different
304				# constrast and different colors. The second initialization has not
305				# been adequately tested.
306
307				my ($self, %args) = @_;
308
309				if (exists($args{'initialize'})){
310
311				# the argument must be a cdt file path or a boolean
312				if (-e $args{'initialize'}){
313
314				if (!-r $args{'initialize'}){
315
316				die "The cdt file for initialization, $args{'initialize'}, is not readable.";
317
318				}elsif (!-T $args{'initialize'}){
319
320				die "The cdt file for initialization, $args{'initialize'}, is not a text file.";
321
322				}
323
324				$self->__setCdtFileName($args{'initialize'});
325				$self->__setShouldInitialize(1);
326
327				}elsif ($args{'initialize'} == 1){
328
329				# this should signify that the caller wants to re-make the
330				# images only, based on a pre-exisiting dataset. JCM
331				# note: This probably has not been thoroughly tested! (no
332				# explicit client available (though easy to modify
333				# bin/makeMicroarrayDataset), and change of constructor
334				# API has occurred)
335
336				$self->__setShouldInitialize(1);
337
338				}else {
339
340				die "The 'initialize' argument must be equal to 1 or be a valid $kCdtSuffix filepath. \nA value of '$args{'initialize'}' was supplied\n";
341				}
342
343				}else{
344
345				$self->__setShouldInitialize($kDefaultInitialization); # set default
346
347				}
348				}
349
350				############################################################################
351				sub __checkAndSetConfig{
352				############################################################################
353				# This private method checks and stores the config object that should
354				# be passed in
355
356				my ($self, %args) = @_;
357
358				if (exists($args{'config'})){
359
360				if ($args{'config'}->isa("Microarray::Config")){
361
362				$self->{$kConfig} = $args{'config'};
363
364				}else{
365
366				die "The 'config' argument you provided is not a Microarray::Config object.";
367
368				}
369
370				}else{
371
372				die "A 'config' argument was not supplied to the ".ref($self)." constructor.";
373
374				}
375
376				}
377
378				############################################################################
379				sub __checkAndSetDataPath{
380				############################################################################
381				# This private method checks that an Path is supplied, that
382				# corresponds to an existent directory, then stores it in the object.
383
384				my ($self, %args) = @_;
385
386				if (exists($args{'datapath'})){
387
388				# check that it's good
389
390				unless (&DirectoryIsValid($args{'datapath'})) {
391
392				die "The supplied 'datapath', $args{'datapath'}, either does not exist or is not a directory.";
393
394				}
395
396				# fix up the Path so it has a trailing forward slash
397				$args{'datapath'} = &EnsureTrailingSlash($args{'datapath'});
398
399				}else{
400
401				die "A 'datapath' argument was not supplied to the ".ref($self)." constructor.";
402
403				}
404
405				$self->__setDataPath($args{'datapath'});
406
407				}
408
409
410				############################################################################
411				sub __checkAndSetImagePath{
412				############################################################################
413				# This private method checks that an Path is supplied, that
414				# corresponds to an existent directory, then stores it in the object.
415
416				my ($self, %args) = @_;
417
418				if (exists($args{'imagepath'})){
419
420				# check it's good
421
422				unless (&DirectoryIsValid($args{'imagepath'})) {
423
424				die "The supplied 'imagepath', $args{'imagepath'}, either does not exist or is not a directory.";
425
426				}
427
428				# fix up the Path so it has a trailing forward slash
429				$args{'imagepath'} = &EnsureTrailingSlash($args{'imagepath'});
430
431				}else{
432
433				# use datapath as the default (generic module)
434				$args{'imagepath'} = $self->datapath;
435
436				}
437
438				$self->__setImagePath($args{'imagepath'});
439
440				}
441
442
443				############################################################################
444				sub __checkAndSetDatasetName{
445				############################################################################
446				# This method checks that a dataset was given to the constructor. In
447				# addition because CdtDataset creates and stores all its images and
448				# data in a directory hierarchy, the initially specified data and
449				# image paths are augmented with the dataset name directories (which
450				# are created upon initialization)
451
452				my ($self, %args) = @_;
453
454				# check we have a good name
455
456				if (!exists($args{'name'})){
457
458				die "The required 'name' argument was not supplied to the ".ref($self)." constructor.";
459
460				}
461
462				my $fullPathToData = &EnsureTrailingSlash($self->datapath.$args{'name'});
463				my $fullPathToImages = &EnsureTrailingSlash($self->imagepath.$args{'name'});
464
465				if ($self->__shouldInitialize) { # make them if initialization was requested
466
467				$self->__setDataPath($self->__ensureDirectoriesExist($self->datapath, $args{'name'}));
468				$self->__setImagePath($self->__ensureDirectoriesExist($self->imagepath, $args{'name'}));
469
470
471				}else{ # make certain the dataset is where they said it is
472
473				&DirectoryIsValid($fullPathToData) \|\| die "Could not validate dataset directories (datapath + dataset name) residence at $fullPathToData.\n";
474
475				&DirectoryIsValid($fullPathToImages) \|\| die "Could not validate dataset image directories (image path + dataset name) residence at $fullPathToImages.\n";
476
477
478				}
479
480				my $datasetBase = basename($args{'name'});
481
482				$self->__setDataPath($fullPathToData); # augmenting datapath with data set directory name
483				$self->__setImagePath($fullPathToImages); # augmenting imagepath with data set directory name
484				$self->__setDatasetName($args{'name'});
485				$self->__setFileBaseName($datasetBase);
486
487				}
488
489
490
491				############################################################################
492				sub __checkAndSetContrast{
493				############################################################################
494				# This method determines if the contrast is valid, and then stores the
495				# value in the object
496
497				my ($self, %args) = @_;
498
499				if (exists($args{'contrast'})){
500
501				if (!exists($args{'initialize'})){
502
503				die "A 'contrast' argument was provided to the ".ref($self)." constructor, but an initialize argument was not.";
504
505				}elsif ($args{'contrast'} <= 0){
506
507				die "The supplied value for the 'contrast' argument must be greater than zero. A value of $args{'contrast'} was supplied.";
508
509				}
510
511				$self->__setContrast($args{'contrast'});
512
513				}else{
514
515				$self->__setContrast($kDefaultContrast); # set the default
516
517				}
518
519				}
520
521				############################################################################
522				sub __checkAndSetColorScheme{
523				############################################################################
524				# This method determines if the colorscheme is valid, and then stores
525				# the value in the object
526
527				my ($self, %args) = @_;
528
529				if (exists($args{'colorscheme'})){
530
531				if (!exists($args{'initialize'})){
532
533				die "A 'colorscheme' argument was provided to the ".ref($self)." constructor, but an initialize argument was not.";
534
535				}elsif (!exists($kColorSchemeTranslation{$args{'colorscheme'}})){
536
537				die "The supplied value for the 'colorscheme' argument must be one of \n\n".
538				join("\n", keys %kColorSchemeTranslation)."\n\nA value of $args{'colorscheme'} was supplied.";
539
540				}
541
542				$self->__setColorScheme(($kColorSchemeTranslation{$args{'colorscheme'}}));
543
544				}else{
545
546				$self->__setColorScheme($kDefaultColorScheme); # set the default
547
548				}
549
550				}
551
552				############################################################################
553				sub __checkAndSetCorrCutoff{
554				############################################################################
555				# This method determines if the correlation cutoff value is valid, and then stores
556				# the value in the object
557
558				my ($self, %args) = @_;
559
560				if (exists($args{'corrcutoff'})){
561
562				if (!exists($args{'initialize'})){
563
564				die "A 'corrcutoff' argument was provided to the ".ref($self)." constructor, but an initialize argument was not.";
565
566				}elsif ($args{'corrcutoff'} !~ /^[\d\.]+$/g
567				\|\| $args{'corrcutoff'} > 1
568				\|\| $args{'corrcutoff'} < $kMinCorrCutoff){
569
570				die "The supplied value for the 'corrcutoff' argument must be a number \n\n".
571				"$kMinCorrCutoff =< corrcutoff =< 1"."\n\nA value of $args{'corrcutoff'} was supplied.";
572				}
573
574				$self->__setCorrCutoff($args{'corrcutoff'});
575
576				}else{
577
578				$self->__setCorrCutoff($kDefaultCorrCutoff); # set the default
579
580				}
581
582				}
583
584
585				#####################################################################
586				sub __checkRequiredFilesExist{
587				#####################################################################
588				# This method checks that all the required files for the dataset exist
589				# If they do not, it will cause a fatal error
590
591				my $self = shift;
592
593				my $prefix = $self->datapath;
594				$prefix = &EnsureTrailingSlash($prefix);
595
596				$prefix .= $self->fileBaseName;
597
598				foreach my $suffix (@kRequiredFileSuffixes){
599
600				die $prefix.$suffix." does not exist." if (!-e $prefix.$suffix);
601
602				}
603				}
604
605
606				############################################################################
607				sub __setCdtInfo {
608				############################################################################
609				# this subroutine takes the initalize arguement and store the path and
610				# the stem of the .cdt filename
611
612				my ($self, $filepath);
613
614				my ($base, $inpath, $suffix) = fileparse($filepath, $kCdtSuffix);
615				$self->{$kCdtBase} = $base;
616				$self->{$kCdtPath} = $inpath;
617
618
619				}
620
621
622
623				#####################################################################
624				sub __setFileBaseName {
625				#####################################################################
626				# This method allows the filename stem (no suffix) of the datafiles
627				# use to initialize the dataset to be set
628
629				my ($self, $nameBase) = @_;
630				$self->{$kFileBaseName} = $nameBase;
631				}
632
633
634				#####################################################################
635				sub __setDataPath {
636				#####################################################################
637				# This method allows the path to where the data files for the dataset
638				# exist to be set
639
640				my ($self, $outpath) = @_;
641				$self->{$kDataPath} = $outpath;
642				}
643
644
645				#####################################################################
646				sub __setImagePath {
647				#####################################################################
648				# This method allows the path to where the image files for the dataset
649				# exist to be set
650
651				my ($self, $outpath) = @_;
652				$self->{$kImagePath} = $outpath;
653				}
654
655
656				#####################################################################
657				sub __setDatasetName {
658				#####################################################################
659				# This method allows the name of the dataset to be set.
660
661				my ($self, $datasetName) = @_;
662				$dbg && print "\t dataset name $datasetName \n";
663				$self->{$kName} = $datasetName;
664				}
665
666
667				#####################################################################
668				sub __setCdtFileName {
669				#####################################################################
670				# This method sets the name of the cdtFile
671
672				my ($self, $cdtFile) = @_;
673				$self->{$kCdtFileName} = $cdtFile;
674				}
675
676
677				#####################################################################
678				sub __setContrast {
679				#####################################################################
680				# This method allows the contrast to be set.
681
682				my ($self, $contrast) = @_;
683				$self->{$kContrast} = $contrast;
684				}
685
686
687				#####################################################################
688				sub __setColorScheme {
689				#####################################################################
690				# This method allows the colorscheme to be set.
691
692				my ($self, $colorScheme) = @_;
693				$self->{$kColorScheme} = $colorScheme;
694				}
695
696
697				#####################################################################
698				sub __setCorrCutoff {
699				#####################################################################
700				# This method allows the correaltion cutoff to be set.
701
702				my ($self, $corrCutoff) = @_;
703				$self->{$kCorrCutoff} = $corrCutoff;
704				}
705
706
707				#####################################################################
708				sub __setShouldInitialize {
709				#####################################################################
710				# This method allows a flag to be set as to whether full
711				# initialization need to take place
712
713				my ($self, $shouldInitialize) = @_;
714				$self->{$kShouldInitialize} = $shouldInitialize;
715				}
716
717
718				#####################################################################
719				sub __setHeight {
720				#####################################################################
721				# This private method allows the 'height' of the dataset to be set.
722				# This in fact corresponds to the number of rows in the cdt file.
723
724				my ($self, $height) = @_;
725				$self->{$kHeight} = $height;
726				}
727
728
729				#####################################################################
730				sub __setWidth {
731				#####################################################################
732				# This private method allows the 'height' of the dataset to be set.
733				# This in fact corresponds to the number of rows in the cdt file.
734
735				my ($self, $width) = @_;
736				$self->{$kWidth} = $width;
737				}
738
739
740				#####################################################################
741				sub name {
742				#####################################################################
743				# This method returns the fully qualified name of the dataset
744
745				return $_[0]->{$kName};
746				}
747
748
749				#####################################################################
750				sub _cdtFileName {
751				#####################################################################
752				# This method returns the name of the cdtFile
753
754				return $_[0]->{$kCdtFileName};
755				}
756
757
758				#####################################################################
759				sub _cdtBase {
760				#####################################################################
761				# This method returns the base name string of the files comprising of
762				# the dataset, sans suffices
763
764				return $_[0]->{$kCdtBase};
765				}
766
767
768				#####################################################################
769				sub _cdtPath {
770				#####################################################################
771				# This method returns the path to the cdt file of thebeing converted
772				# into a dataset
773
774				return $_[0]->{$kCdtPath};
775				}
776
777
778				######################################################################
779				sub __ensureDirectoriesExist {
780				######################################################################
781				# This subroutine checks to see that the full outpath is created if
782				# necessary, by extended a previouslt validated filepath. It is
783				# tended for use only when initializating a dataset, where the dataset
784				# directories might need to be created and appended to the data and
785				# image out paths
786
787				my ($self, $path, $possibleExtension) = @_;
788
789				my @dirnames = split(/\//, $possibleExtension);
790
791				$dbg && print "Ensuring that all directories exist for the dataset initialization\n";
792				$dbg && print "\t$path + @dirnames\n";
793
794
795				$path = &EnsureTrailingSlash($path);
796
797				while (my $dir = shift(@dirnames)) {
798
799				# extend the tree if requested
800				$path .= "$dir/";
801
802				# does it exist, or should we make it (new directory)?
803				unless (&DirectoryIsValid($path)) {
804
805				$dbg && print "\tcreating $path directory\n";
806
807				# if not, create it first...
808				mkdir($path, 0775) \|\| die "Couldn't create directory: $path";
809
810				}
811				$path = &EnsureTrailingSlash($path);
812				}
813
814				return $path; # this should be the validated, extended path
815
816				}
817
818
819				#####################################################################
820				sub datapath {
821				#####################################################################
822				# This method returns the path to which data files either written
823				# or read from
824
825				return $_[0]->{$kDataPath};
826				}
827
828
829				#####################################################################
830				sub imagepath {
831				#####################################################################
832				# This method returns the path to which image files are either written
833				# or read from
834
835				return $_[0]->{$kImagePath};
836				}
837
838
839				#####################################################################
840				sub contrast {
841				#####################################################################
842				# This method returns the contrast
843
844				return $_[0]->{$kContrast};
845				}
846
847
848				#####################################################################
849				sub colorScheme {
850				#####################################################################
851				# This method returns the colorScheme
852
853				return $_[0]->{$kColorScheme};
854				}
855
856				#####################################################################
857				sub corrCutoff {
858				#####################################################################
859				# This method returns the correlation cutoff
860
861				return $_[0]->{$kCorrCutoff};
862				}
863
864
865				#####################################################################
866				sub fileBaseName {
867				#####################################################################
868				# This method returns the base name string of the files comprising of
869				# the dataset, sans suffices
870
871				return $_[0]->{$kFileBaseName};
872				}
873
874
875				#####################################################################
876				sub height {
877				#####################################################################
878				# This method returns the number of data rows in the cdtFile
879
880				return $_[0]->{$kHeight};
881				}
882
883
884				#####################################################################
885				sub width {
886				#####################################################################
887				# This method returns the number of data columns in the cdtFile
888
889				return $_[0]->{$kWidth};
890				}
891
892				#####################################################################
893				sub __config{
894				#####################################################################
895				# This private method returns the config object that was used during
896				# construction
897
898				return $_[0]->{$kConfig};
899
900				}
901
902				#####################################################################
903				sub __cdtFileObject {
904				#####################################################################
905				# This private method returns a cdtFile Object. If one does not exist
906				# within the object, one will be created. If one does exist, that
907				# will simply be returned. This will likely fail for sets that are
908				# already converted, because the .cdt file is not copied into the
909				# dataset location. This is a design issue that needs to be
910				# discussed, in addition to the fact that it is private method, when
911				# it seems like other software might actually want to retrieve the
912				# Datamatix object
913
914				my $self = shift;
915
916				if (!exists($self->{$kCdtFileObject})){ # we need to create one
917
918				$self->{$kCdtFileObject} = Microarray::DataMatrix::CdtFile->new(file => $self->_cdtFileName,
919				tmpDir => $self->__config->tmpPath);
920				}
921
922				return $self->{$kCdtFileObject};
923				}
924
925
926				#####################################################################
927				sub __shouldInitialize {
928				#####################################################################
929				# This private method returns whether the object needs initialization
930
931				return $_[0]->{$kShouldInitialize};
932				}
933
934
935				#####################################################################
936				sub __initializeDataset {
937				#####################################################################
938				# This method creates a new dataset from a CDT (clustered data) file.
939				# The CDT file format was defined by Michael Eisen for his Windows
940				# applications TreeView and Cluster. It has certain drawbacks, for
941				# example not more then two columns per gene can be used to store
942				# additional information. This can be partly resolved by putting more
943				# data into one record field. A kludgy fix.
944
945				my $self = shift;
946
947				$self->__lock; # lock dataset
948
949				# first extract salient info from cdtfile, and
950				# create the correlations files
951
952				$self->__dissectCDT;
953				$self->__prepareCorrelations;
954				$self->__compressCorrelations;
955
956
957				# should have enough data dissected so far to load the experiment
958				# info, and we'll need these for the image header, so load them
959
960				$self->__loadExptInfo;
961
962				# now create the required images
963				my $imageMaker = Microarray::DatasetImageMaker->new();
964				$imageMaker->makeImage('dataset' => $self,
965				'type' => 'matrix');
966				$imageMaker->makeImage('dataset' => $self,
967				'type' => 'header');
968
969				# now write out our 'meta' information,
970				# for quick access later on
971
972				$self->__prepareMetaFile;
973
974				$self->__unlock; # unlock dataset
975
976				}
977
978
979				#####################################################################
980				sub __lock {
981				#####################################################################
982				# This method locks the dataset
983
984				my $self = shift;
985				my $lockFile = $self->datapath.$self->fileBaseName.$kLockSuffix;
986				open (OUT, ">".$lockFile) \|\| die "Cannot lock dataset using lockfile $lockFile : $!";
987				close (OUT);
988
989				}
990
991
992				#####################################################################
993				sub __unlock {
994				#####################################################################
995				# This method unlocks the dataset
996
997				my $self = shift;
998				my $lockFile = $self->datapath.$self->fileBaseName.$kLockSuffix;
999				unlink ($lockFile);
1000				}
1001
1002
1003				#####################################################################
1004				sub __dissectCDT {
1005				#####################################################################
1006				# This method determines the contents of the cdtfile, and stores some
1007				# of the cdtMeta data for quick retrieval. Note that the previous
1008				# version did its own parsing of the cdtFile. This is now delegated
1009				# to the cdtFile object.
1010
1011				my $self = shift;
1012
1013				my $cdtFileObject = $self->__cdtFileObject;
1014
1015				$self->__setWidth($cdtFileObject->numColumns);
1016				$self->__setHeight($cdtFileObject->numRows);
1017
1018				$self->__saveCdtExptNames($cdtFileObject->columnNamesArrayRef);
1019
1020				$cdtFileObject->createIndexFile($self->datapath.$self->fileBaseName.$kFeatureSuffix);
1021				$cdtFileObject->createRawMatrixFile($self->datapath.$self->fileBaseName.$kMatrixSuffix);
1022
1023				}
1024
1025
1026				######################################################################
1027				sub __saveCdtExptNames {
1028				######################################################################
1029				# This method (we may eliminate it later) save the names of the data
1030				# columns from the cdtFile (these are usually the experiment names) to
1031				# a file. This is later used by GeneXplorer, but also provides a
1032				# quick way of looking up the data, without having to read the cdtFile
1033				# in.
1034
1035				my ($self, $exptNamesARef) = @_;
1036
1037				my $file = $self->datapath.$self->fileBaseName.$kExptInfoSuffix;
1038
1039				# write index number and name to file
1040
1041				open (OUT, ">".$file) \|\| die "Cannot write out experiment info to $file : $!";
1042
1043				print OUT "ID\tNAME\n";
1044
1045				for (my $i=0; $i < @{$exptNamesARef}; $i++){
1046
1047				print OUT $i, "\t", $exptNamesARef->[$i], "\n";
1048
1049				}
1050
1051				close (OUT);
1052
1053				}
1054
1055
1056				#####################################################################
1057				sub __prepareCorrelations {
1058				#####################################################################
1059				# This method prepares a correlations file
1060
1061				my $self = shift;
1062
1063				# first we have to create a pcl file, with an index in the first
1064				# column
1065
1066				my $pclFileName = $self->datapath.$self->fileBaseName.$kPclSuffix;
1067
1068				print "pcl name: $pclFileName\n";
1069
1070				print "tmp path: ".$self->__config->tmpPath."\n";
1071
1072				$self->__createIndexedPclFile($pclFileName);
1073
1074				my $pcl = Microarray::DataMatrix::PclFile->new(tmpDir => $self->__config->tmpPath,
1075				file => $pclFileName);
1076
1077				# then use the pcl file to create correlations
1078
1079				$pcl->createCorrelationsFile(cutoff=>$self->corrCutoff);
1080
1081				# Now we can get rid of the pcl file
1082				unlink $pclFileName \|\| warn "Couldn't unlink $pclFileName : $!";
1083
1084				}
1085
1086
1087				#####################################################################
1088				sub __createIndexedPclFile{
1089				#####################################################################
1090				# This method creates a pcl file from the cdt file that was used to
1091				# instantiate the object. This is coded here, rather than using the
1092				# cdtFile method to convert to a pcl, because the pcl file must have
1093				# an index for it's names, rather than the names themselves.
1094
1095				my ($self, $pclFile) = @_;
1096
1097				open (IN, $self->_cdtFileName) \|\| die "Cannot open cdt file ".$self->_cdtFileName." : $!";
1098
1099				print "trying to create $pclFile \n\n";
1100
1101				open (OUT, ">".$pclFile) \|\| die "Cannot create $pclFile : $!";
1102
1103				my $count = 1;
1104				my $index = 0;
1105				my $hasGtr = 0;
1106				my @line;
1107				my $numColumns;
1108
1109				while (){
1110
1111				chomp;
1112
1113				@line = split("\t", $_, -1);
1114
1115				if ($count == 1){
1116
1117				if ($line[0] eq "GID"){
1118
1119				shift (@line);
1120				$hasGtr = 1;
1121
1122				}
1123
1124				$numColumns = scalar(@line);
1125
1126				print OUT join("\t", @line), "\n";
1127				print OUT "EWEIGHT\t\t";
1128
1129				print OUT "\t1" x (scalar(@line)-3), "\n";
1130
1131				$count++;
1132
1133				next;
1134
1135				}elsif ($count == 2){
1136
1137				$count++;
1138
1139				next if ($line[0] eq "AID");
1140
1141				}
1142
1143				# if we get here, it's a data line
1144
1145				shift (@line) if $hasGtr;
1146
1147				next if !@line;
1148
1149				$line[0] = $index;
1150
1151				if (scalar(@line) != $numColumns){
1152
1153				die "In your cdtFile, data line $index has ".scalar(@line)." columns, instead of $numColumns.\n";
1154
1155				}
1156
1157				print OUT join("\t", @line), "\n";
1158
1159				$index++;
1160
1161				}
1162
1163				close OUT;
1164				}
1165
1166
1167				#####################################################################
1168				sub __compressCorrelations {
1169				#####################################################################
1170				# This method takes a correlations file as output by Gavin Sherlocks
1171				# correlations program. These represent the correlation values of a
1172				# certain gene (array element) intensity vector vs. all other vectors
1173				# in a data matrix.
1174				# The output generated is a binary representation of the list of
1175				# correlation values for each row in the data matrix (= expression
1176				# vectors).
1177				#
1178				# The file is built like this:
1179				# ############################
1180				# name content bytes
1181				# ################################################
1182				# header
1183				# ################################################
1184				# index_size length of index 2
1185				# index offset for rows index_size * 2
1186				# ################################################
1187				# body
1188				# ################################################
1189				# data 1..n correlation data 4 * look up in index
1190				# -> index correlated vector 2 \
1191				# -> corr correlation 2 / 2 words (16 int)
1192
1193				# The correlation data is stored in lists of pairs of the most
1194				# correlated vectors index number (row in the table) and the
1195				# correlation value. The correlation value has been multiplied by
1196				# (2^16)-1 (65535) to make it an integer. To retrieve the original
1197				# value, divide the integer by (2^16)-1. No negative correlation
1198				# values are allowed.
1199
1200				my $self = shift;
1201
1202				my $corrFile = $self->datapath.$self->fileBaseName.$kStdCorrSuffix;
1203
1204				my $header = "";
1205				my $numVecs = 0;
1206				my $body = "";
1207
1208				open(IN, $corrFile) \|\| die "cannot open correlations file $corrFile: $!\n";
1209
1210				while (){
1211
1212				chomp;
1213
1214				my @values = split(/\t/, $_); # split each row, containing the index/correlation pairs
1215				my $index = shift @values; # first value in row is the index of the vector we correlate to
1216				my $numCorrVectors = scalar(@values)/2; # determine how many correlated vectors exist for this one (pairs/2)
1217
1218				for (my $j=1; $j < @values; $j += 2) { # look at the correlations
1219
1220				# multiply correlation values and make them into an int
1221				# between 0 and 65535
1222
1223				$values[$j] = int($values[$j] * 65535);
1224
1225				}
1226
1227				# add the number of pairs to the file header
1228				$header .= pack("n", $numCorrVectors);
1229
1230				# pack values into string of unsigned ints
1231				# and add the packed row to the file body
1232
1233				$body .= pack("n*", @values);
1234
1235				$numVecs++;
1236
1237				}
1238
1239				close IN;
1240
1241				# store the number of vectors as first word of header
1242
1243				$header = pack("n", $numVecs).$header;
1244
1245				my $binFile = $self->datapath.$self->fileBaseName.$kBinCorrSuffix;
1246
1247				open (OUT, ">$binFile") \|\| die "cannot open binary correlations $binFile: $!\n";
1248
1249				print OUT $header, $body;
1250
1251				close (OUT);
1252
1253				# now get rid of the stdCor file, as we don;t need it now
1254
1255				unlink $corrFile \|\| warn "Can't remove $corrFile : $!\n";
1256
1257				}
1258
1259
1260				#############################################################################
1261				sub __prepareMetaFile {
1262				#############################################################################
1263				# This method writes out a file of meta information that pertain to
1264				# the dataset, in the form of name=value pair.
1265
1266				my $self = shift;
1267
1268				my $file = $self->datapath.$self->fileBaseName.$kMetaSuffix;
1269
1270				open(OUT, ">".$file) \|\| die "Cannot create meta file, $file : $!";
1271
1272				foreach my $key (@metaColumns) {
1273
1274				print OUT $key, "=", $self->{$key}, "\n";
1275
1276				}
1277
1278				close(OUT);
1279
1280				}
1281
1282
1283				#####################################################################
1284				sub _load_meta {
1285				#####################################################################
1286				# This method loads in previously cached meta data
1287
1288				my $self = shift;
1289
1290				my $filename = $self->datapath.$self->fileBaseName.$kMetaSuffix;
1291
1292				open(IN, $filename) \|\| die "cannot open _meta_ $filename $!";
1293
1294				while (){
1295
1296				chomp;
1297
1298				my ($key, $value) = split("=",$_);
1299
1300				$self->{$key} = $value;
1301
1302				}
1303
1304				close IN;
1305
1306				}
1307
1308				###
1309				#
1310				#
1311				# STUFF FROM HERE ON DOWN IS CRAP.... so says Gavin?
1312				#
1313				###
1314
1315				# JCM note: Tried removing it, but many of these subroutines are still
1316				# used by both this class and Explorer. My guess is that the rest are
1317				# likely used by other clients that Christian wrote
1318
1319				######################################################################
1320				sub __loadExptInfo {
1321				######################################################################
1322				## This method loads the expt_info data
1323
1324				my $self = shift;
1325				$self->{EXPT_INFO} = $self->__load_table("expt_info");
1326				}
1327
1328
1329				######################################################################
1330				sub __loadFeatureInfo {
1331				######################################################################
1332
1333				my $self = shift;
1334				$self->{FEATURE_INFO} = $self->__load_table("feature_info");
1335				}
1336
1337
1338				######################################################################
1339				sub __load_table {
1340				######################################################################
1341				# loads an ASCII table. It is expected that the first row contains the
1342				# column headers It is also expected that the first column contains
1343				# numeric id's starting at '0'. returns a reference to the table
1344				# structure
1345
1346				my ($self, $tableName) = @_;
1347
1348				my $file = $self->datapath.$self->fileBaseName.".".$tableName;
1349
1350				my ($table, $index, $i, @record);
1351
1352				open(IN, $file) \|\| die "cannot open _table $file $!\n";
1353
1354				my $firstrow = ();
1355				chomp($firstrow);
1356
1357				my @head = split("\t", $firstrow, -1);
1358
1359				# since the first columns header is always 'ID' we discard it
1360				shift @head;
1361
1362				while () {
1363
1364				chomp;
1365				@record = split("\t", $_, -1);
1366				$index = shift @record;
1367
1368				$i = 0;
1369				my %record = map { $head[$i++] => $_ } @record;
1370				$$table[$index] = \%record;
1371
1372				}
1373
1374				return $table;
1375
1376				}
1377
1378
1379				######################################################################
1380				sub image {
1381				######################################################################
1382				# Returns the data matrix as a GD::Image, drawn with 1x1 pixel per
1383				# value at the contrast last used/initialized with $ds->new()
1384				#
1385				# Usage: $ds->image();
1386
1387				my $self = shift;
1388
1389				my $type = shift;
1390
1391				my $image;
1392
1393				if ($type eq "matrix") {
1394				$image = &_load_image($self->imagepath.$self->fileBaseName.".data_matrix.$kImgType");
1395				return $image;
1396				}
1397
1398				if ($type eq "expt_info") {
1399				$image = &_load_image($self->imagepath.$self->fileBaseName.$kInfoGifSuffix);
1400				return $image;
1401				}
1402
1403				die "This type of image is not known\n";
1404
1405				}
1406
1407
1408				######################################################################
1409				sub _load_image {
1410				######################################################################
1411				# this protected method just opens up the previously stored matrix
1412				# image (from dataset initialization) , created a GD::Image object
1413				# with it, and returns it. Possible bug: it relies on GD::Image
1414				# version (>1.19) to pick $kImgType, when perhaps it should rely on
1415				# the filename suffix (.gif, .png) instead. This may prevent the
1416				# portability of intact datasets from one filesystem to another, but
1417				# in the end, you're always going to be limited by the version of GD...
1418
1419				my $filename = shift;
1420
1421				open(IN, $filename) \|\| die "cannot open _image $filename! $!\n";
1422
1423				my $funcname = "newFrom".ucfirst($kImgType);
1424				my $image = GD::Image->$funcname(*IN);
1425				return $image;
1426
1427				}
1428
1429
1430				######################################################################
1431				#sub vector {
1432				######################################################################
1433				#
1434				# my $self = shift;
1435				# my $index = @_;
1436				#
1437				# # load the data matrix only on demand
1438				# if ( !defined( $self->{MATRIX} ) ) {
1439				#
1440				# $self->_load_matrix();
1441				#
1442				# }
1443				#
1444				# return wantarray ? @{$self->{MATRIX}[$index]} : $self->{MATRIX}[$index];
1445				#
1446				#}
1447				#
1448				#
1449				######################################################################
1450				#sub _load_matrix {
1451				######################################################################
1452				#
1453				# my $self = shift;
1454				#
1455				# my ($matrix, $index);
1456				#
1457				# my $filename = $self->datapath.$self->fileBaseName.".data_matrix";
1458				#
1459				# $dbg && print "\tload matrix filename: $filename\n";
1460				#
1461				# $matrix = [ ];
1462				#
1463				# open(IN, $filename) \|\| die "cannot open _matrix_ $filename $!\n";
1464				#
1465				# while () {
1466				#
1467				# chomp;
1468				# my @row = split("\t", $_, -1);
1469				# $index = shift @row;
1470				#
1471				# $$matrix[$index] = \@row;
1472				#
1473				# }
1474				#
1475				# $self->{MATRIX} = $matrix;
1476				#
1477				#}
1478				#
1479				######################################################################
1480				#sub getMatrixValue {
1481				######################################################################
1482				#
1483				# my $self = shift;
1484				#
1485				# my $x = shift;
1486				# my $y = shift;
1487				#
1488				# if ($x > $self->width \|\| $y > $self->height \|\| $x < 0 \|\| $y < 0) {
1489				# die "index values (x = $x, y = $y) exceed range in getMatrixValue!\n";
1490				# }
1491				#
1492				# return $self->{MATRIX}[$y][$x];
1493				#
1494				#}
1495				#
1496				#
1497				######################################################################
1498				#sub _save_table {
1499				######################################################################
1500				#
1501				# ### just started, has to be filled in 2000-10-
1502				#
1503				# # saves an ASCII table. It is expected that the first row contains the column headers
1504				# # It is also expected that the first column contains numeric id's starting at '0'.
1505				# # returns a reference to the table structure
1506				# my $self = shift;
1507				# my $suffix = shift;
1508				#
1509				# my $file = $self->datapath;
1510				# $file .= $self->fileBaseName;
1511				#
1512				# # save the feature table
1513				# if ($suffix =~ /feature/i) {
1514				#
1515				# $file .= ".feature_info";
1516				#
1517				# my @keys = $self->getFeatureKeys();
1518				#
1519				# open( OUT, ">$file" ) \|\| die "cannot open $file in _save_table: $!\n";
1520				#
1521				# my $header = "ID\t".join("\t",@keys);
1522				#
1523				# $header .= "\n";
1524				#
1525				# print OUT $header;
1526				#
1527				# for(my $i=0;$i<$self->height;$i++) {
1528				#
1529				# print OUT $i;
1530				# foreach (@keys) {
1531				# print OUT "\t".$self->getFeature($i,$_);
1532				# }
1533				# print OUT "\n";
1534				#
1535				# }
1536				#
1537				# close(OUT);
1538				#
1539				# }
1540				#
1541				# # save experiment table
1542				# if ($suffix =~ /expt/) {
1543				#
1544				# my @keys = $self->getExperimentKeys();
1545				#
1546				# $file .= ".expt_info";
1547				#
1548				# open(OUT, ">$file") \|\| die "cannot open $file in _save_table: $!\n";
1549				#
1550				# for(my $i=0;$i<$self->width;$i++) {
1551				#
1552				# print OUT $i;
1553				# foreach (@keys) {
1554				# print OUT "\t".$self->getFeature($i,$_);
1555				# }
1556				# print OUT "\n";
1557				#
1558				# }
1559				#
1560				# close(OUT);
1561				#
1562				# }
1563				#
1564				#
1565				#
1566				#}
1567				#
1568				######################################################################
1569				#sub serialize {
1570				######################################################################
1571				# writes the dataset annotation to the repository, used in conjunction
1572				# with setFeature()
1573				#
1574				# my $self = shift;
1575				#
1576				# $self->_save_table('feature');
1577				#
1578				#}
1579				#
1580				######################################################################
1581				#sub getExperimentKeys { # wrapper to keep interface intact
1582				######################################################################
1583				# my $self = shift;
1584				#
1585				# my @keys = ( keys %{$self->{EXPT_INFO}[0]} );
1586				#
1587				# return wantarray ? (@keys) : \@keys;
1588				#
1589				#}
1590				#
1591				#######################################################################
1592				#sub experiment_keys {
1593				#######################################################################
1594				#
1595				# my $self = shift;
1596				# my @keys = ( keys %{$self->{EXPT_INFO}[0]} );
1597				#
1598				# return wantarray ? (@keys) : \@keys;
1599				#
1600				#}
1601				#
1602				######################################################################
1603				sub experiment {
1604				######################################################################
1605
1606				my $self = shift;
1607				my( $index, $field ) = @_;
1608
1609				if ( $index > $self->width() ) {
1610				die "error: index larger then data\n";
1611				}
1612
1613				if ( exists( $self->{EXPT_INFO}[$index]{$field} ) ) {
1614				return $self->{EXPT_INFO}[$index]{$field};
1615				} else {
1616				return "$index $field";
1617				}
1618
1619				}
1620
1621				######################################################################
1622				#sub featureAttributeExists {
1623				######################################################################
1624				# Returns true if the attribute passed as an argument
1625				# (e.g. 'CHROMOSOME') exists
1626				#
1627				# Usage: $ds->featureAttributeExists()
1628				#
1629				#
1630				# my $self = shift;
1631				# my $attr = shift;
1632				#
1633				# return (exists($self->{FEATURE_INFO}->[0]->{$attr}));
1634				#
1635				#}
1636
1637				#####################################################################
1638				sub getFeatureKeys {
1639				#####################################################################
1640				# returns the keys (attributes) for the features (gene expression row
1641				# vectors)
1642				#
1643				#Usage: $ds->getFeatureKeys()
1644
1645				my $self = shift;
1646
1647				my @keys = ( keys %{$self->{FEATURE_INFO}[0]} );
1648
1649				return wantarray ? (@keys) : \@keys;
1650
1651				}
1652
1653
1654				#####################################################################
1655				sub feature {
1656				######################################################################
1657				# required by the search function of Explorer
1658
1659				my $self = shift;
1660				my( $index, $field ) = @_;
1661
1662				if ( $index > $self->height ) {
1663				return "error: index larger then data\n";
1664				}
1665
1666				if ( exists( $self->{FEATURE_INFO}->[$index]->{$field} ) ) {
1667				return $self->{FEATURE_INFO}->[$index]->{$field}
1668				} else {
1669				return "no field by this name: $field.";
1670				}
1671				}
1672
1673
1674				#####################################################################
1675				sub getFeature {
1676				#####################################################################
1677
1678				my $self = shift;
1679				my( $index, $field ) = @_;
1680
1681				if ( $index > $self->height ) {
1682				return "error: index larger then data\n";
1683				}
1684
1685				if ( exists( $self->{FEATURE_INFO}->[$index]->{$field} ) ) {
1686				return $self->{FEATURE_INFO}->[$index]->{$field}
1687				} else {
1688				return "no field by this name: $field.";
1689				}
1690				}
1691
1692
1693				######################################################################
1694				#sub setFeature {
1695				######################################################################
1696				# Sets the column in row to [also see
1697				# serialize() ] JCM: This is probably for updating the dataset
1698				#
1699				#
1700				# my $self = shift;
1701				# my( $index, $field, $newval ) = @_;
1702				#
1703				# if ( $index > $self->height ) {
1704				#
1705				# return "error: index larger then data\n";
1706				#
1707				# }
1708				#
1709				# if ( exists( $self->{FEATURE_INFO}->[$index]->{$field} ) ) {
1710				#
1711				# $self->{FEATURE_INFO}->[$index]->{$field} = $newval;
1712				#
1713				# } else {
1714				#
1715				# # this adds a column to the table. We iterate over all
1716				# # rows and add the new row/record field
1717				# for (my $i=0;$i<$self->height;$i++) {
1718				# $self->{FEATURE_INFO}->[$i]->{$field} = '';
1719				# }
1720				#
1721				# # after we added the column, we assign the passed value
1722				# $self->{FEATURE_INFO}->[$index]->{$field} = $newval;
1723				#
1724				# }
1725				#
1726				#}
1727				#
1728
1729
1730				#####################################################################
1731				sub search {
1732				#####################################################################
1733				# Returns an array of data matrix row numbers where matched in
1734				# column . When using 'ALL' as , all
1735				# columns will be searched
1736
1737				my $self = shift;
1738
1739				my $query = shift;
1740				my $field = shift;
1741
1742				my( @keys, @hits );
1743
1744				if ($dbg) { print "Now searching $query in $field...\n", " " };
1745
1746				# search a specific field...
1747				if ( $field eq "ALL" ) {
1748				@keys = $self->getFeatureKeys();
1749				# ... or all fields
1750				} else {
1751				push @keys, $field;
1752				}
1753
1754				for(my $i=0;$i<$self->height;$i++) {
1755
1756				if ( $self->_search_feature( $i, $query, \@keys ) ) {
1757				push @hits, $i;
1758				}
1759
1760				}
1761
1762				return (@hits);
1763
1764				}
1765
1766
1767				#####################################################################
1768				sub _search_feature {
1769				#####################################################################
1770				# usage: $hit = $self->_search_feature( 100, "kinase", ['ACC','NAME','SYMBOL'])
1771				############################################################################-
1772				# this function returns true, if the feature queried contains the passed
1773				# string values(s). The parameters to this function are:
1774				# - required: the index number of the feature
1775				# - required: a search term
1776				# - optional: an array reference, containing the names of fields to search,
1777				# if not passed, all fields will be searched.
1778
1779				my $self = shift;
1780
1781				my $index = shift;
1782				my $query = shift;
1783
1784				my $field_aref = shift;
1785
1786				my $field;
1787
1788				# search field(s) for the query
1789				foreach $field ( @{$field_aref} ) {
1790
1791				if ( $self->feature( $index, $field ) =~ /$query/i ) {
1792
1793				if ($dbg) { print "found $query in field $field in feature $index", " " }
1794
1795				return 1;
1796				}
1797				}
1798
1799				# if we got here, nothing was found, return 'FALSE'
1800				return 0;
1801
1802				}
1803
1804
1805				#####################################################################
1806				sub correlations {
1807				#####################################################################
1808				# Returns the precalculated correlation values for row . Up to
1809				# 50 correlations values > 0.5 are stored. As an example client
1810				# usage, see Explorer's/gx retrieval of those profiles correlated to
1811				# the query (user-clicked profile within zoom view).
1812
1813
1814
1815
1816				my $self = shift;
1817
1818				my( $seed, $neighbours, $corrs ) = @_;
1819
1820				my $corr_file = $self->datapath.$self->fileBaseName.$kBinCorrSuffix;
1821
1822				&_get_correlations(
1823				$corr_file,
1824				$seed,
1825				$neighbours,
1826				$corrs
1827				);
1828
1829
1830				}
1831
1832
1833				#####################################################################
1834				sub _get_correlations {
1835				#####################################################################
1836				# required for Explorer to retrieve those profiles highly correlated
1837				# to the query (user-clicked profile within zoom view)
1838
1839
1840				my (
1841				$file,
1842				$startVector,
1843				$cVec_aref,
1844				$cCor_aref
1845				) = @_;
1846
1847				my ($tmp, $numRows, $indexSize, $index, $vecOffset, $vector);
1848				my (@indices, @vecOffsets, @vecSizes);
1849
1850				open (IN, $file) \|\| die "cannot open _correlations_ $file: $!\n";
1851
1852				# read the first word, indicating the number of rows
1853				# contained in the body
1854				read(IN, $tmp, 2);
1855
1856				$numRows = unpack("n", $tmp);
1857
1858				# if the function is only being called with the filename, return the number of correlations
1859				if ($startVector eq "") {
1860				return $numRows;
1861				}
1862
1863				if ( ($startVector < 0) \|\| ($startVector > $numRows-1) ) {
1864				die "Error: loadCompressedCorrelations: start vector ($startVector) wrong size\n";
1865				}
1866
1867				$indexSize = $numRows * 2; # the index that follows in the head is numRows words (= 2 bytes)
1868
1869				# after this value, an index containing the same number of words,
1870				# each indicating the number of correlation values stored per row, follows
1871				# reading from the current filepointer is equivalent to setting it with seek(IN, 2, 0);
1872				read(IN, $index, $indexSize );
1873
1874				@indices = unpack("n*", $index);
1875
1876				# initialize the first offset outside loop
1877				$vecOffsets[0] = 0;
1878				$vecSizes[0] = $indices[0] * 4;
1879				$vecOffset = $vecSizes[0];
1880
1881				# build the offset size for all entries in body, since the index only contains the number
1882				# of correlations for the individual gene. We need cumulative offset values
1883				for (my $j=1;$j<=$#indices;$j++) {
1884
1885				my $vecSize = $indices[$j] * 4;
1886				$vecSizes[$j] = $vecSize;
1887				$vecOffset += $vecSize; # number of reference/correlation pairs, each pair is two words ( = 4 bytes total)
1888				$vecOffsets[$j+1] = $vecOffset;
1889
1890				}
1891
1892				seek(IN, $vecOffsets[$startVector], 1); # go to position for this vectors correlation data in file
1893				read(IN, $vector, $vecSizes[$startVector]);
1894
1895				my @values = unpack("n*", $vector);
1896
1897				my $iter = 0;
1898				for (my $k=0; $k<=$#values; $k++) {
1899				if (!($k % 2)) {
1900				$cVec_aref->[$iter] = $values[$k];
1901				} else {
1902				$cCor_aref->[$iter] = sprintf("%.4f", $values[$k] / 65535);
1903				$iter++;
1904				}
1905				}
1906
1907				close(IN);
1908
1909				}
1910
1911
1912				######################################################################
1913				#sub size {
1914				######################################################################
1915				## This method returns the size of the dataset. It is based on the
1916				## assumption (which is currently wrong) that all file in the dataset
1917				## start with the dataset name.
1918				#
1919				# my $self = shift;
1920				#
1921				# if ( exists($self->{'SIZE'}) ) {
1922				#
1923				# return $self->{'SIZE'};
1924				#
1925				# } else {
1926				#
1927				# my $stem = $self->datapath().$self->fileBaseName.".*";
1928				#
1929				# my @files = glob($stem);
1930				#
1931				# my $size;
1932				#
1933				# for (@files) {
1934				# $size += (-s $_);
1935				# }
1936				#
1937				# $self->{'SIZE'} = $size;
1938				#
1939				# return $self->{'SIZE'};
1940				#
1941				# }
1942				#
1943				#}
1944
1945				1; # to make perl happy
1946
1947
1948				__END__