File Coverage

blib/lib/Bio/MaxQuant/ProteinGroups/Response.pm

Criterion	Covered	Total	%
statement	27	596	4.5
branch	0	150	0.0
condition	0	33	0.0
subroutine	9	37	24.3
pod	28	28	100.0
total	64	844	7.5

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MaxQuant::ProteinGroups::Response;
2
3	1			1		28006	use 5.006;
	1					6
	1					79
4	1			1		6	use strict;
	1					3
	1					52
5	1			1		6	use warnings FATAL => 'all';
	1					8
	1					66
6
7
8	1			1		6	use Carp;
	1					2
	1					111
9
10	1			1		1143	use Statistics::Reproducibility;
	1					29417
	1					43
11	1			1		1461	use Text::CSV;
	1					20175
	1					6
12	1			1		1252	use IO::File;
	1					11928
	1					142
13	1			1		9	use File::Path qw(make_path);
	1					1
	1					51
14	1			1		1688	use Math::SigFigs;
	1					2865
	1					9062
15
16							our $SigFigs = 3;
17
18							=head1 NAME
19
20							Bio::MaxQuant::ProteinGroups::Response - Analyze MQ proteinGroups for differential responses
21
22							=head1 VERSION
23
24							Version 0.04
25
26							=cut
27
28							our $VERSION = '0.04';
29
30
31							=head1 SYNOPSIS
32
33							This module is tailored for MaxQuant data, but could be applicable elsewhere.
34							The target experiment is one where several celltypes have been assayed for
35							responses to different conditions, e.g. cancer cell lines responding to
36							hormones and drugs. The module helps to analyse responses to the conditions
37							within each cell line and differences in those responses between cell lines.
38							Those differences in responses indicate that the proteins involved are markers
39							of the mechanism by which the cells differ in their response, and are therefore
40							not only good targets to exploit for biomarkers, but also for biological follow up.
41
42							use Bio::MaxQuant::ProteinGroups::Response;
43
44							my $resp = Bio::MaxQuant::ProteinGroups::Response->new(
45							filepath=>'proteinGroups.txt'
46							);
47
48							$resp->replicate_comparison(output_directory=>'./replicate_comparisons');
49							$resp->calculate_response_comparisons(output_directory=>'./responses');
50							$resp->calculate_differential_response_comparisons(output_directory=>'./differential_responses');
51
52							The data are output as tables in the directories. They are the printable tables
53							returned from Statistics::Reproducibility.
54
55							=head1 SUBROUTINES/METHODS
56
57							=head2 new
58
59							creates a new ProteinGroups object.
60
61							Options:
62							filepath - path to the file! default is proteinGroups.txt
63							separator - NOT table separator! This is the separator
64							used in the experiment name to separate cellline from
65							condition from replicate. Default is full stop (period)
66							rseparator - used for separating the compared cells/conditions.
67							the default is hyphen (-)
68							replicate_indicator - used in differential response comparisons
69							to indicate which cell the individual replicates were compared
70							(with the median of the other cell)
71
72							=cut
73
74							sub new {
75	0			0	1		my $p = shift;
76	0		0				my $c = ref($p) \|\| $p;
77	0						my %defaults = (
78							filepath => 'proteinGroups.txt',
79							separator => '.',
80							rseparator => '-',
81							replicate_indicator => '•',
82							resultsfile => '',
83							);
84	0						my %opts = (%defaults, @_);
85
86	0						my $o = {%opts};
87	0						bless $o, $c;
88
89	0	0					my $io = IO::File->new($opts{filepath}, 'r')
90							or die "Could not read $opts{filepath}: $!";
91	0						my $csv = Text::CSV->new({sep_char=>"\t"});
92	0						my $colref = $csv->getline($io);
93	0						$csv->column_names (@$colref);
94
95	0						$o->{csv} = $csv;
96	0						$o->{io} = $io;
97	0						$o->{header} = $colref;
98	0						$o->{median_exclude} = [];
99	0						return $o;
100							}
101
102							=head2 resultsfile
103
104							returns a handle to the results file, ready for writing.
105
106							this is not called until processing starts, but when it is
107							it will clobber the old file.
108
109							=cut
110
111							sub resultsfile {
112	0			0	1		my $o = shift;
113	0	0					return unless $o->{resultsfile};
114	0	0					return $o->{resultsfile_io} if exists $o->{resultsfile_io};
115
116	0	0					$o->{resultsfile_io} = IO::File->new($o->{resultsfile},'w')
117							or die "Could not write $o->{resultsfile}: $!";
118
119	0						return $o->{resultsfile_io};
120							}
121
122							=head2 experiments
123
124							Returns the list of experiments in the file as a hash.
125							Keys are names, values are listrefs of cellline,condition,replicate.
126							Caches! So once called, it will not re-read the file
127							unless/until you delete $o->{experiments}
128
129							Also populates celllines, conditions and replicates lists, which are
130							accessible by their own accessors.
131
132							=cut
133
134							sub experiments {
135	0			0	1		my $o = shift;
136							# figure out experiment, unless already done...
137	0	0					if(exists $o->{experiments}){
138	0						return %{$o->{experiments}};
	0
139							}
140	0						my @header = @{$o->{header}};
	0
141	0						my %celllines = ();
142	0						my %conditions = ();
143	0						my %replicates = ();
144	0						my %condition_replicates = ();
145	0						my %expts = ();
146	0						foreach (@header){
147	0	0					next unless /^Experiment\s(\S+)$/;
148	0						my $expt = $1;
149	0						my ($cell, $cond, $repl) = $o->parse_experiment_name($expt);
150	0	0	0				return carp "bad experiment name format $_" unless
			0
151							(defined $cell && defined $cond && defined $repl);
152	0						$expts{$expt} = [$cell, $cond, $repl];
153	0						$celllines{$cell} = 1;
154	0						$conditions{$cond} = 1;
155	0						$replicates{$repl} = 1;
156	0						my $cc = $cell . $o->{separator} . $cond;
157	0	0					$condition_replicates{$cc} = [] unless exists $condition_replicates{$cc};
158	0						push @{$condition_replicates{$cc}}, $expt;
	0
159							}
160	0						$o->{experiments} = \%expts;
161	0						$o->{celllines} = [keys %celllines];
162	0						$o->{conditions} = [keys %conditions];
163	0						$o->{condition_replicates} = {%condition_replicates};
164	0						$o->{replicates} = [keys %replicates];
165	0						return %expts;
166							}
167
168							=head2 quickNormalize
169
170							TO BE REMOVED
171
172							Does a quick normalization of ALL the input columns. They are each normalized
173							by their own median, and not directly to each other.
174
175							Two options are available:
176
177							select => [list of indices]
178							exclude => [list of indices]
179
180							Select allows to choose a particular subset of rows on which to normalize, e.g. some
181							proteins you know don't change.
182							Exclude allows to choose a particular subset of rows to exclude from the
183							normalization, e.g. contaminants.
184
185
186							sub quickNormalize {
187							my ($o,%opts) = @_;
188							my $d = $o->data;
189							my $n = $o->{n};
190							my @I = (0..$n-1);
191							if($opts{exclude}){
192							my %I;
193							@I{@I} = @I;
194							delete $I{$_} foreach @{$opts{exclude}};
195							@I = sort {$a <=> $b} keys %I;
196							}
197							if($opts{select}){
198							@I = @{$opts{select}};
199							}
200							$o->{quicknorm} = {
201							map {
202							my $med = median ((@{$d->{$_}})[@I]);
203							($_ => [map {/\d/ ? $_ - med : ''} @{$d->{$_}}])
204							}
205							keys %$d;
206							}
207							}
208
209							TO BE REMOVED
210
211							=cut
212
213
214
215							=head2 blankRows
216
217							Option: select (as for quickNormalize)
218
219							This allows blanking the data for a subset (e.g. contaminants) so that they do not
220							contribute to the statistics.
221
222
223
224							=cut
225
226							sub blankRows {
227	0			0	1		my ($o,%opts) = @_;
228	0						my $d = $o->data;
229	0						my $n = $o->{n};
230	0						my @I = @{$opts{select}};
	0
231	0						foreach my $k(keys %$d){
232	0						blankItems($d->{$k}, @I);
233							}
234							}
235
236							=head2 blankItems
237
238							helper function, accepts a listref and a list of indices to blank (set to '')
239							returns the listref for your convenience.
240
241
242							=cut
243
244							sub blankItems {
245	0			0	1		my ($listref,@I) = @_;
246	0						foreach my $i(@I){
247	0						$listref->[$i] = '';
248							}
249	0						return $listref;
250							}
251
252
253							=head2 celllines
254
255							Returns the list of cell lines. Ensures experiments() is called.
256
257							=cut
258
259							sub celllines {
260	0			0	1		my $o = shift;
261	0						$o->experiments; # just make sure it's been called!
262	0						return @{$o->{celllines}};
	0
263							}
264
265							=head2 conditions
266
267							Returns the list of conditions. Ensures experiments() is called.
268
269							=cut
270
271							sub conditions {
272	0			0	1		my $o = shift;
273	0						$o->experiments; # just make sure it's been called!
274	0						return @{$o->{conditions}};
	0
275							}
276
277							=head2 condition_replicates
278
279							Returns a hash of key=cellline+separator+condition, value=list of experiment names (one per replicate).
280							Ensures experiments() is called.
281
282							=cut
283
284							sub condition_replicates {
285	0			0	1		my $o = shift;
286	0						$o->experiments; # just make sure it's been called!
287	0						return %{$o->{condition_replicates}};
	0
288							}
289
290							=head2 replicates
291
292							Returns the list of replicates. Ensures experiments() is called.
293
294							=cut
295
296							sub replicates {
297	0			0	1		my $o = shift;
298	0						$o->experiments; # just make sure it's been called!
299	0						return @{$o->{replicates}};
	0
300							}
301
302							=head2 parse_experiment_name
303
304							Method to parse the experiment name.
305							Uses $o->{separator} to separate into 3 parts. Uses index and
306							substr, not regexes. Default separator is dot/fullstop/period "." .
307
308							=cut
309
310							sub parse_experiment_name {
311	0			0	1		my $o = shift;
312	0						my $expt = shift;
313	0						my $dot1 = index($expt, $o->{separator});
314	0						my $dot2 = index($expt, $o->{separator}, $dot1 + 1);
315	0						my $cell = substr($expt,0,$dot1);
316	0						my $cond = substr($expt,$dot1+1, $dot2-$dot1-1);
317	0						my $repl = substr($expt, $dot2+1);
318	0						return ($cell,$cond,$repl);
319							}
320
321							=head2 parse_response_name
322
323							Method to parse the response name.
324							Uses $o->{separator} to split off the cell line, then $o->{rseparator}
325							to split the two conditions. Uses index and substr, not regexes.
326							Default rseparator is hyphen "-", which should not be used in experiment names!
327
328							=cut
329
330							sub parse_response_name {
331	0			0	1		my $o = shift;
332	0						my $expt = shift;
333	0						my $dot1 = index($expt, $o->{separator});
334	0						my $dot2 = index($expt, $o->{rseparator}, $dot1 + 1);
335	0						my $cell = substr($expt,0,$dot1);
336	0						my $cond1 = substr($expt,$dot1+1, $dot2-$dot1-1);
337	0						my $cond2 = substr($expt, $dot2+1);
338	0						return ($cell,$cond1,$cond2);
339							}
340
341
342							=head2 replicate_comparison
343
344							Uses Statistics::Reproducibility to get normalized values and
345							metrics on each condition.
346
347							Caches!
348
349							=cut
350
351							sub replicate_comparison {
352	0			0	1		my $o = shift;
353	0						my %opts = (
354							output_directory => '',
355							@_
356							);
357	0	0					if($opts{output_directory}){
358	0	0					make_path($opts{output_directory}) unless -d $opts{output_directory};
359							}
360
361	0	0					if(exists $o->{replicate_comparison}){
362	0						return $o->{replicate_comparison};
363							}
364	0						my $data = $o->data;
365	0						my %cr = $o->condition_replicates;
366	0						$o->{replicate_comparison} = {};
367	0						my $depth = -1;
368	0						foreach my $cr(keys %cr){
369	0						print STDERR "Processing $cr...\n";
370	0						my @cols = @{$cr{$cr}};
	0
371	0						my @mydata = map {$data->{$_}} @cols;
	0
372	0						my $results = Statistics::Reproducibility
373							->new()
374							->data(@mydata)
375							->run()
376							->printableTable($depth);
377	0						$o->{replicate_comparison}->{$cr} = $results;
378
379	0						$o->dump_results_table('replicates', $cr, $results, \@cols);
380
381	0	0					if($opts{output_directory}){
382	0	0					my $fo = new IO::File($opts{output_directory}.'/'.$cr.'.txt', 'w')
383							or die "Could not write $opts{output_directory}/$cr.txt: $!";
384	0						print STDERR "Writing $opts{output_directory}/$cr.txt...\n";
385	0						print $fo join("\t", @{$results->[0]})."\n";
	0
386	0						my $table_length = 0;
387	0	0					foreach (@$results){ $table_length = @$_ if @$_ > $table_length; }
	0
388	0						foreach my $i(0..$table_length-1){
389	0	0					print $fo join("\t", map {
390	0						defined $results->[$_]->[$i]
391							? sigfigs($results->[$_]->[$i])
392							: ''
393							} (1..$#$results)
394							)."\n";
395							}
396	0						close($fo);
397							}
398							}
399	0						return $o->{replicate_comparison};
400							}
401
402
403							=head2 response_comparisons
404
405							Returns the list of comparisons that can be made between conditions
406							within each cell line, given the replicates available.
407
408							At least 2 replicates must be available for a comparison to be made.
409
410							Caches.
411
412							=cut
413
414							sub response_comparisons {
415	0			0	1		my $o = shift;
416	0	0					if(exists $o->{response_comparisons}){
417	0						return %{$o->{response_comparisons}};
	0
418							}
419	0						my %expts = $o->experiments;
420	0						my @expts = sort keys %expts;
421	0						my $sep = $o->{separator};
422	0						my $rsep = $o->{rseparator};
423	0						my %comparisons = ();
424	0						foreach my $i(0..$#expts-1){
425	0						my $e1 = $expts[$i];
426	0						my ($cell1,$cond1,$repl1) = @{$expts{$e1}};
	0
427	0						foreach my $j($i+1..$#expts){
428	0						my $e2 = $expts[$j];
429	0						my ($cell2,$cond2,$repl2) = @{$expts{$e2}};
	0
430							# we want same cell line
431	0	0					next unless $cell2 eq $cell1;
432							# and different condition
433	0	0					next if $cond2 eq $cond1;
434	0						my $comp_key = "$cell1$sep$cond1$rsep$cond2";
435							# store them in a useful way...
436	0	0					$comparisons{$comp_key} = {$cond1=>{},$cond2=>{}}
437							unless defined $comparisons{$comp_key};
438	0						$comparisons{$comp_key}->{$cond2}->{$e2} = "$cell1$sep$cond1$rsep$cond2$repl2";
439	0						$comparisons{$comp_key}->{$cond1}->{$e1} = "$cell1$sep$cond1$repl1$rsep$cond2";
440							}
441							}
442	0						$o->{response_comparisons} = \%comparisons;
443	0						return %comparisons;
444							}
445
446
447							=head2 cell_comparisons
448
449							Returns the list of comparisons that can be made between cells
450							within each condition, given the replicates available.
451
452							At least 2 replicates must be available for a comparison to be made.
453
454							Caches.
455
456							=cut
457
458							sub cell_comparisons {
459	0			0	1		my $o = shift;
460	0	0					if(exists $o->{cell_comparisons}){
461	0						return %{$o->{cell_comparisons}};
	0
462							}
463	0						my %expts = $o->experiments;
464	0						my @expts = sort keys %expts;
465	0						my $sep = $o->{separator};
466	0						my $rsep = $o->{rseparator};
467	0						my %comparisons = ();
468	0						foreach my $i(0..$#expts-1){
469	0						my $e1 = $expts[$i];
470	0						my ($cell1,$cond1,$repl1) = @{$expts{$e1}};
	0
471	0						foreach my $j($i+1..$#expts){
472	0						my $e2 = $expts[$j];
473	0						my ($cell2,$cond2,$repl2) = @{$expts{$e2}};
	0
474							# we want same condition
475	0	0					next unless $cond1 eq $cond1;
476							# and different cell line
477	0	0					next if $cell1 eq $cell2;
478	0						my $comp_key = "$cell1$rsep$cell2$sep$cond2";
479							# store them in a useful way...
480	0	0					$comparisons{$comp_key} = {$cell1=>{},$cell2=>{}}
481							unless defined $comparisons{$comp_key};
482	0						$comparisons{$comp_key}->{$cell2}->{$e2} = "$cell1$rsep$cell2$repl2$sep$cond1";
483	0						$comparisons{$comp_key}->{$cell1}->{$e1} = "$cell1$repl1$rsep$cell2$sep$cond1";
484							}
485							}
486	0						$o->{cell_comparisons} = \%comparisons;
487	0						return %comparisons;
488							}
489
490							=head2 differential_response_comparisons
491
492							Returns the list of comparisons that can be made between cell line
493							responses to each condition.
494
495							Caches.
496
497							=cut
498
499							sub differential_response_comparisons {
500	0			0	1		my $o = shift;
501	0	0					if(exists $o->{differential_response_comparisons}){
502	0						return %{$o->{differential_response_comparisons}};
	0
503							}
504	0						my %rcs = $o->response_comparisons;
505	0						my @rcs = sort keys %rcs;
506	0						my %comparisons = ();
507	0						foreach my $i(0..$#rcs-1){
508	0						my $rc1 = $rcs[$i];
509	0						my ($cell1, $cond1_1,$cond1_2) = $o->parse_response_name($rc1);
510	0						foreach my $j($i+1..$#rcs){
511	0						my $rc2 = $rcs[$j];
512	0						my ($cell2, $cond2_1,$cond2_2) = $o->parse_response_name($rc2);
513	0	0	0				next unless ($cond1_1 eq $cond2_1 && $cond1_2 eq $cond2_2)
			0
			0
514							\|\| ($cond1_1 eq $cond2_2 && $cond1_2 eq $cond2_1);
515	0						my $key = $cell1 . $o->{rseparator} . $cell2
516							. $o->{separator} . $cond1_1 . $o->{rseparator} . $cond1_2;
517	0						$comparisons{$key} = {$rc1=>$cell1, $rc2=>$cell2};
518							}
519							}
520	0						$o->{differential_response_comparisons} = \%comparisons;
521	0						return %comparisons;
522							}
523
524							=head2 data
525
526							Reads in all the protein ratios from the proteinGroups file.
527							Also reads other identifying information, such as id and Leading
528							Proteins. Reads each non-normalized ratio column into a list and
529							stores them in a hash by experiment name.
530
531							=cut
532
533							sub data {
534	0			0	1		my $o = shift;
535	0	0					if(exists $o->{data}){
536	0						return $o->{data};
537							}
538	0						my ($csv,$io) = map {$o->{$_}} qw/csv io/;
	0
539	0						my %expts = $o->experiments;
540	0						my @expts = sort keys %expts;
541	0						$o->{data} = {map {($_=>[])} @expts};
	0
542	0						seek($io,0,0);
543	0						$csv->getline($io); # make sure for sure we're at start of data
544	0						my $size = (stat $o->{filepath})[7];
545	0						my $count = 0;
546	0						while(! eof($io)){
547	0						my $hr = $csv->getline_hr($io);
548	0						foreach(@expts){
549	0						push @{$o->{data}->{$_}}, $o->datum($hr->{"Ratio H/L $_"});
	0
550							}
551	0						my $pos = tell($io);
552	0						print STDERR "\r$pos/$size";
553	0						$count ++;
554							}
555	0						print STDERR "\n";
556	0						$o->{n} = $count;
557	0						return $o->{data};
558							}
559
560							=head2 datum
561
562							Converts one datum into a logged ratio or an empty string, depending.
563
564							=cut
565
566							sub datum {
567	0			0	1		my ($o,$d) = @_;
568	0	0					if($d =~ /\d/){
569	0						return log($d)/log(2);
570							}
571							else {
572	0						return '';
573							}
574							}
575
576							=head2 calculate_response_comparisons
577
578							calculates the differences between conditions in a cell type.
579							outputs a bunch of files. You can specify the directory with
580							output_directory option.
581
582							=cut
583
584							sub calculate_response_comparisons {
585	0			0	1		my $o = shift;
586	0						my %opts = (
587							output_directory => '',
588							@_);
589
590	0	0					if($opts{output_directory}){
591	0	0					make_path($opts{output_directory}) unless -d $opts{output_directory};
592							}
593	0						my %rcs = $o->response_comparisons;
594	0						my @rcs = sort keys %rcs;
595							# so, here for this protein, we calculate the comparisons
596							# for everything... first we need to log, and then subtract...
597							# this does mean that we need to normalize here independent
598							# of the Statistics::Reproducibility thing (or hijack it)
599
600	0						my %cfmedians = ();
601	0						my %comparisons = ();
602
603	0						foreach my $cf(@rcs){ # each comparison
604	0						my ($cell,@crap) = $o->parse_experiment_name($cf);
605	0						my @conds = sort keys %{$rcs{$cf}};
	0
606	0	0					die "not two conditions!" unless @conds == 2;
607	0						my ($cond1,$cond2) = @conds;
608	0						my %counterpart = ($cond1=>$cond2, $cond2=>$cond1);
609							# we will calculate condition replicate minus counterpart median
610	0						my %sign = ($cond1=>1, $cond2=>-1);
611
612	0						my %medians = $o->medians;
613	0						my %reps1 = %{$rcs{$cf}->{$cond1}};
	0
614	0						my %reps2 = %{$rcs{$cf}->{$cond2}};
	0
615	0						my @column_names = sort((values %reps1),(values %reps2));
616	0						my %columns = map {($_=>[])} (@column_names);
	0
617
618	0						$cfmedians{$cf} = [map {[]} 1..$o->{n}];
	0
619
620	0						my $data = $o->{normalized};
621							# we'll take the median of each protein here
622
623	0						my $sign = 0;
624	0						foreach my $cond(sort keys %{$rcs{$cf}}){ # each of the two conditions... sorted by name
	0
625	0						my $sign = $sign{$cond};
626	0						my $counterpart = $counterpart{$cond};
627	0						my $sep = $o->{separator};
628							#my $cc = "$cell$sep$cond";
629	0						my $ccc = "$cell$sep$counterpart";
630							#print STDERR "$cc : $ccc : \n";
631	0						my %reps = %{$rcs{$cf}->{$cond}}; # these are the replicates in this condition
	0
632	0						foreach my $r(sort keys %reps){ # replicates
633	0						my $key = $reps{$r};
634							#print STDERR " : $r : $key \n";
635	0						foreach my $i(0..$o->{n}-1){ # each protein... check enough data
636	0	0	0				if(
637							#defined $data->{$r}->[$i] &&
638							$data->{$r}->[$i] ne ''
639							#&& defined $medians{$ccc}->[$i]
640							&& $medians{$ccc}->[$i] ne ''){
641							# now these are sorted, so we do $cond-$replicate for
642	0						my $value =
643							$sign * ($data->{$r}->[$i] - $medians{$ccc}->[$i]);
644	0						push @{$columns{$key}}, $value;
	0
645							# collect the values to make medians later...
646	0						push @{$cfmedians{$cf}->[$i]}, $value;
	0
647							}
648							else {
649	0						push @{$columns{$key}}, '';
	0
650							}
651							}
652							}
653							}
654							#
655	0						foreach my $i(0..$o->{n}-1){ # each protein... check enough data
656	0	0					if(@{$cfmedians{$cf}->[$i]} < 2){
	0
657	0						$cfmedians{$cf}->[$i] = '';
658							}
659							else {
660	0						$cfmedians{$cf}->[$i] = median(@{$cfmedians{$cf}->[$i]});
	0
661							}
662							}
663	0						$o->{response_comparison_medians} = \%cfmedians;
664							#
665	0						%comparisons = (%comparisons, %columns);
666
667	0						print STDERR "Processing $cf...\n";
668	0						my @mydata = map {$columns{$_}} @column_names;
	0
669							#print Dumper @mydata;
670	0						my $depth = -1;
671	0						my $results = Statistics::Reproducibility
672							->new()
673							->data(@mydata)
674							->run()
675							->printableTable($depth);
676
677							#$o->{replicate_comparison}->{$cf} = $results;
678	0						$o->dump_results_table('responses', $cf, $results, \@column_names);
679
680	0	0					if($opts{output_directory}){
681	0	0					my $fo = IO::File->new("$opts{output_directory}/$cf.txt",'w')
682							or die "Could not write $opts{output_directory}/$cf.txt: $!";
683	0						print STDERR "Writing $opts{output_directory}/$cf.txt...\n";
684	0						print $fo join("\t", @{$results->[0]})."\n";
	0
685	0						my $table_length = 0;
686	0	0					foreach (@$results){ $table_length = @$_ if @$_ > $table_length; }
	0
687	0						foreach my $i(0..$table_length-1){
688	0	0					print $fo join("\t", map {
689	0						defined $results->[$_]->[$i]
690							? sigfigs($results->[$_]->[$i])
691							: ''
692							} (1..$#$results)
693							)."\n";
694							}
695	0						close($fo);
696							}
697							}
698	0						$o->{response_comparison_results} = \%comparisons;
699
700							}
701
702							=head2 calculate_cell_comparisons
703
704							calculates the differences between cell types in a condition.
705							outputs a bunch of files. You can specify the directory with
706							output_directory option.
707
708							=cut
709
710							sub calculate_cell_comparisons {
711	0			0	1		my $o = shift;
712	0						my %opts = (
713							output_directory => '',
714							@_);
715
716	0	0					if($opts{output_directory}){
717	0	0					make_path($opts{output_directory}) unless -d $opts{output_directory};
718							}
719	0						my %rcs = $o->cell_comparisons;
720	0						my @rcs = sort keys %rcs;
721
722
723							# so, here for this protein, we calculate the comparisons
724							# for everything... first we need to log, and then subtract...
725							# this does mean that we need to normalize here independent
726							# of the Statistics::Reproducibility thing (or hijack it)
727
728	0						my %cfmedians = ();
729	0						my %comparisons = ();
730
731	0						foreach my $cf(@rcs){ # each comparison
732	0						my ($cell,$cond,$rep) = $o->parse_experiment_name($cf.'.');
733
734	0						my @cells = sort keys %{$rcs{$cf}};
	0
735
736	0	0					die "not two cells!" unless @cells == 2;
737	0						my ($cell1,$cell2) = @cells;
738	0						my %counterpart = ($cell1=>$cell2, $cell2=>$cell1);
739							# we will calculate cell replicate minus counterpart median
740	0						my %sign = ($cell1=>1, $cell2=>-1);
741
742	0						my %medians = $o->medians;
743	0						my %reps1 = %{$rcs{$cf}->{$cell1}};
	0
744	0						my %reps2 = %{$rcs{$cf}->{$cell2}};
	0
745	0						my @column_names = sort((values %reps1),(values %reps2));
746	0						my %columns = map {($_=>[])} (@column_names);
	0
747
748	0						$cfmedians{$cf} = [map {[]} 1..$o->{n}];
	0
749
750	0						my $data = $o->{normalized};
751							# we'll take the median of each protein here
752
753	0						my $sign = 0;
754	0						foreach my $cell(sort keys %{$rcs{$cf}}){ # each of the two cells... sorted by name
	0
755	0						my $sign = $sign{$cell};
756	0						my $counterpart = $counterpart{$cell};
757	0						my $sep = $o->{separator};
758							#my $cc = "$cell$sep$cond";
759	0						my $ccc = "$counterpart$sep$cond";
760							#print STDERR "$cc : $ccc : \n";
761	0						my %reps = %{$rcs{$cf}->{$cell}}; # these are the replicates in this cell
	0
762	0						foreach my $r(sort keys %reps){ # replicates
763	0						my $key = $reps{$r};
764							#print STDERR " : $r : $key \n";
765	0						foreach my $i(0..$o->{n}-1){ # each protein... check enough data
766	0	0	0				if(
			0
			0
767							defined $data->{$r}->[$i] &&
768							$data->{$r}->[$i] ne ''
769							&& defined $medians{$ccc}->[$i]
770							&& $medians{$ccc}->[$i] ne ''){
771							# now these are sorted, so we do $cond-$replicate for
772	0						my $value =
773							$sign * ($data->{$r}->[$i] - $medians{$ccc}->[$i]);
774	0						push @{$columns{$key}}, $value;
	0
775							# collect the values to make medians later...
776	0						push @{$cfmedians{$cf}->[$i]}, $value;
	0
777							}
778							else {
779	0						push @{$columns{$key}}, '';
	0
780							}
781							}
782							}
783							}
784							#
785	0						foreach my $i(0..$o->{n}-1){ # each protein... check enough data
786	0	0					if(@{$cfmedians{$cf}->[$i]} < 2){
	0
787	0						$cfmedians{$cf}->[$i] = '';
788							}
789							else {
790	0						$cfmedians{$cf}->[$i] = median(@{$cfmedians{$cf}->[$i]});
	0
791							}
792							}
793	0						$o->{cell_comparison_medians} = \%cfmedians;
794							#
795	0						%comparisons = (%comparisons, %columns);
796
797	0						print STDERR "Processing $cf...\n";
798	0						my @mydata = map {$columns{$_}} @column_names;
	0
799							#print Dumper @mydata;
800	0						my $depth = -1;
801	0						my $results = Statistics::Reproducibility
802							->new()
803							->data(@mydata)
804							->run()
805							->printableTable($depth);
806
807							#$o->{replicate_comparison}->{$cf} = $results;
808	0						$o->dump_results_table('celldiffs', $cf, $results, \@column_names);
809
810	0	0					if($opts{output_directory}){
811	0	0					my $fo = IO::File->new("$opts{output_directory}/$cf.txt",'w')
812							or die "Could not write $opts{output_directory}/$cf.txt: $!";
813	0						print STDERR "Writing $opts{output_directory}/$cf.txt...\n";
814	0						print $fo join("\t", @{$results->[0]})."\n";
	0
815	0						my $table_length = 0;
816	0	0					foreach (@$results){ $table_length = @$_ if @$_ > $table_length; }
	0
817	0						foreach my $i(0..$table_length-1){
818	0	0					print $fo join("\t", map {
819	0						defined $results->[$_]->[$i]
820							? sigfigs($results->[$_]->[$i])
821							: ''
822							} (1..$#$results)
823							)."\n";
824							}
825	0						close($fo);
826							}
827							}
828	0						$o->{cell_comparison_results} = \%comparisons;
829
830							}
831
832
833							=head2 sigfigs
834
835							Helper function
836							Tries FormatSigFigs($_[0],$SigFigs), but only if $_[0] actually looks like a number!
837							$SigFigs is a global in this module and is set to 3.
838
839							=cut
840
841							sub sigfigs {
842	0			0	1		my $x = shift;
843	0	0					if($x =~ /^[-\.\d]+$/){
844	0	0					if($x<1000){
845	0						return FormatSigFigs($x,$SigFigs);
846							}
847							else {
848	0						return int($x);
849							}
850							}
851							else {
852	0						return $x;
853							}
854							}
855
856							=head2 calculate_differential_response_comparisons
857
858
859
860							=cut
861
862							sub calculate_differential_response_comparisons {
863	0			0	1		my $o = shift;
864	0						my %opts = (
865							output_directory => '',
866							@_);
867
868	0	0					if($opts{output_directory}){
869	0	0					make_path($opts{output_directory}) unless -d $opts{output_directory};
870							}
871	0						my %rcs = $o->response_comparisons;
872	0						my %drcs = $o->differential_response_comparisons;
873	0						my %rcms = %{$o->{response_comparison_medians}};
	0
874	0						my @rcs = sort keys %rcs;
875	0						my @drcs = sort keys %drcs;
876							# so, here for this protein, we calculate the comparisons
877							# for everything... first we need to log, and then subtract...
878							# this does mean that we need to normalize here independent
879							# of the Statistics::Reproducibility thing (or hijack it)
880
881							# and now the next bit... :-S
882
883							# here we need to get use the response comparisons, and so need to
884							# look up the keys in %rcs.
885
886	0						my %response_comparison_results = %{$o->{response_comparison_results}};
	0
887	0						my %response_comparison_medians = %{$o->{response_comparison_medians}};
	0
888
889	0						my $sep = $o->{separator};
890	0						my $rsep = $o->{rseparator};
891
892	0						my %differentials = ();
893
894	0						foreach my $cf(@drcs){ # each comparison
895	0						my $rsepi = index($cf,$rsep);
896	0						my $sepi = index($cf,$sep);
897	0						my $cell1 = substr($cf,0,$rsepi);
898	0						my $cell2 = substr($cf,$rsepi+1,$sepi-$rsepi-1);
899	0						my @cells = ($cell1,$cell2);
900	0						my %counterpart = ($cell1=>$cell2, $cell2=>$cell1);
901	0						my %sign = ($cell1 => 1, $cell2 => -1);
902	0						my $repind = $o->{replicate_indicator};
903	0						my %key = ($cell1 => "$cell1$repind$rsep$cell2", $cell2 => "$cell1$rsep$cell2$repind");
904	0						my $condcomp = substr($cf,$sepi+1);
905
906	0						my @keys = ();
907
908	0						foreach my $cell(@cells){
909	0						my $counterpart = $counterpart{$cell};
910	0						my $sign = $sign{$cell};
911	0						my $cellcomp = "$cell$sep$condcomp";
912	0						my $countercomp = "$counterpart$sep$condcomp";
913	0						my @rcs = map {values %$_} values %{$rcs{$cellcomp}};
	0
	0
914	0						my $cellkey = $key{$cell};
915							# always cell1 - cell2, let's do reps - median
916	0						foreach my $rc(@rcs){
917	0						my $repcombo = substr($rc, index($rc,$sep)+1);
918	0						my $key = "$cellkey$sep$repcombo";
919	0						push @keys, $key;
920	0						$differentials{$key} = [];
921	0						foreach my $i(0..$o->{n}-1){
922	0						my $cell_replicate = $o->{response_comparison_results}->{$rc}->[$i];
923	0						my $counter_median = $o->{response_comparison_medians}->{$countercomp}->[$i];
924	0						my $value = '';
925	0	0	0				if($cell_replicate ne '' && $counter_median ne ''){
926	0						$value = ($cell_replicate - $counter_median) * $sign;
927							}
928	0						push @{$differentials{$key}}, $value;
	0
929							}
930							}
931							}
932	0						@keys = sort @keys;
933
934	0						print STDERR "Processing $cf\n";
935	0						my @mydata = map {$differentials{$_}} @keys;
	0
936
937	0						my $depth = -1;
938	0						my $results = Statistics::Reproducibility
939							->new()
940							->data(@mydata)
941							->run()
942							->printableTable($depth);
943
944							#$o->{replicate_comparison}->{$cf} = $results;
945	0						$o->dump_results_table('differential_responses', $cf, $results, \@keys);
946
947	0	0					if($opts{output_directory}){
948	0	0					my $fo = IO::File->new("$opts{output_directory}/$cf.txt",'w')
949							or die "Could not write $opts{output_directory}/$cf.txt: $!";
950	0						print STDERR "Writing $opts{output_directory}/$cf.txt...\n";
951	0						print $fo join("\t", @{$results->[0]})."\n";
	0
952	0						my $table_length = 0;
953	0	0					foreach (@$results){ $table_length = @$_ if @$_ > $table_length; }
	0
954	0						foreach my $i(0..$table_length-1){
955	0	0					print $fo join("\t", map {
956	0						defined $results->[$_]->[$i]
957							? sigfigs($results->[$_]->[$i])
958							: ''
959							} (1..$#$results)
960							)."\n";
961							}
962	0						close($fo);
963							}
964							}
965							}
966
967							=head2 medians
968
969							calculates the medians for all replicate sets and stores them in
970							$o->{medians}
971
972							=cut
973
# Returns, as a flat key/value list, the per-condition medians of the
# median-subtracted data.  The result is cached in $o->{medians}; a second
# call returns the cached table without recomputing anything.  The normalized
# columns are stored in $o->{normalized} as a side effect.
#
# Options (name => value pairs following the object):
#   exclude          => array ref of indices passed to blankItems() for every
#                       data column (default: $o->{median_exclude})
#   output_directory => when true, the directory is created if missing and
#                       normalized.txt / medians.txt are written into it
#
# When $o->resultsfile is true, both tables are additionally written through
# put_resultsfile_hashtable().
sub medians {
    # this function has been manually verified

    my $o = shift;

    # Already computed: hand back the cached table.
    if (exists $o->{medians}) {
        return %{ $o->{medians} };
    }

    my %opts = (
        exclude          => $o->{median_exclude},
        output_directory => '',
        @_,
    );
    my $outdir = $opts{output_directory};
    if ($outdir && !-d $outdir) {
        make_path($outdir);
    }

    # exclude => [indices]
    my @excluded = @{ $opts{exclude} };

    my $data          = $o->data;
    my %replicates_of = $o->condition_replicates;

    my @columns = sort keys %{$data};
    my @blanked = map { blankItems([ @{ $data->{$_} } ], @excluded) } @columns;

    # here we have to do subtract medians...
    my $depth = -1;
    my $table = Statistics::Reproducibility
        ->new()
        ->data(@blanked)
        ->subtractMedian()
        ->printableTable($depth);

    # The normalized columns are taken from positions 3 .. k+2 of the
    # printable table (original author's note: NEED TO SORT THIS OUT!).
    my $column_count = scalar @columns;
    my %normalized;
    @normalized{@columns} = (@{$table})[3 .. $column_count + 2];
    $o->{normalized} = \%normalized;

    # One median per condition and row, taken across that condition's
    # replicate columns.
    my %medians;
    for my $cond (keys %replicates_of) {
        for my $i (0 .. $o->{n} - 1) {
            my @repkeys = @{ $replicates_of{$cond} };
            $medians{$cond}->[$i]
                = median(map { $normalized{$_}->[$i] } @repkeys);
        }
    }
    $o->{medians} = \%medians;

    if ($outdir) {
        print "Outputting to $outdir...\n";
        dumpHashtable("$outdir/normalized.txt", $o->{normalized});
        dumpHashtable("$outdir/medians.txt",    $o->{medians});
        print "Done\n";
    }
    if ($o->resultsfile) {
        print "Outputting to $o->{resultsfile}...\n";
        $o->put_resultsfile_hashtable('normalized', 'normalized', $o->{normalized});
        $o->put_resultsfile_hashtable('normalized', 'medians',    $o->{medians});
        print "Done\n";
    }

    return %medians;
}
1035
1036							=head2 put_resultsfile_hashtable
1037
1038							a method called by medians() if resultsfile was defined. Calls put_resultsfile with
1039							some medians and normalized data.
1040
1041							=cut
1042
# Writes a hash of lists to the results file via put_resultsfile(), one row
# per key in sorted key order.
#
#   $section    - section label placed in the row name (s:)
#   $derivation - used for both the d: and t: fields of the row name
#   $ht         - hash ref mapping key => array ref of values
#
# The n: field is the first two parts returned by parse_experiment_name(),
# joined with $o->{separator}, when the second part is true; otherwise the
# key itself.  Every value is passed through sigfigs().
sub put_resultsfile_hashtable {
    my ($o, $section, $derivation, $ht) = @_;

    # HoL
    my @rows;
    for my $key (sort keys %{$ht}) {
        my @parts = $o->parse_experiment_name($key);

        my $label = $key;
        if ($parts[1]) {
            $label = join($o->{separator}, @parts[0, 1]);
        }

        push @rows, [
            "n/s:$section/n:$label/d:$derivation/k:$key/t:$derivation/",
            map { sigfigs($_) } @{ $ht->{$key} },
        ];
    }

    return $o->put_resultsfile(\@rows);
}
1061
1062
1063							=head2 dumpHashtable
1064
1065							helper function that dumps a HoL as a tab delimited table.
1066
1067							=cut
1068
# Writes a hash of lists to $fn as a tab-delimited table: a header line of
# the sorted keys, then one line per row index up to the longest list.
# Every cell is passed through sigfigs().
#
#   $fn  - path of the file to (over)write
#   $hol - hash ref mapping column name => array ref of values
#
# Dies if the file cannot be opened, or if closing it fails (buffered write
# errors only surface at close, so the close is checked explicitly).
sub dumpHashtable {
    my ($fn, $hol) = @_;
    my $io = IO::File->new($fn, 'w') or die "Could not write $fn: $!";

    my @h = sort keys %$hol;

    # The longest column decides how many rows are written.
    my $longest = 0;
    foreach my $key (@h) {
        my $length = scalar @{ $hol->{$key} };
        $longest = $length if $length > $longest;
    }

    print {$io} join("\t", @h) . "\n";
    foreach my $i (0 .. $longest - 1) {
        print {$io} join("\t", map { sigfigs($hol->{$_}->[$i]) } @h) . "\n";
    }

    $io->close or die "Could not finish writing $fn: $!";
    return;
}
1083
1084
1085							=head2 median
1086
1087							helper function that does a simple median calculation
1088
1089							=cut
1090
# Returns the median of the numeric values in @_, or '' when fewer than two
# usable values remain (a minimum of two is required).
#
# A value is used only if it is defined, contains a digit and looks like a
# number.  Blanks, undef and strings such as "n/a" or "1,5" are skipped, so
# they neither trigger numeric warnings (fatal under this file's
# "use warnings FATAL => 'all'") nor distort the sort order.
sub median {
    require Scalar::Util;

    my @x = sort { $a <=> $b }
            grep { defined($_) && /\d/ && Scalar::Util::looks_like_number($_) }
            @_;

    return '' if scalar(@x) < 2; # minimum is 2!

    my $mid = int(scalar(@x) / 2);

    # odd count: the single middle element
    return $x[$mid] if @x % 2;

    # even count: mean of the two middle elements
    return $x[$mid - 1] / 2 + $x[$mid] / 2;
}
1102
1103
1104							=head2 put_resultsfile
1105
1106							takes a list of lists (ref) and outputs it directly to $o->{resultsfile}.
1107							This is an alternative or addition to the output_file options
1108							available for some methods, and is called by dump_results_table
1109							and others throughout processing.
1110
1111							=cut
1112
# Prints a list of lists to the handle returned by $o->resultsfile, one
# tab-joined line per inner list.  Does nothing when that handle is false.
#
#   $table - array ref of array refs (rows)
sub put_resultsfile {
    my ($o, $table) = @_;
    my $out = $o->resultsfile;
    if ($out) {
        my @lines = map { join("\t", @{$_}) . "\n" } @{$table};
        print {$out} @lines;
    }
}
1120
1121							=head2 dump_results_table
1122
1123							Dumps a results table to a file ($o->{complete_results_file})
1124							for later use.
1125
1126							=cut
1127
# Relabels a results table with translate_results_table() and hands the
# resulting rows to put_resultsfile().
#
#   $section, $name - labels embedded in every row name
#   $data           - the table (array ref) to translate
#   $keys           - array ref of names for the numbered columns
sub dump_results_table {
    my ($o, $section, $name, $data, $keys) = @_;
    return $o->put_resultsfile(
        [ translate_results_table($section, $name, $data, $keys) ]
    );
}
1133
1134							=head2 translate_results_table
1135
1136							helper function that separates out and better labels the different results from
1137							Statistics::Reproducibility
1138
1139							=cut
1140
# Turns a printable results table into a list of labelled rows.
#
#   $section, $name - labels embedded in every generated row name
#   $table          - array ref; element 0 is the list of column headers and
#                     element i+1 is the column belonging to header i
#   $keys           - array ref of names; "Column N" is labelled $keys->[N-1]
#
# Returns a list of array refs, each being [ row name, values... ].
#
# headers we get are:
#   Column x                (x is any number)
#   Regression, M, C        (a list for columns)
#   Statistic, Value        (a list for set of columns)
#   DerivedFrom             (how the columns on the left were derived from
#                            those on the right)
#
# The headers are walked from right to left so that each DerivedFrom marker
# is seen before the columns to its left that it describes.
sub translate_results_table {
    my ($section, $name, $table, $keys) = @_;

    my @header        = @{ $table->[0] };
    my $DerivedFrom   = 'source';
    my $flag          = '';
    my %compareFinder = ();   # column number => index of its row in @results
    my @results       = ();

    for (my $i = $#header; $i >= 0; $i--) {
        my $index = $i + 1;             # header i describes table column i+1
        my $row   = $table->[$index];
        my $h     = $header[$i];

        if ($h =~ /^Column\s(\d+)$/) {
            my $c = $1;
            my $newname = "n/s:$section/n:$name/d:$DerivedFrom/k:"
                        . $keys->[$c - 1] . "/t:data/";
            push @results, [$newname, map { sigfigs($_) } @$row];
            $compareFinder{$c} = $#results;
        }
        elsif ($h eq 'DerivedFrom') {
            $DerivedFrom = $row->[0];
            # rows derived by rotation to the regression line get a '*' flag
            $flag = $DerivedFrom eq 'rotateToRegressionLine' ? '*' : '';
        }
        elsif ($h =~ /Regression/) {
            # the M and C values sit in the two columns to the right
            my $M = $table->[$index + 1];
            my $C = $table->[$index + 2];
            foreach my $k (0 .. $#$row) {
                my ($col, $m, $c) = map { $_->[$k] } ($row, $M, $C);
                if ($col =~ /^Column\s(\d+)$/) {
                    my $j = $1 - 1;
                    my $newname = "1/s:$section/n:$name/d:$DerivedFrom/k:"
                                . $keys->[$j];
                    push @results, ["$newname/t:M/", sigfigs($m)];
                    push @results, ["$newname/t:C/", sigfigs($c)];
                }
            }
        }
        elsif ($h =~ /Statistic/) {
            # the values sit in the column to the right
            my $V = $table->[$index + 1];
            foreach my $k (0 .. $#$row) {
                my ($stat, $v) = map { $_->[$k] } ($row, $V);
                my $newname = "1/s:$section/n:$name/d:$DerivedFrom/k:$name/t:$stat/";
                if ($stat eq 'CompareColumn') {
                    # Replace the column number by that column's row name and
                    # repeat its values as the "spread" row.
                    my @cc = @{ $results[ $compareFinder{$v} ] };
                    $v = shift @cc;
                    push @results, [
                        "n/s:$section/n:$name/d:$DerivedFrom/k:$name/t:spread/$flag",
                        @cc
                    ];
                }
                push @results, [$newname, sigfigs($v)];
            }
        }
        elsif ($h =~ /^(?:M|C|Value)$/) {
            # ignore, because we've already collected it in Statistic or Regression.
        }
        else {
            my $thisflag = $h eq 'SpreadOverErrorPvalue' ? $flag : '';
            my $newname = "n/s:$section/n:$name/d:$DerivedFrom/k:"
                        . "$name/t:$h/$thisflag";
            push @results, [$newname, map { sigfigs($_) } @$row];
        }
    }

    return @results;
}
1216
1217
1218
1219
1220
1221
1222
1223
1224							=head1 AUTHOR
1225
1226							Jimi, C<< >>
1227
1228							=head1 BUGS
1229
1230							Please report any bugs or feature requests to C, or through
1231							the web interface at L. I will be notified, and then you'll
1232							automatically be notified of progress on your bug as I make changes.
1233
1234
1235
1236
1237							=head1 SUPPORT
1238
1239							You can find documentation for this module with the perldoc command.
1240
1241							perldoc Bio::MaxQuant::ProteinGroups::Response
1242
1243
1244							You can also look for information at:
1245
1246							=over 4
1247
1248							=item * RT: CPAN's request tracker (report bugs here)
1249
1250							L
1251
1252							=item * AnnoCPAN: Annotated CPAN documentation
1253
1254							L
1255
1256							=item * CPAN Ratings
1257
1258							L
1259
1260							=item * Search CPAN
1261
1262							L
1263
1264							=back
1265
1266
1267							=head1 ACKNOWLEDGEMENTS
1268
1269
1270							=head1 LICENSE AND COPYRIGHT
1271
1272							Copyright 2014 Jimi.
1273
1274							This program is free software; you can redistribute it and/or modify it
1275							under the terms of the Artistic License (2.0). You may obtain a
1276							copy of the full license at:
1277
1278							L
1279
1280							Any use, modification, and distribution of the Standard or Modified
1281							Versions is governed by this Artistic License. By using, modifying or
1282							distributing the Package, you accept this license. Do not use, modify,
1283							or distribute the Package, if you do not accept this license.
1284
1285							If your Modified Version has been derived from a Modified Version made
1286							by someone other than you, you are nevertheless required to ensure that
1287							your Modified Version complies with the requirements of this license.
1288
1289							This license does not grant you the right to use any trademark, service
1290							mark, tradename, or logo of the Copyright Holder.
1291
1292							This license includes the non-exclusive, worldwide, free-of-charge
1293							patent license to make, have made, use, offer to sell, sell, import and
1294							otherwise transfer the Package with respect to any patent claims
1295							licensable by the Copyright Holder that are necessarily infringed by the
1296							Package. If you institute patent litigation (including a cross-claim or
1297							counterclaim) against any party alleging that the Package constitutes
1298							direct or contributory patent infringement, then this Artistic License
1299							to you shall terminate on the date that such litigation is filed.
1300
1301							Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
1302							AND CONTRIBUTORS "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
1303							THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
1304							PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
1305							YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
1306							CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
1307							CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
1308							EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1309
1310
1311							=cut
1312
1313							1; # End of Bio::MaxQuant::ProteinGroups::Response