File Coverage

blib/lib/Bio/MaxQuant/Evidence/Statistics.pm
Criterion Covered Total %
statement 336 343 97.9
branch 108 132 81.8
condition 32 51 62.7
subroutine 37 39 94.8
pod 32 32 100.0
total 545 597 91.2


line stmt bran cond sub pod time code
1             package Bio::MaxQuant::Evidence::Statistics;
2              
3 2     2   116425 use 5.006;
  2         10  
  2         78  
4 2     2   11 use strict;
  2         5  
  2         504  
5 2     2   13 use warnings;
  2         34  
  2         61  
6              
7 2     2   25849 use Text::CSV;
  2         41290  
  2         71  
8 2     2   76 use Carp;
  2         5  
  2         174  
9 2     2   2970 use Storable;
  2         7531  
  2         132  
10 2     2   1769 use Statistics::Distributions;
  2         6904  
  2         8860  
11              
12             =head1 NAME
13              
14             Bio::MaxQuant::Evidence::Statistics - Additional statistics on your SILAC evidence
15              
16             =head1 VERSION
17              
18             Version 0.01
19              
20             =cut
21              
22             our $VERSION = '0.01';
23              
24              
25             =head1 SYNOPSIS
26              
27             Read/convert your evidence file to a more rapidly processable format,
28             and perform various operations and statistics across/between multiple
29             experiments. Supports multidimensional experiments with replicate
30             analyses.
31              
32              
33             use Bio::MaxQuant::Evidence::Statistics;
34              
35             my $foo = Bio::MaxQuant::Evidence::Statistics->new();
36            
37             # get the essential data from an evidence file
38             $foo->parseEssentials(filename=>$evidencePath);
39              
40             # store the essentials for later
41             $foo->writeEssentials(filename=>$essentialsPath);
42              
43             # laod previously stored essentials
44             $foo->readEssentials(filename=>$essentialsPath);
45              
46              
47             =head1 SUBROUTINES/METHODS
48              
49             =head2 new
50              
51             Create a new object:
52              
53             my $foo = Bio::MaxQuant::Evidence::Statistics->new();
54              
55             =cut
56              
57             sub new {
58 8526     8526 1 244897 my $p = shift;
59 8526   66     39854 my $c = ref($p) || $p;
60 8526         212771 my %defaults = (
61             separator => "\t",
62             essential_column_names => {
63             'Protein group IDs' => 1,
64             'Modified' => 1,
65             'Leading Proteins' => 1,
66             'PEP' => 1,
67             'Ratio H/L' => 1,
68             'Intensity H' => 1,
69             'Intensity L' => 1,
70             'Contaminant' => 1,
71             'Reverse' => 1,
72             },
73             list_column_names => {
74             'Modified' => 1,
75             'PEP' => 1,
76             'Ratio H/L' => 1,
77             'Intensity H' => 1,
78             'Intensity L' => 1,
79             },
80             key_column_name => 'id',
81             experiment_column_name => 'Experiment',
82             csv_options => {sep_char=>"\t"},
83             );
84 8526         61038 my %options = (%defaults, @_);
85 8526         38674 my $o = {defaults=>\%options};
86 8526         45526 bless $o, $c;
87 8526         44095 return $o;
88             }
89              
90             =head2 parseEssentials(%options)
91              
92             Reads the essential data from an evidence file. Evidence files
93             for large analyses can be very big and take a long time to process,
94             to we only read what's necessary, and can save this for convenience
95             and speed too, using writeEssentials().
96              
97             The data are stored by Protein group IDs, i.e. one entry per protein
98             group. Other data stored here are:
99              
100             =over
101              
102             =item id
103              
104             =item Protein group IDs
105              
106             =item Modified -- is this actually the right name??
107              
108             =item Leading Proteins
109              
110             =item Experiment
111              
112             =item PEP
113              
114             =item Ratio H/L
115              
116             =item Intensity H
117              
118             =item Intensity L
119              
120             =item Contaminant
121              
122             =item Reverse
123              
124             =back
125              
126             The column names used for storage are defined in the default option
127             essential_column_names, and can be changed when you call new, or when you call
128             parseEssentials. The option is a hash of column names whose values
129             detmerine whether the column is kept by their truthness... e.g.
130              
131             $o->parseEssentials(essential_column_names=>(
132             'id' => 1, # kept
133             'PEP' => 0, # discarded
134             #foo => ?, # discarded
135             ));
136              
137             If a column doesn't exist, it does not complain!
138              
139             The method takes a hash of options.
140              
141             options:
142              
143             =over
144              
145             =item filename - path of the file to process
146              
147             =item separator - passed to Text::CSV (default is tab)
148              
149             =item key_column_name - change the column keyed on (default is id)
150              
151             =item experiment_column_name - change the column the data are split on
152              
153             =item list_column_names - change the columns stored as lists
154              
155             =back
156              
157             =head3 list_column_names
158              
159             Some columns are the same across all the evidence in a protein group,
160             eg, the id is obviously the same, Contaminant and Reverse, Protein IDs,
161             and so on. The default, therefore, is to overwrite the column with
162             the value seen in an evidence. BUT, some columns have a different value
163             in each evidence, e.g. Ratio H/L or PEP. Whatever columns are given in
164             list_column_names, which true values, will be appended as lists, so in the
165             final data, there will be one row per protein and any column bearing multiple
166             evidences for that protein will be a list.
167              
168             If that makes no sense, write to me and I'll try to change it.
169              
170             =cut
171              
172             sub parseEssentials {
173 1     1 1 32 my $o = shift;
174 1         3 my %defaults = %{$o->{defaults}};
  1         7  
175 1         9 my %options = (%defaults, @_);
176 1         109 my $io = IO::File->new($options{filename}, 'r');
177 1         10403 my $csv = Text::CSV->new($options{csv_options});
178             # head the column names, just like in the pod...
179 1         104 $csv->column_names ($csv->getline ($io));
180             # now getline_hr will give us hashrefs :-)
181             # we just need to know which to keep...
182 1         2762 my %k = %{$options{essential_column_names}};
  1         11  
183 1         3 my $i = $options{key_column_name};
184 1         2 my $e = $options{experiment_column_name};
185 1         1 my %l = %{$options{list_column_names}};
  1         5  
186 1         3 my %data = ();
187 1         3 my @ids = ();
188 1         2 my @sharedids = ();
189 1         2 my @uniqueids = ();
190            
191 1         6 while(! eof($io)){
192 2793         14698 my $hr = $csv->getline_hr($io);
193 192717 100 66     733196 my %h = map {
194 2793         10827691 exists $k{$_} && $k{$_} # exists and true
195             ? ($_=>$hr->{$_}) # key => value
196             : () # empty
197             } keys %$hr;
198 2793         20018 my $key = $hr->{$i};
199 2793         6330 my $expt = $hr->{$e};
200             # store it...
201 2793         5536 push @ids, $key; # keep track of what we've got
202             # store stuff by expt, then id, then column
203 2793 100       10747 $data{$expt} = {} unless exists $data{$expt};
204 22344 100 66     168005 $data{$expt}->{$key} = { # set up this expt/key... unless it exists
205             map {
206 2793 50       18909 exists $l{$_} && $l{$_}
207             ? ($_ => []) # it's an array
208             : ($_ => '') # it's a scalar
209             } keys %h
210             } unless exists $data{$expt}->{$key};
211             # add the data...
212 2793         35190 foreach (keys %h){ # each column
213 22344 100 66     106263 if(exists $l{$_} && $l{$_}){ # is it a list column?
214 11172         15756 push @{$data{$expt}->{$key}->{$_}}, $h{$_}; # push it
  11172         57249  
215             }
216             else {
217 11172         43575 $data{$expt}->{$key}->{$_} = $h{$_}; # set it
218             }
219             }
220 2793 100       24661 if($data{$expt}->{$key}->{'Protein group IDs'} =~ /;/){
221 400         19618 push @sharedids, $key;
222             }
223             else {
224 2393         62966 push @uniqueids, $key;
225             }
226             }
227 1         5 $o->{data} = \%data;
228 1         176 $o->{ids} = [sort {$a <=> $b} @ids];
  2792         5087  
229 1         35 $o->{sharedids} = [sort {$a <=> $b} @sharedids];
  399         615  
230 1         104 $o->{uniqueids} = [sort {$a <=> $b} @uniqueids];
  2392         4168  
231 1         8937 $o->{cache} = {};
232             }
233              
234             =head2 experiments
235              
236             Returns a list of the experiments in the data.
237              
238             =cut
239              
240             sub experiments {
241 876     876 1 4189 my $o = shift;
242 876         3042 my $data = $o->{data};
243 876         10294 return keys %$data;
244             }
245              
246             =head2 replicated
247              
248             Returns a list of the experiment names without the replicate portion.
249              
250             The names are assumed to be Cell.Condition.Replicate, i.e. full-stop (period) separated.
251              
252             =cut
253              
254             sub replicated {
255 5     5 1 11 my $o = shift;
256 5 100       26 return @{$o->{cache}->{replicated}} if exists $o->{cache}->{replicated};
  3         20  
257 2         5 my %repl = ();
258 2         9 my @expts = $o->experiments();
259 2         12 foreach (@expts){
260 54         177 s/\.[^.]+$//;
261 54         112 $repl{$_} = 1;
262             }
263 2         16 $o->{cache}->{replicated} = [keys %repl];
264 2         12 return @{$o->{cache}->{replicated}};
  2         29  
265             }
266              
267             =head2 orthogonals
268              
269             Returns a list of sets of orthogonal experiments, that is 3 experiments in which the first has
270             one condition in common with the other two, but they have nothing in common with each other.
271              
272             e.g. A.X A.Y B.X
273              
274             The rationale behind this is that quantitative differences across this set indicate mechanistic
275             links between, for example, cell line and drug treatment. If a reponse is seen to a drug, and
276             a different repsonse is seen in a different cell-type, this system will pick that up. The
277             fourth member of the comparison (in the example that would be B.Y) could be anything... and the
278             interpretation would still be that there is a differential response.
279              
280             =cut
281              
282             sub orthogonals {
283 9     9 1 26 my $o = shift;
284 9 100       49 return @{$o->{cache}->{orthogonals}} if exists $o->{cache}->{orthogonals};
  7         167  
285 2         6 my @repls = $o->replicated();
286 2         7 my @orths = ();
287 2         5 foreach my $c1(@repls){
288 18         47 my ($p,$x) = split(/\./, $c1, 2);
289 18         31 foreach my $c2(@repls){
290 162 100       298 next if $c2 eq $c1;
291 144         312 my ($q,$y) = split(/\./, $c2, 2);
292 144 100       344 next unless $p eq $q;
293 36         56 foreach my $c3(@repls){
294 324 100 100     1218 next if $c3 eq $c1 || $c3 eq $c2;
295 252         513 my ($r,$z) = split(/\./, $c3, 2);
296 252 100       631 next unless $x eq $z;
297 72         212 push @orths, "$c1 $c2 $c3";
298             }
299             }
300             }
301 2         9 $o->{cache}->{orthogonals} = \@orths;
302 2         60 return @orths;
303             }
304              
305             =head2 pairs
306              
307             Returns a list of pairs of replicated experiments (e.g. A.X A.Y, A.X B.X ...)
308             that represents all possible comparisons.
309              
310             =cut
311              
312             sub pairs {
313 9     9 1 32 my $o = shift;
314 9 100       67 return @{$o->{cache}->{pairs}} if exists $o->{cache}->{pairs};
  7         180  
315 2         12 my @r = $o->replicated();
316 2         7 my @pairs = ();
317 2         17 foreach my $r1(sort @r){
318 18         68 foreach my $r2(sort @r){
319 162 100       322 next unless $r1 lt $r2;
320 72         191 push @pairs, "$r1 $r2";
321             }
322             }
323 2         9 $o->{cache}->{pairs} = \@pairs;
324 2         38 return @pairs;
325             }
326              
327             =head2 ids
328              
329             Returns a list of evidence ids in the data.
330              
331             =cut
332              
333             sub ids {
334 2     2 1 5 return @{shift()->{ids}};
  2         18  
335             }
336              
337             =head2 sharedIds
338              
339             Returns a list containing the ids of those evidences shared between protein groups.
340              
341             =cut
342              
343             sub sharedIds {
344 2     2 1 826 return @{shift()->{sharedids}};
  2         13  
345             }
346              
347             =head2 uniqueIds
348              
349             Returns a list containing the ids of those evidences unique to one protein group.
350              
351             =cut
352              
353             sub uniqueIds {
354 2     2 1 6 return @{shift()->{uniqueids}};
  2         14  
355             }
356              
357             =head2 saveEssentials(%options)
358              
359             Save the essential data (quicker to read again in future)
360              
361             =cut
362              
363             sub saveEssentials {
364 1     1 1 811 my $o = shift;
365 1         2 my %defaults = %{$o->{defaults}};
  1         14  
366 1         8 my %options = (%defaults, @_);
367             # here we want to save everything
368 1         10 store $o, $options{filename};
369             }
370              
371             =head2 loadEssentials
372              
373             Load up previously saved essentials
374              
375             =cut
376              
377             sub loadEssentials {
378 12     12 1 488851 my $o = shift;
379 12         46 my %defaults = %{$o->{defaults}};
  12         112  
380 12         247 my %options = (%defaults, @_);
381 12         98 my $p = retrieve($options{filename});
382 12         517213 %$o = %$p;
383 12         257 return $o;
384             }
385              
386              
387             =head2 extractColumnValues
388              
389             =cut
390              
391             sub extractColumnValues {
392 8379     8379 1 31340 my ($o, %options) = @_;
393             # options:
394 8379         43524 my %defaults = (
395             column => 'id', # which column to collect
396             experiment => '', # only extract this expt (all if false)
397             'split' => 1, # true = split cell on ; before adding to results
398             'nodup' => 1, # true = remove duplicates
399             'emptiesok' => 0, # true = include empty values in output
400             );
401 8379         72925 %options = (%defaults, %options);
402 8379         29336 my $data = $o->{data};
403 8379 100       35181 my $results = $options{nodup} ? {} : [];
404 8379 50       65973 my @expts = $options{experiment} ? ($options{experiment}) : (keys %$data);
405 8379         21929 foreach my $e(@expts){
406 9009         15520 foreach my $k(keys %{$data->{$e}}){
  9009         143376  
407 457848         1621126 my $value = $data->{$e}->{$k}->{$options{column}};
408 457848 100       1460648 if(ref($value) eq ''){
409 8379         41310 $value = [split /;/, $value];
410             }
411 457848 50       1687922 my @values = $options{'split'} ? (@$value) : (join(';',@$value));
412 457848         873372 foreach (@values){
413 459048 100 66     2430962 next unless $_ ne '' || $options{'emptiesok'};
414 437416 100       1012527 if($options{nodup}){
415 51727         377677 $results->{$_} = 1;
416             }
417             else {
418 385689         1858907 push @$results, $_;
419             }
420             }
421             }
422             }
423             # use Data::Dumper;
424             # print STDERR Dumper $results;
425 8379 100       179184 return $options{nodup} ? (keys %$results) : (@$results);
426             }
427              
428             =head2 proteinCount
429              
430             =cut
431              
432             sub proteinCount {
433 2     2 1 4689705 my $o = shift;
434 2 100       22 return $o->{cache}->{proteinCount} if exists $o->{cache}->{proteinCount};
435 1         8 my @proteins = $o->getLeadingProteins();
436 1         6 $o->{cache}->{proteinCount} = scalar @proteins;
437 1         14 return $o->{cache}->{proteinCount};
438             }
439              
440             =head2 getProteinGroupIds
441              
442             =cut
443              
444             sub getProteinGroupIds {
445 2     2 1 8 my $o = shift;
446 2 100       16 $o->{cache}->{proteinGroupIds} = [sort $o->extractColumnValues(column=>'Protein group IDs')] unless exists $o->{cache}->{proteinGroupIds};
447 2         7 return @{$o->{cache}->{proteinGroupIds}}
  2         32  
448             }
449              
450             =head2 getLeadingProteins
451              
452             =cut
453              
454             sub getLeadingProteins {
455 4     4 1 1343 my $o = shift;
456 4 100       37 $o->{cache}->{leadingProteins} = [sort $o->extractColumnValues(column=>'Leading Proteins')] unless exists $o->{cache}->{proteinGroupIds};
457 4         13 return @{$o->{cache}->{leadingProteins}};
  4         54  
458             }
459              
460             =head2 logRatios
461              
462             Logs ratios (base 2) throughout the dataset, and sets a flag so it can't get logged again.
463              
464             Treatment of "special values": empty string, <= 0, NaN, and any other non-number are removed
465             from the data!
466              
467             =cut
468              
469             sub logRatios {
470 17     17 1 1120 my $o = shift;
471 17 100       120 return 0 if $o->{logged};
472 11         41 $o->{logged} = 1;
473 11         31 my $data = $o->{data};
474 11         350 foreach my $exptname(keys %$data){
475 297         1436 my $experiment = $data->{$exptname};
476 297         17597 foreach my $proteinGroupId(keys %$experiment){
477 30724         73697 my $proteinGroup = $experiment->{$proteinGroupId};
478 30724         94740 my $ratios = $proteinGroup->{'Ratio H/L'};
479 30724         53972 my @newRatios = ();
480 30724         82672 foreach (0..$#$ratios){
481 30724 100       265522 $ratios->[$_] = $ratios->[$_] =~ /^\d+\.?\d*$/
482             ? log($ratios->[$_])/log(2)
483             : '';
484             }
485             }
486             }
487 11         166 return 1;
488             }
489              
490             =head2 filter
491              
492             returns a set of protein records based on filter parameters...
493              
494             =head3 options
495              
496             =over
497              
498             =item experiment - regular expression to match experiment name
499              
500             =item proteinGroupId - regular expression to match protein group id
501              
502             =item leadingProteins - regular expression to match leading protein ids
503              
504             =item notLeadingProteins - regular expression to not match leading protein ids
505              
506             =back
507              
508             Returns a filtered object of the same type, with relevant flags set (e.g. whether
509             data has been logged, etc).
510              
511             Warning, intentionally does not perform a deep clone!
512              
513             =cut
514              
515             sub filter {
516 8511     8511 1 47300 my ($o,%opts) = @_;
517             # options :
518             # use Data::Dumper;
519             # print STDERR 'OPTS: ', Dumper \%opts;
520 8511         24747 my $data = $o->{data};
521 8511         29499 my $result = {};
522 8511         122390 foreach my $experiment(keys %$data){
523 229797 100 66     1723510 if(! exists $opts{experiment} || $experiment =~ /$opts{experiment}/){
524 9063         34023 $result->{$experiment} = {};
525 9063         24743 my $exptdata = $data->{$experiment};
526 9063         578225 foreach my $pgid(keys %$exptdata){
527 944201 50 33     2852151 if(! exists $opts{proteinGroupId} || $pgid =~ /$opts{proteinGroupId}/){
528 944201         1904095 my $pgdata = $exptdata->{$pgid};
529 944201 100 100     5914064 if(! exists $opts{leadingProteins} || $pgdata->{'Leading Proteins'} =~ /$opts{leadingProteins}/){
530 466181 100 66     1402744 if(! exists $opts{notLeadingProteins} || $pgdata->{'Leading Proteins'} !~ /$opts{notLeadingProteins}/){
531 463434         1914190 $result->{$experiment}->{$pgid} = $pgdata;
532             }
533             }
534             }
535             }
536             }
537             }
538             # print STDERR Dumper $result if $opts{experiment} eq qr/^LCC1.nE.r2$/;
539 8511         80978 my $p = $o->new;
540 8511         143809 %$p = %$o;
541 8511         36483 $p->{data} = $result;
542 8511         20120 $o->{lastfiltered} = $p;
543 8511         37450 return $p;
544             }
545              
546             =head2 replicateMedian
547              
548             options are passed to filter.
549              
550             =cut
551              
552             sub replicateMedian {
553 355     355 1 174821 my ($o,%opts) = @_;
554 355         1563 my $f = $o->filter(%opts);
555 355         1707 return $f->median(
556             $f->extractColumnValues(
557             column => 'Ratio H/L',
558             nodup => 0,
559             )
560             );
561             }
562              
563             =head2 deviations
564              
565             returns an hashref with the following keys
566              
567             =over
568              
569             =item n - the number of items
570              
571             =item sd - the standard deviation (from the mean)
572              
573             =item mad - the median absolute deviation (from the median)
574              
575             =item sd_via_mad - the standard deviation estimated from the median absolute deviation
576              
577             =back
578              
579             =cut
580              
581             sub deviations {
582 2826     2826 1 79414 my ($o,%opts) = @_;
583             # cache
584 2826 100       38227 $o->{cache}->{deviations} = {} unless exists $o->{cache}->{deviations};
585 2826         22879 my $cachekey = join('::', map {"$_=$opts{$_}"} sort keys %opts);
  16780         59712  
586             # print STDERR "$cachekey\n";
587 2826 100       21707 return $o->{cache}->{deviations}->{$cachekey} if exists $o->{cache}->{deviations}->{$cachekey};
588             ##
589 2825         18148 my $f = $o->filter(%opts);
590 2825         17091 my @values = $f->extractColumnValues(
591             column => 'Ratio H/L',
592             nodup => 0,
593             );
594 2825         24307 my $n = scalar @values;
595 2825 50       30591 my $d = $n > 1 ? $o->sd(@values) : '';
596 2825         12770 $d->{'values'} = \@values;
597 2825 50       22829 $d->{mad} = $n ? $o->mad(@values) : '';
598            
599 2825 50       25093 $d->{sd_from_mad} = $d->{sd_via_mad} = $n ? $d->{mad} * 1.4826016694 : '';
600 2825 50       20046 $d->{usv_mad} = $n ? $d->{sd_from_mad} ** 2 : '';
601 2825 50       23226 $d->{median} = $n ? $o->median(@values) : '';
602             # I should think about caching here!!! DONE!
603 2825         28047 $o->{cache}->{deviations}->{$cachekey} = $d;
604 2825         36098 return $d;
605             }
606              
607             =head2 mean
608              
609             given a list of values, returns the mean
610              
611             =cut
612              
613             sub mean {
614 2827     2827 1 68611 my ($o,@values) = @_;
615 2827 50       19824 if(scalar(@values) < 1){ return ''; }
  0         0  
616 2827         26156 return $o->sum(@values) / scalar @values;
617             }
618              
619             =head2 sd (unbiased standard deviation)
620              
621             given a list of values, returns a hash with keys mean and sd (standard deviation).
622              
623             =cut
624              
625             sub sd {
626 2827     2827 1 37100 my ($o,@values) = @_;
627 2827         6132 my $n = scalar(@values);
628 2827 50       19764 my $mean = $n ? $o->mean(@values) : '';
629 2827         12553 my $sos = $o->sum(map {($_ - $mean)**2} @values);
  330832         586274  
630 2827 50       21744 if($n > 1){
631 2827         10641 $sos /= ($n-1);
632             }
633             else {
634 0         0 $sos = '';
635             }
636             return {
637 2827         37144 sd => sqrt($sos),
638             usv => $sos,
639             mean => $mean,
640             n => $n,
641             };
642             }
643              
644             =head2 sum
645              
646             given a list of values, returns the sum
647              
648             =cut
649              
650             sub sum {
651 5654     5654 1 114118 my ($o,@values) = @_;
652 5654         12944 my $t = 0;
653 5654         326071 $t += $_ foreach @values;
654 5654         58407 return $t;
655             }
656              
657             =head2 mad
658              
659             given a list of values, returns the median absolute deviation
660              
661             =cut
662              
663             sub mad {
664 2825     2825 1 43932 my ($o,@values) = @_;
665 2825 50       13610 if(scalar(@values) < 1){
666 0         0 return '';
667             }
668 2825         17105 my $median = $o->median(@values);
669 2825         9318 my @ads = map {abs ($_ - $median)} @values;
  330822         613525  
670 2825         37064 return $o->median(@ads);
671             }
672              
673             =head2 ttest
674              
675             Given options, experiment1, experiment2 and optional filters,
676             returns a hash of statistics...
677              
678             stats1 and stats2 are hashes of deviations: sd, mad, sd_via_mad, usv, n, values
679              
680             ttest is hash of Welch's ttest results: t, df, p
681              
682             ttest_mad is like ttest but based on median and median absolute deviateions.
683              
684             The p-values are derived using Welch's Ttest and the t-distribution function from
685             Statistics::Distributions.
686              
687             MAD and medians are much more robust to outliers, which are significant in peptide ratios.
688              
689              
690             =cut
691              
692             sub ttest {
693 5202     5202 1 52181 my ($o,%opts) = @_;
694             # cache
695 5202 100       19464 if($opts{experiment1} gt $opts{experiment2}){ # sort requested expts
696 1723         5586 ($opts{experiment1}, $opts{experiment2}) = ($opts{experiment2}, $opts{experiment1});
697             }
698 5202 100       24355 $o->{cache}->{ttests} = {} unless exists $o->{cache}->{ttests};
699 5202         34198 my $cachekey = join('::', map {"$_=$opts{$_}"} sort keys %opts);
  25966         90980  
700 5202 100       57857 return $o->{cache}->{ttests}->{$cachekey} if exists $o->{cache}->{ttests}->{$cachekey};
701             ##
702 1402         5744 $opts{experiment} = $opts{experiment1};
703 1402         8534 my $d1 = $o->deviations(%opts);
704 1402         12589 $opts{experiment} = $opts{experiment2};
705 1402         11812 my $d2 = $o->deviations(%opts);
706 1402         15876 my $tt = $o->welchs_ttest(
707             mean1 => $d1->{mean},
708             mean2 => $d2->{mean},
709             usv1 => $d1->{usv},
710             usv2 => $d2->{usv},
711             n1 => $d1->{n},
712             n2 => $d2->{n},
713             );
714 1402 50 33     27797 $tt->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tt->{df}), $tt->{t}) : '';
715 1402         371766 my $tm = $o->welchs_ttest(
716             mean1 => $d1->{median},
717             mean2 => $d2->{median},
718             usv1 => $d1->{usv_mad},
719             usv2 => $d2->{usv_mad},
720             n1 => $d1->{n},
721             n2 => $d2->{n},
722             );
723 1402 50 33     18817 $tm->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tm->{df}), $tm->{t}) : '';
724            
725 1402         307460 my $r = {
726             stats1 => $d1, stats2 => $d2, ttest => $tt, ttest_mad => $tm
727             };
728 1402         12334 $o->{cache}->{ttests}->{$cachekey} = $r;
729 1402         13433 return $r;
730             }
731              
732             =head2 welchs_ttest
733              
734             performs Welch's ttest, given mean1, mean2, usv1, usv2, n1 and n2 in a hash.
735              
736             e.g.
737              
738             $o->welchs_ttest( mean1 => 4, mean2 => 3, # sample mean
739             usv1 => 1, usv2 => 1.1, # unbiased sample variance (returned as usv from $o->sd
740             n1 => 4, n2=> 7 # number of observations
741            
742             also performs Welch-Satterthwaite to calculate degrees of freedom (to look up in t-statistic table)
743              
744             Returns hashref containing t and df.
745              
746             =cut
747              
748             sub welchs_ttest {
749 2805     2805 1 27807 my ($o, %t) = @_;
750 2805         9249 my ($x1,$x2,$v1,$v2,$n1,$n2) = map {$t{$_}} qw/mean1 mean2 usv1 usv2 n1 n2/;
  16830         37434  
751 2805         9893 my ($vn1,$vn2) = ($v1/$n1, $v2/$n2);
752 2805         8904 my $t = abs($x1 - $x2) / sqrt( $vn1 + $vn2 );
753 2805         12687 my $df = ($vn1 + $vn2)**2 / ( $vn1**2/($n1-1) + $vn2**2/($n2-1) );
754 2805         21831 return {t => $t, df => $df};
755             }
756              
757             =head2 replicateMedianSubtractions
758              
759             Logs data, if not already done, calculates median for each replicate, and subtracts median from each evidence in that replicate.
760              
761             =cut
762              
763             sub replicateMedianSubtractions {
764 5     5 1 369 my ($o, %opts) = @_; # can set filter here
765 5         28 $o->logRatios();
766 5         34 foreach my $replicate($o->experiments()){
767 135         941 my $median = $o->replicateMedian(%opts, experiment=>$replicate);
768 135         868 my $p = $o->filter(experiment=>$replicate);
769 135         280 foreach my $pgid(keys %{$p->{data}->{$replicate}}){
  135         2847  
770 13965         18686 foreach my $i(0.. $#{$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}}){
  13965         69328  
771 13965 100       100753 if($p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] =~ /\d/){
772 13280         78911 $p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] -= $median;
773             }
774             }
775             }
776             }
777             # i guess we should do something better with generating this status:
778 5         80 return 1;
779             }
780              
781             =head2 median
782              
783             given a list of numbers, returns the median... assumes all items are numbers!
784              
785             =cut
786              
787             sub median {
788 8834     8834 1 18817 my $o = shift;
789 8834         55317 my @list = sort {$a <=> $b} @_;
  5892786         7771332  
790 8834         17954 my $n = scalar @list;
791 8834 100       49007 if($n % 2){ # remainder on division by two -> it's odd!
792 3940         50834 return $list[($n-1)/2]; # index of last over 2, e.g. 21 items, last index 20, return 10.
793             }
794             else { # it's not odd... so it's even
795 4894         66267 return ($list[$n/2 - 1] + $list[$n/2]) / 2; # length over 2 and the same minus 1, e.g. 20 items, we want 9 and 10.
796             }
797             }
798              
799              
800             =head2 experimentMaximumPvalue
801              
802             =cut
803              
804             sub experimentMaximumPvalue {
805 866     866 1 13973 my ($o,%opts) = @_;
806             # run through experiments and collect replicate names for comparisons...
807             # this should be filtered for individual proteins using the leadingProteins option.
808 866         3287 my @reps1 = ();
809 866         2720 my @reps2 = ();
810 866         4963 foreach my $rep($o->experiments){
811 23382 100       105896 if($rep =~ /^$opts{experiment1}/){
812 2598         5303 push @reps1, $rep;
813             }
814 23382 100       127597 if($rep =~ /^$opts{experiment2}/){
815 2598         8756 push @reps2, $rep;
816             }
817             }
818             # now there must be enough replicates with enough observations in each...
819 866 50       7896 $opts{minimum_observations} = 2 unless exists $opts{minimum_observations};
820 866 50       4606 $opts{minimum_replicates} = 2 unless exists $opts{minimum_replicates};
821 866         12660 my $reps1 = 0;
822 866         1966 my $reps2 = 0;
823 866         2609 foreach my $rep(@reps1){
824 2598         11161 my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
825 2598         11830 my @values = $f->extractColumnValues(column => 'Ratio H/L');
826 2598 100       23886 $reps1 ++ if scalar(@values) > $opts{minimum_observations};
827             }
828 866         3446 foreach my $rep(@reps2){
829 2598         11475 my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
830 2598         11934 my @values = $f->extractColumnValues(column => 'Ratio H/L', nodup => 0);
831 2598 100       18547 $reps2 ++ if scalar(@values) > $opts{minimum_observations};
832             }
833 866 100 100     12666 return {p_max => -1, p_mad_max => -1} if $reps1 < $opts{minimum_replicates} || $reps2 < $opts{minimum_replicates};
834            
835             # compare each combination of replicates
836 518         1623 my $p_max = 0;
837 518         1734 my $p_mad_max = 0;
838 518         1947 foreach my $r1(@reps1){
839 1554         3874 foreach my $r2(@reps2){
840 4662         27775 my $tt = $o->ttest(%opts, experiment1=>$r1, experiment2=>$r2);
841 4662 100       48833 $p_max = $tt->{ttest}->{p} if $tt->{ttest}->{p} > $p_max;
842 4662 100       30063 $p_mad_max = $tt->{ttest_mad}->{p} if $tt->{ttest_mad}->{p} > $p_mad_max;
843             }
844             }
845             # compare experiments overall
846 518         3385 my $tt = $o->ttest(%opts);
847 518 100       5314 $p_max = $tt->{ttest}->{p} if $tt->{ttest}->{p} > $p_max;
848 518 50       3888 $p_mad_max = $tt->{ttest_mad}->{p} if $tt->{ttest_mad}->{p} > $p_mad_max;
849            
850             # report the maxima
851 518         7798 return {p_max=>$p_max, p_mad_max=>$p_mad_max};
852             }
853              
854             =head2 fullProteinComparison
855              
856             Does a full comparison on a particular protein, i.e. compares all pairs of conditions, also does
857             differential response analysis. Allows limitation of analysis to proteotypic peptides.
858              
859             =cut
860              
861             sub fullProteinComparison {
862 8     8 1 126 my ($o, %opts) = @_;
863             # %opts should have our protein listed as "filter"
864 8         47 my @pairs = $o->pairs();
865 8         59 my @orths = $o->orthogonals();
866 8         43 my %results = ();
867 8         24 foreach my $p(@pairs){
868 288         2410 my ($e1,$e2) = split(/\s+/, $p);
869 288         2154 $results{$p} = $o->experimentMaximumPvalue(%opts, experiment1=>$e1, experiment2=>$e2);
870             }
871 8         40 foreach my $p(@orths){
872 288         2239 my ($e1,$e2,$e3) = split(/\s+/, $p);
873 288         2014 my $r1 = $o->experimentMaximumPvalue(%opts, experiment1=>$e1, experiment2=>$e2);
874 288         2414 my $r2 = $o->experimentMaximumPvalue(%opts, experiment1=>$e1, experiment2=>$e3);
875 288 100 100     4190 if($r1->{p_max} < 0 || $r2->{p_max} < 0){
    100          
876 120         2579 $r1->{p_max} = -1;
877             }
878             elsif($r2->{p_max} > $r1->{p_max}) {
879 54         174 $r1->{p_max} = $r2->{p_max};
880             }
881 288 100 100     2818 if($r1->{p_mad_max} < 0 || $r2->{p_mad_max} < 0){
    100          
882 120         548 $r1->{p_mad_max} = -1;
883             }
884             elsif($r2->{p_mad_max} > $r1->{p_mad_max}) {
885 52         272 $r1->{p_mad_max} = $r2->{p_mad_max};
886             }
887 288         2604 $results{$p} = $r1;
888             }
889 8         274 return \%results;
890             }
891              
892             =head2 fullComparison
893              
894             Does a full comparison for each protein. Returns hash of hashes.
895              
896             =cut
897              
898             sub fullComparison {
899 1     1 1 2005 my $o = shift;
900 1         7 my @leadingProteins = $o->getLeadingProteins();
901 1         5 my %results = ();
902 1         4 foreach my $lp(@leadingProteins){
903 7         39 $results{$lp} = $o->fullProteinComparison(filter=>$lp);
904             }
905 1         15 return \%results;
906             }
907              
908             =head2 direction
909              
910             given two values, returns whether the different between first and second is positive or negative
911              
912             returns '+' or '-'
913              
914             =cut
915              
916             sub direction {
917 0 0   0 1   return $_[1] > $_[2] ? '-' : '+';
918             }
919              
920             =head2 directionsDisagree
921              
922             given two directions, which could be '+', '-' or '', returns true if one is '+' and the other is '-'
923              
924             =cut
925              
926             sub directionsDisagree {
927 0 0 0 0 1   return if $_[1] eq '-' && $_[2] eq '+';
928 0 0 0       return if $_[1] eq '+' && $_[2] eq '-';
929 0           return 1; # must be the same or one is blank.
930             }
931              
932              
933              
934              
935             =head1 AUTHOR
936              
937             jimi, C<< >>
938              
939             =head1 BUGS
940              
941             Please report any bugs or feature requests to C, or through
942             the web interface at L. I will be notified, and then you'll
943             automatically be notified of progress on your bug as I make changes.
944              
945              
946              
947              
948             =head1 SUPPORT
949              
950             You can find documentation for this module with the perldoc command.
951              
952             perldoc Bio::MaxQuant::Evidence::Statistics
953              
954              
955             You can also look for information at:
956              
957             =over 4
958              
959             =item * RT: CPAN's request tracker (report bugs here)
960              
961             L
962              
963             =item * AnnoCPAN: Annotated CPAN documentation
964              
965             L
966              
967             =item * CPAN Ratings
968              
969             L
970              
971             =item * Search CPAN
972              
973             L
974              
975             =back
976              
977              
978             =head1 ACKNOWLEDGEMENTS
979              
980              
981             =head1 LICENSE AND COPYRIGHT
982              
983             Copyright 2014 jimi.
984              
985             This program is released under the following license: artistic2
986              
987              
988             =cut
989              
990             1; # End of Bio::MaxQuant::Evidence::Statistics