File Coverage

Bio/Tools/Signalp/ExtendedSignalp.pm
Criterion Covered Total %
statement 194 197 98.4
branch 107 116 92.2
condition 17 22 77.2
subroutine 16 16 100.0
pod 5 5 100.0
total 339 356 95.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Emmanuel Quevillon
7             #
8             # Copyright Emmanuel Quevillon
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Signalp::ExtendedSignalp;
21             my $params = [qw(maxC maxY maxS meanS D)];
22             my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
23             -fh => $filehandle
24             -factors => $params
25             );
26              
27             $parser->factors($params);
28             while( my $sp_feat = $parser->next_feature ) {
29             #do something
30             #eg
31             push @sp_feat, $sp_feat;
32             }
33              
34             =head1 DESCRIPTION
35              
36             # Please direct questions and support issues to I
37              
38             Parser module for Signalp.
39              
40             Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
41             originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
42             Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
43             Informatics team (fuguteam@fugu-sg.org)
44              
45             You may distribute this module under the same terms as perl itself
46              
47             Compared to the original SignalP, this method allow the user to filter results
48             out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
49             method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
50              
51             The user must be aware that the filters can only by applied on NN method.
52             Also, to ensure the compatibility with original Signalp parsing module, the user
53             must know that by default, if filters are empty, max Y and mean S filters are
54             automatically used to filter results.
55              
56             If the used gives a list, then the parser will only report protein having 'YES'
57             for each factor.
58              
59             This module supports parsing for full, summary and short output form signalp.
60             Actually, full and summary are equivalent in terms of filtering results.
61              
62             =head1 FEEDBACK
63              
64             =head2 Mailing Lists
65              
66             User feedback is an integral part of the evolution of this and other
67             Bioperl modules. Send your comments and suggestions preferably to
68             the Bioperl mailing list. Your participation is much appreciated.
69              
70             bioperl-l@bioperl.org - General discussion
71             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72              
73             =head2 Support
74              
75             Please direct usage questions or support issues to the mailing list:
76              
77             I
78              
79             rather than to the module maintainer directly. Many experienced and
80             reponsive experts will be able look at the problem and quickly
81             address it. Please include a thorough description of the problem
82             with code and data examples if at all possible.
83              
84             =head2 Reporting Bugs
85              
86             Report bugs to the Bioperl bug tracking system to help us keep track
87             of the bugs and their resolution. Bug reports can be submitted via
88             the web:
89              
90             https://github.com/bioperl/bioperl-live/issues
91              
92             =head1 AUTHOR
93              
94             Based on the Bio::Tools::Signalp module
95             Emmanuel Quevillon
96              
97             =head1 APPENDIX
98              
99             The rest of the documentation details each of the object methods.
100             Internal methods are usually preceded with a _
101              
102             =cut
103              
104             package Bio::Tools::Signalp::ExtendedSignalp;
105              
106 1     1   453 use strict;
  1         1  
  1         24  
107 1     1   3 use Data::Dumper;
  1         1  
  1         36  
108 1     1   323 use Bio::SeqFeature::Generic;
  1         2  
  1         28  
109             # don't need Bio::Root::Root/IO (already in inheritance tree)
110 1     1   6 use base qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
  1         1  
  1         285  
111              
112             #Supported arguments
113             my $FACTS = {
114             'maxC' => 1,
115             'maxS' => 1,
116             'maxY' => 1,
117             'meanS' => 1,
118             'D' => 1,
119             };
120              
121             =head2 new
122              
123             Title : new
124             Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
125             Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
126             Returns : Bio::Tools::Signalp::ExtendedSignalp
127             Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
128              
129              
130             =cut
131              
132             sub new {
133 10     10 1 113 my($class,@args) = @_;
134              
135 10         30 my $self = $class->SUPER::new(@args);
136 10         18 $self->_initialize_io(@args);
137              
138 10         24 my $factors = $self->_rearrange([qw(FACTORS)], @args);
139             #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
140 10 100 100     29 if($factors && scalar(@$factors)){
141 4         5 $factors = $factors;
142             }
143             else{
144 6         10 $factors = [qw(maxY meanS)];
145             }
146 10 50       29 $factors && $self->factors($factors);
147            
148 10         35 return $self;
149             }
150              
151             =head2 next_feature
152              
153             Title : next_feature
154             Usage : my $feat = $signalp->next_feature
155             Function: Get the next result feature from parser data
156             Returns : Bio::SeqFeature::Generic
157             Args : none
158              
159              
160             =cut
161              
162             sub next_feature {
163              
164 44     44 1 634 my ($self) = @_;
165              
166 44 50       62 if(!$self->_parsed()){
167 44         67 $self->_parse();
168             }
169              
170 44   100     43 return shift @{$self->{_features}} || undef;
171              
172             }
173              
174             =head2 _filterok
175              
176             Title : _filterok
177             Usage : my $feat = $signalp->_filterok
178             Function: Check if the factors required by the user are all ok.
179             Returns : 1/0
180             Args : hash reference
181              
182              
183             =cut
184              
185             sub _filterok {
186              
187 88     88   81 my($self, $hash) = @_;
188              
189             #We hope everything will be fine ;)
190 88         63 my $bool = 1;
191              
192             #If the user did not give any filter, we keep eveything
193 88 50       56 return $bool unless keys %{$self->{_factors}};
  88         162  
194              
195             #If only one of the factors parsed is equal to NO based on the user factors cutoff
196             #Then the filter is not ok.
197 88         69 foreach my $fact (keys %{$self->factors()}){
  88         112  
198 146 100 100     471 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
199 96         100 $bool = 0;
200             }
201             }
202              
203 88         236 return $bool;
204              
205             }
206              
207             =head2 factors
208              
209             Title : factors
210             Usage : my $feat = $signalp->factors
211             Function: Get/Set the filters required from the user
212             Returns : hash
213             Args : array reference
214              
215              
216             =cut
217              
218             sub factors {
219              
220 98     98 1 63 my($self, $array) = @_;
221              
222 98 100       128 if($array){
223 10         11 $self->{_factors} = { };
224 10         18 foreach my $f (@$array){
225 17 50       24 if(exists($FACTS->{$f})){
226 17         25 $self->{_factors}->{$f} = 1;
227             }
228             else{
229 0         0 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
230             }
231             }
232             }
233              
234 98         168 return $self->{_factors};
235              
236             }
237              
238             =head2 _parsed
239              
240             Title : _parsed
241             Usage : obj->_parsed()
242             Function: Get/Set if the result is parsed or not
243             Returns : 1/0 scalar
244             Args : On set 1
245              
246              
247             =cut
248              
249             sub _parsed {
250              
251 44     44   34 my($self, $parsed) = @_;
252              
253 44 50       73 if(defined($parsed)){
254 0         0 $self->{_parsed} = $parsed;
255             }
256              
257 44         73 return $self->{_parsed};
258              
259             }
260              
261             =head2 _parse
262              
263             Title : _parse
264             Usage : obj->_parse
265             Function: Parse the SignalP result
266             Returns :
267             Args :
268              
269              
270             =cut
271              
272             sub _parse {
273              
274 44     44   33 my($self) = @_;
275              
276             #Let's read the file...
277 44         84 while (my $line = $self->_readline()) {
278              
279 30         24 chomp $line;
280             #We want to be sure to catch the first non empty line to be ablte to determine
281             #which format we are working with...
282 30 100       161 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
283              
284 10 100       34 if($line =~ /^>(\S+)/){
    50          
285 5         11 $self->_pushback($line);
286 5         10 $self->_parse_summary_format();
287 5         5 last;
288             }
289             elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
290 5         11 $self->_pushback($line);
291 5         11 $self->_parse_short_format();
292 5         7 last;
293             }
294             else{
295 0         0 $self->throw("Unable to determine the format type.");
296             }
297             }
298              
299 44         46 return;
300             }
301              
302             =head2 _parse_summary_format
303              
304             Title : _parse_summary_format
305             Usage : $self->_parse_summary_format
306             Function: Method to parse summary/full format from signalp output
307             It automatically fills filtered features.
308             Returns :
309             Args :
310              
311             =cut
312              
313             sub _parse_summary_format {
314              
315 5     5   5 my($self) = @_;
316              
317 5         5 my $feature = undef;
318 5         26 my $ok = 0;
319              
320 5         8 while(my $line = $self->_readline()){
321              
322 245 100       348 if($line =~ /^SignalP-NN result:/){
323 38         56 $self->_pushback($line);
324 38         49 $feature = $self->_parse_nn_result($feature);
325             }
326 245 100       325 if($line =~ /^SignalP-HMM result:/){
327 27         39 $self->_pushback($line);
328 27         36 $feature = $self->_parse_hmm_result($feature);
329             }
330              
331 245 100 66     597 if($line =~ /^---------/ && $feature){
332 42         58 my $new_feature = $self->create_feature($feature);
333 42 100       56 push @{$self->{_features}}, $new_feature if $new_feature;
  16         22  
334 42         121 $feature = undef;
335             }
336             }
337              
338 5         8 return;
339             }
340              
341              
342             =head2 _parse_nn_result
343              
344             Title : _parse_nn_result
345             Usage : obj->_parse_nn_result
346             Function: Parses the Neuronal Network (NN) part of the result
347             Returns : Hash reference
348             Args :
349              
350              
351             =cut
352              
353             sub _parse_nn_result {
354              
355 38     38   30 my($self, $feature) = @_;
356              
357 38         28 my $ok = 0;
358 38         33 my %facts;
359              
360             #SignalP-NN result:
361             #>MGG_11635.5 length = 100
362             ## Measure Position Value Cutoff signal peptide?
363             # max. C 37 0.087 0.32 NO
364             # max. Y 37 0.042 0.33 NO
365             # max. S 3 0.062 0.87 NO
366             # mean S 1-36 0.024 0.48 NO
367             # D 1-36 0.033 0.43 NO
368              
369 38         45 while(my $line = $self->_readline()){
370              
371 350         258 chomp $line;
372              
373 350 100       463 if($line =~ /^SignalP-NN result:/){
374 38         30 $ok = 1;
375 38         66 next;
376             }
377              
378 312 50       355 $self->throw("Wrong line for parsing NN results.") unless $ok;
379              
380 312 100       1300 if ($line=~/^\>(\S+)\s+length/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
381 38         46 $self->seqname($1);
382 38         39 %facts = ();
383 38         65 next;
384             }
385             elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
386 38         73 $feature->{maxCprob} = $1;
387 38         37 $facts{maxC} = $2;
388 38         63 next;
389             }
390             elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
391 38         47 $feature->{maxYprob} = $1;
392 38         39 $facts{maxY} = $2;
393 38         67 next;
394             }
395             elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
396 38         54 $feature->{maxSprob} = $1;
397 38         35 $facts{maxS} = $2;
398 38         62 next;
399             }
400             elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
401 38         51 $feature->{meanSprob} = $1;
402 38         39 $facts{meanS} = $2;
403 38         69 next;
404             }
405             elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
406 38         75 $feature->{Dprob} = $1;
407 38         36 $facts{D} = $2;
408 38         59 next;
409             }
410             #If we don't have this line it means that all the factors cutoff are equal to 'NO'
411             elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
412             #if($self->_filterok(\%facts)){
413             #$feature->{name} = $self->seqname();
414             #$feature->{start} = 1;
415 8         23 $feature->{end} = $1 + 1; #To be consistent with end given in short format
416             #}
417             #return $feature;
418             }
419             elsif($line =~ /^\s*$/){
420 38         34 last;
421             }
422             }
423              
424 38 100       59 if($self->_filterok(\%facts)){
425 10         13 $feature->{name} = $self->seqname();
426 10         16 $feature->{start} = 1;
427 10         13 $feature->{nnPrediction} = 'signal-peptide';
428             }
429              
430 38         59 return $feature;
431             }
432              
433              
434             =head2 _parse_hmm_result
435              
436             Title : _parse_hmm_result
437             Usage : obj->_parse_hmm_result
438             Function: Parses the Hiden Markov Model (HMM) part of the result
439             Returns : Hash reference
440             Args :
441              
442             =cut
443              
444             sub _parse_hmm_result {
445              
446 27     27   23 my ($self, $feature_hash) = @_;
447              
448 27         22 my $ok = 0;
449              
450             #SignalP-HMM result:
451             #>MGG_11635.5
452             #Prediction: Non-secretory protein
453             #Signal peptide probability: 0.000
454             #Signal anchor probability: 0.000
455             #Max cleavage site probability: 0.000 between pos. -1 and 0
456              
457 27         38 while(my $line = $self->_readline()){
458              
459 213         153 chomp $line;
460 213 100       361 next if $line =~ /^\s*$/o;
461              
462 202 100       250 if($line =~ /^SignalP-HMM result:/){
463 30         24 $ok = 1;
464 30         56 next;
465             }
466              
467 172 50       187 $self->throw("Wrong line for parsing HMM result.") unless $ok;
468              
469 172 100       499 if($line =~ /^>(\S+)/){
    100          
    100          
    100          
    100          
470             #In case we already seen a name with NN results
471 35 100       43 $feature_hash->{name} = $1 unless $self->seqname();
472             }
473             elsif($line =~ /Prediction: (.+)$/){
474 30         71 $feature_hash->{hmmPrediction} = $1;
475             }
476             elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
477 30         70 $feature_hash->{peptideProb} = $1;
478             }
479             elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
480 30         81 $feature_hash->{anchorProb} = $1;
481             }
482             elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
483 24         35 $feature_hash->{cleavageSiteProb} = $1;
484             #Strange case, if we don't have an end value in NN result (no nn method launched)
485             #We try anyway to get an end value, unless this value is lower than 1 which is
486             #the start
487 24 100 66     100 $feature_hash->{end} = $2 if($2 > 1 && !$feature_hash->{end});
488 24 100       38 $feature_hash->{start} = 1 unless $feature_hash->{start};
489 24         23 last;
490             }
491             }
492              
493 27         68 return $feature_hash;
494             }
495              
496             =head2 _parse_short_format
497              
498             Title : _parse_short_format
499             Usage : $self->_parse_short_format
500             Function: Method to parse short format from signalp output
501             It automatically fills filtered features.
502             Returns :
503             Args :
504              
505             =cut
506              
507             sub _parse_short_format {
508              
509 5     5   6 my($self) = @_;
510              
511 5         5 my $ok = 0;
512 5         4 my $method = undef;
513 5         10 $self->{_oformat} = 'short';
514              
515             #Output example
516             # SignalP-NN euk predictions # SignalP-HMM euk predictions
517             # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
518             #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
519             #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
520             #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
521             #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
522              
523 5         9 while(my $line = $self->_readline()){
524            
525 60         55 chomp $line;
526 60 100       464 next if $line =~ /^\s*$|^# name/;
527              
528 55 100       81 if($line =~ /^#/){
529 5 100       20 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
    100          
530             'both' : $line =~ /SignalP-NN/ ?
531             'nn' : 'hmm';
532 5         11 next;
533             }
534              
535             #$self->throw("It looks like the format is not 'short' format.") unless($ok);
536              
537 50         280 my @data = split(/\s+/, $line);
538 50         83 $self->seqname($data[0]);
539              
540 50         70 my $factors = { };
541 50         43 my $feature = { };
542              
543             #NN results gives more fields than HMM
544 50 100 100     124 if($method eq 'both' || $method eq 'nn'){
    50          
545              
546 40         58 $feature->{maxCprob} = $data[1];
547 40         30 $factors->{maxC} = $data[3];
548 40         38 $feature->{maxYprob} = $data[4];
549 40         38 $factors->{maxY} = $data[6];
550 40         28 $feature->{maxSprob} = $data[7];
551 40         25 $factors->{maxS} = $data[9];
552 40         33 $feature->{meanSprob}= $data[10];
553 40         32 $factors->{meanS} = $data[11];
554 40         38 $feature->{Dprob} = $data[12];
555 40         31 $factors->{D} = $data[13];
556             #It looks like the max Y position is reported as the most likely cleavage position
557 40         35 $feature->{end} = $data[5];
558 40         34 $feature->{nnPrediction} = 'signal-peptide';
559              
560 40 100       49 if($method eq 'both'){
561 20 100       37 $feature->{hmmPrediction} = $data[15] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
562 20         18 $feature->{cleavageSiteProb} = $data[16];
563 20         19 $feature->{peptideProb} = $data[19];
564             }
565             }
566             elsif($method eq 'hmm'){
567             #In short output anchor probability is not given
568 10 100       19 $feature->{hmmPrediction} = $data[1] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
569 10         12 $feature->{cleavageSiteProb} = $data[2];
570 10         9 $feature->{peptideProb} = $data[5];
571             #It looks like the max cleavage probability position is given by the Cmax proability
572 10         10 $feature->{end} = $data[3];
573             }
574              
575             #Unfortunately, we cannot parse the filters for hmm method.
576 50 100       83 if($self->_filterok($factors)){
577 20         25 $feature->{name} = $self->seqname();
578 20         18 $feature->{start} = 1;
579 20         20 $feature->{source} = 'Signalp';
580 20         42 $feature->{primary} = 'signal_peptide';
581 20         16 $feature->{program} = 'Signalp';
582 20         19 $feature->{logic_name} = 'signal_peptide';
583              
584 20         24 my $new_feat = $self->create_feature($feature);
585 20 100       34 push @{$self->{_features}}, $new_feat if $new_feat;
  18         101  
586             }
587             }
588              
589 5         7 return;
590             }
591              
592             =head2 create_feature
593              
594             Title : create_feature
595             Usage : obj->create_feature(\%feature)
596             Function: Internal(not to be used directly)
597             Returns :
598             Args :
599              
600              
601             =cut
602              
603             sub create_feature {
604              
605 62     62 1 60 my ($self, $feat) = @_;
606              
607             #If we don't have neither start nor end, we return.
608 62 100 66     162 unless($feat->{name} && $feat->{start} && $feat->{end}){
      33        
609 28         30 return;
610             }
611              
612             # create feature object
613             my $feature = Bio::SeqFeature::Generic->new(
614             -seq_id => $feat->{name},
615             -start => $feat->{start},
616             -end => $feat->{end},
617 34 100       196 -score => defined($feat->{peptideProb}) ? $feat->{peptideProb} : '',
618             -source => 'Signalp',
619             -primary => 'signal_peptide',
620             -logic_name => 'signal_peptide',
621             );
622              
623 34         80 $feature->add_tag_value('peptideProb', $feat->{peptideProb});
624 34         69 $feature->add_tag_value('anchorProb', $feat->{anchorProb});
625 34         67 $feature->add_tag_value('evalue',$feat->{anchorProb});
626 34         49 $feature->add_tag_value('percent_id','NULL');
627 34         61 $feature->add_tag_value("hid",$feat->{primary});
628 34         58 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction});
629 34 100       73 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb}) if($feat->{cleavageSiteProb});
630 34 100       64 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction}) if($feat->{nnPrediction});
631 34 100       64 $feature->add_tag_value('maxCprob', $feat->{maxCprob}) if(defined($feat->{maxCprob}));
632 34 100       67 $feature->add_tag_value('maxSprob', $feat->{maxSprob}) if(defined($feat->{maxSprob}));
633 34 100       62 $feature->add_tag_value('maxYprob', $feat->{maxYprob}) if(defined($feat->{maxYprob}));
634 34 100       58 $feature->add_tag_value('meanSprob', $feat->{meanSprob}) if(defined($feat->{meanSprob}));
635 34 100       60 $feature->add_tag_value('Dprob', $feat->{Dprob}) if(defined($feat->{Dprob}));
636              
637 34         37 return $feature;
638              
639             }
640              
641             =head2 seqname
642              
643             Title : seqname
644             Usage : obj->seqname($name)
645             Function: Internal(not to be used directly)
646             Returns :
647             Args :
648              
649              
650             =cut
651              
652             sub seqname{
653 153     153 1 146 my ($self,$seqname)=@_;
654              
655 153 100       189 if (defined($seqname)){
656 88         103 $self->{'seqname'} = $seqname;
657             }
658              
659 153         222 return $self->{'seqname'};
660              
661             }
662              
663              
664             1;
665              
666