File Coverage

lib/SmotifTF/Psipred.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package SmotifTF::Psipred;
2              
3             # use v5.8.8;
4 1     1   30440 use strict;
  1         2  
  1         25  
5 1     1   6 use warnings;
  1         1  
  1         33  
6              
7 1     1   759 use File::Spec::Functions qw(catfile catdir);
  1         834  
  1         70  
8 1     1   742 use SmotifTF::GeometricalCalculations;
  0            
  0            
9             use Carp;
10             use Proc::Simple;
11             use Data::Dumper;
12              
13             BEGIN {
14             use Exporter ();
15             our ( $VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
16             $VERSION = "0.01";
17              
18             @ISA = qw(Exporter);
19              
20             # name of the functions to export
21             @EXPORT = qw(
22             run
23             analyze_psipred
24             parse
25             write_outfile
26             count_smotifs
27             count_pdb_smotifs
28             );
29              
30             @EXPORT_OK = qw(
31             split_ss_string
32             getFirstResidueNumber
33             ); # symbols to export on request
34             }
35              
36             our @EXPORT_OK;
37              
38             use Config::Simple;
39             my $config_file = $ENV{'SMOTIFTF_CONFIG_FILE'};
40             croak "Environmental variable SMOTIFTF_CONFIG_FILE should be set"
41             unless $config_file;
42             my $cfg = new Config::Simple($config_file);
43              
44             my $psipred = $cfg->param( -block => 'psipred' );
45             my $PSIPRED_PATH = $psipred->{'path'};
46             my $PSIPRED_EXEC = $psipred->{'exec'};
47              
48             use constant SMOTIFS_NUMBER_UPPER_LIMIT => 14;
49             use constant SMOTIFS_NUMBER_LOWER_LIMIT => 2;
50              
51             =head1 NAME
52              
53             Psipred
54              
55             =head1 VERSION
56              
57             Version 0.01
58              
59             =cut
60              
61             our $VERSION = '0.01';
62              
63             =head1 SYNOPSIS
64              
65             Set of routines to run and analyze Psipred
66              
67             Usage:
68              
69             use Psipred;
70              
71             run ($sequence, $directory);
72             analyze_psipred ($pdb, $chain, $directory, $havestructure);
73              
74             =cut
75              
76             =head2 run
77              
78             run psipred
79             /usr/local/bin/runpsipred 4wyq.fasta
80            
81             =cut
82              
83             sub run {
84             use Proc::Simple;
85              
86             my %args = (
87             sequence => '',
88             directory => '',
89             @_,
90             );
91             my $sequence = $args{'sequence'} || undef;
92             my $directory = $args{'directory'} || "./";
93              
94             croak "sequence in fasta format is required" unless $sequence;
95              
96             my $myproc = Proc::Simple->new();
97              
98             my $log_file = "$directory/psipred_log.txt";
99             my $error_file= "$directory/psipred_err.txt";
100             $myproc->redirect_output($log_file, $error_file);
101              
102             chdir $directory;
103             my $runpsipred = catfile( $PSIPRED_PATH, $PSIPRED_EXEC );
104             my $status = $myproc->start( "$runpsipred", $sequence );
105            
106             # Wait until process is done
107             my $exit_status = $myproc->wait();
108              
109             return $exit_status;
110              
111             }
112              
113              
114             =head2 analyze_psipred
115              
116             It will return the secondary structure prediction and sequence
117             from psipred output file.
118              
119             Script to get the Smotif defintions from PSIPRED output
120              
121             INPUT ARGUMENTS:
122             1) $pdb : pdb code (the PSIPRED output file should be in the subdirectory identified by the pdb code e.g. 1ptf/1ptf.horiz)
123             2) $chain : chain ID
124             3) If structure exists, 1, else, 0.
125              
126             INPUT FILES:
127             In the folder:
128             1)/.horiz (PSIPRED output file)
129             2)/pdb.ent (PDB file), if structure exists.
130              
131             OUTPUT FILES:
132             In the folder:
133             .out :File with smotif information: Protein code, Chain, Smotif type, Smotif start residue, Loop length, Length of first secondary structure, Length of second secondary structure, Sequence
134              
135             =cut
136              
137             sub analyze_psipred {
138            
139             my %args = (
140             pdb => '',
141             chain => '',
142             directory => '',
143             havestructure => '',
144             @_,
145             );
146             my $pdb = $args{'pdb'} || undef;
147             my $chain = $args{'chain'} || 'A';
148             my $directory = $args{'directory'} || "./";
149             my $havestructure= $args{'havestructure'} || 0;
150            
151             croak "pdb_id is required" unless $pdb;
152            
153             my %psipred = parse("$directory/$pdb.horiz");
154            
155             #print Dumper(\%psipred);
156            
157             croak "Error processing psipred out files. Pred was not found."
158             unless exists $psipred{"Pred"};
159            
160             croak "Error processing psipred out files. AA was not found."
161             unless exists $psipred{"AA"};
162            
163             my $sslist = $psipred{"Pred"};
164             my $seq = $psipred{"AA"};
165              
166             my $start = 0; #Initialize start residue
167             my @ss = split_ss_string( $sslist, \$start);
168              
169             # use Data::Dumper;
170             # print Dumper(\@ss);
171              
172             # find actual start residue in pdb file, if structure exists
173             my $shiftindex = 1;
174             if ($havestructure == 1) {
175             my $localpdb = SmotifTF::GeometricalCalculations::get_full_path_name_for_pdb_code($pdb, $chain);
176             unless (-e $localpdb) {croak "PDB structure file missing for $pdb $chain"};
177             $shiftindex = getFirstResidueNumber($localpdb);
178             }
179            
180             # Print smotif information to $pdb/$pdb.out file
181             $start = $start + $shiftindex;
182             my $filename = catfile ($directory, "$pdb.out");
183             write_outfile ($pdb,$chain,$filename,$start,$seq,@ss);
184              
185             # Check the number of Smotifs in the Query protein
186             my $num_motifs = count_smotifs ($filename);
187            
188             # print "Returning num_motifs = $num_motifs\n";
189             return ($seq, $num_motifs);
190             }
191              
192             =head2 write_outfile
193            
194             Writes the smotif definitions obtained from Psipred to the $pdb/$pdb.out file
195              
196             Input: $filename, $seq, @ss
197             This is the most important file and has to named $pdb/$pdb.out
198             and the format of this file is
199             print OUTFILE "Name\tChain\tType\tStart\tLooplength\tSS1length\tSS2length\tSequence\n";
200              
201             =cut
202            
203             sub write_outfile {
204            
205             my ($pdb,$chain,$filename,$start,$seq,@ss) = @_;
206            
207             croak "4-letter pdb code is required" unless $pdb;
208             # croak "Chain ID is required" unless $chain;
209             croak "Output filename is required" unless $filename;
210             croak "Protein sequence is required" unless $seq;
211             croak "SS array is requires" unless @ss;
212             croak "Smotif start residue number is required" unless $start;
213            
214             # Query chain-name is not manadatory for calculation further down.
215             # That's why we set it to A by default.
216             $chain = 'A' unless defined $chain;
217              
218             open(OUTFILE,">$filename");
219             print OUTFILE "Name\tChain\tType\tStart\tLooplength\tSS1length\tSS2length\tSequence\n";
220             for (my $aa=0;$aa<(scalar(@ss)-1)/2;$aa++) {
221             print OUTFILE "$pdb";
222             print OUTFILE ".pdb\t$chain\t";
223             print OUTFILE $ss[$aa*2][0];
224             print OUTFILE $ss[$aa*2+2][0];
225             print OUTFILE "\t",$start,"\t";
226             print OUTFILE scalar(@{$ss[$aa*2+1]}),"\t";
227             print OUTFILE scalar(@{$ss[$aa*2]}),"\t";
228             print OUTFILE scalar(@{$ss[$aa*2+2]}),"\t";
229             my $len=scalar(@{$ss[$aa*2]})+scalar(@{$ss[$aa*2+1]})+scalar(@{$ss[$aa*2+2]});
230             print OUTFILE substr($seq,$start,$len),"\n";
231             $start+=scalar(@{$ss[$aa*2]})+scalar(@{$ss[$aa*2+1]});
232             }
233             close(OUTFILE);
234              
235             }
236              
237             =head2 split_ss_string
238              
239             SUBROUTINE to convert a series of secondary structure types to a list of start and end points for smotifs
240             #Input 1: $ss - a string of secondary structure types,
241             where H=alpha helix and E=strand e.g. (HHHHLLLLEEEE)
242             #Input 2: $startpos - residue at which first smotif begins
243              
244             =cut
245             sub split_ss_string {
246             my ($ss, $startpos) = @_;
247              
248             croak "secondary structure string is needed" unless $ss;
249             croak "start position is not defined" unless defined $startpos;
250              
251             my @ss = split(//, $ss);
252             # convert all non-helix/non-strand positions to 'L'
253             foreach (@ss) {
254             if ( (($_) ne 'H') and (($_) ne 'E') ) {
255             $_='L'
256             }
257             }
258            
259             # convert strands/helices less than 2 residues long into loops
260             my $sslength = 1;
261             my $prev = $ss[0];
262             for (my $count=1; $count
263             if ($ss[$count] eq $prev) {
264             $sslength++;
265             } else {
266             if ((($sslength<2) and ($prev ne 'L')) or (($sslength<4) and ($prev eq 'H'))) {
267             for (my $aa = $count-$sslength; $aa<$count; $aa++) {
268             $ss[$aa]='L'
269             };
270             }
271             $prev = $ss[$count];
272             $sslength= 1;
273             }
274             }
275             my @out;
276             my $start = 0;
277             for (my $aa=0; $aa
278             if ($ss[$aa] ne $ss[$aa+1]) { #switch in secondary structure type implies a landmark
279             push( @out,[@ss[$start..$aa]]);
280             if (($ss[$aa] ne 'L') and ($ss[$aa+1] ne 'L')) {
281             $ss[$aa+1] = 'L';
282             }
283             $start = $aa + 1;
284             }
285             }
286             if ($out[0][0] eq 'L') {
287             $$startpos = scalar(@{$out[0]});
288             shift(@out);
289             }
290             if ($out[-1][0] eq 'L') {
291             pop(@out)
292             };
293             return @out;
294             }
295              
296             =head2 getFirstResidueNumber
297              
298             Subroutine to get starting residue number of a pdb file
299              
300             =cut
301              
302             sub getFirstResidueNumber {
303              
304             my ($localpdb) = @_;
305              
306             croak "PDB file name is required" unless $localpdb;
307            
308             my $resno = 1;
309             open PDB, "<$localpdb";
310             WLOOP: while (my $line = ) {
311             chomp $line;
312             #print "$line\n";
313             if ($line =~ /^ATOM/) {
314             $resno = substr($line, 22, 5); $resno =~ s/\s+//g;
315             last WLOOP;
316             } elsif ($line =~ /^HETATM/) {
317             $resno = substr($line, 22, 5); $resno =~ s/\s+//g;
318             last WLOOP;
319             }
320             }
321             $resno = 1 if ($resno =~ /[a-zA-Z]/);
322              
323             close PDB;
324             return $resno;
325             }
326              
327             =head2 parse
328              
329             It returns a hash containing the following keys:
330             $VAR1 = {
331             'AA' => 'MNLQPIFWIGLISSVCCVF...'
332             'Conf' => '9975402235541123446...'
333             'Pred' => 'CCCCCCHHHHHHHCEEEEE...'
334             };
335              
336             =cut
337             sub parse {
338            
339             my ($file) = @_;
340            
341             chomp $file if $file;
342            
343             open my $fh, "<". $file
344             or die "cannot open $file $!";
345            
346             croak "$file does end in .horiz"
347             unless $file =~ m/\.horiz$/;
348              
349             my %ppred;
350             while (my $line = <$fh>) {
351             if ($line =~ /^Conf:/) {
352             $line =~ s/^Conf://g;
353             $line =~ s/\s+//g;
354             if ( exists($ppred{'Conf'}) ){
355             $ppred{'Conf'} .= $line;
356             }
357             else {
358             $ppred{'Conf'} = $line;
359             }
360             }
361             if ($line =~ /^Pred:/) {
362             $line =~ s/^Pred://g;
363             $line =~ s/\s+//g;
364             if ( exists($ppred{'Pred'}) ){
365             $ppred{'Pred'} .= $line;
366             }
367             else {
368             $ppred{'Pred'} = $line;
369             }
370             }
371             if ($line =~ /^\s+AA:/) {
372             $line =~ s/^\s+AA://g;
373             $line =~ s/\s+//g;
374             if ( exists($ppred{Conf}) ){
375             $ppred{'AA'} .= $line;
376             }
377             else {
378             $ppred{'AA'} = $line;
379             }
380             }
381             }
382             close $fh;
383             return %ppred;
384             }
385              
386             =head2 count_smotifs
387              
388             Subroutine to count the number of Smotifs in a given Smotif definition file (typically $pdb/$pdb.out).
389             Input : Filename with Smotif definition
390             Output: Returns the number of Smotifs in the given file
391              
392             =cut
393              
394             sub count_smotifs {
395              
396             my ($filename) = @_;
397            
398             # print "opening $filename ...\n";
399             open(INFILE, $filename) or croak "Unable to open Smotif definition file $filename\n";
400             my $smotifs = 0;
401             my $line = ; #skip header line
402             while (my $line = ) {
403             chomp $line;
404             my @lin = split (/\s+/,$line);
405             # print Dumper(\@lin);
406             croak "Unrecognized file format in $filename\n"
407             if not (scalar(@lin) == 8);
408             $smotifs++;
409             }
410             close (INFILE);
411             # print "smotifs = $smotifs\n";
412             return $smotifs;
413             }
414              
415             =head2 count_pdb_smotifs
416              
417             Subroutine to count the number of Smotifs in a given Smotif definition file (typically $pdb/$pdb.out).
418             Input : Filename with Smotif definition
419             Output: Returns the number of Smotifs in the given file
420              
421             =cut
422              
423             sub count_pdb_smotifs {
424             my ( $pdb ) = @_;
425             use File::Spec::Functions qw(catfile);
426              
427             croak "4-letter pdb code is required" unless $pdb;
428              
429             # my $filename = "$pdb\/$pdb" . ".out";
430             my $filename = catfile( $pdb, "$pdb.out" );
431              
432             croak "$filename does not exists. First run 'Psipred' step 1."
433             unless ( -e $filename );
434              
435             my $smotifs = count_smotifs( $filename );
436             return $smotifs;
437             }
438              
439             =head1 AUTHORS
440              
441             Fiserlab Members , C<< >>
442              
443             =head1 BUGS
444              
445             Please report any bugs or feature requests to C, or through
446             the web interface at L.
447              
448             =head1 SUPPORT
449              
450             You can find documentation for this module with the perldoc command.
451              
452             perldoc Psipred
453              
454             =over 4
455              
456             =item * RT: CPAN's request tracker (report bugs here)
457              
458             L
459              
460             =item * AnnoCPAN: Annotated CPAN documentation
461              
462             L
463              
464             =item * CPAN Ratings
465              
466             L
467              
468             =item * Search CPAN
469              
470             L
471              
472             =back
473              
474             =head1 LICENSE AND COPYRIGHT
475              
476             Copyright 2015 Fiserlab Members .
477              
478             This program is free software; you can redistribute it and/or modify it
479             under the terms of the the Artistic License (2.0). You may obtain a
480             copy of the full license at:
481              
482             L
483              
484             Any use, modification, and distribution of the Standard or Modified
485             Versions is governed by this Artistic License. By using, modifying or
486             distributing the Package, you accept this license. Do not use, modify,
487             or distribute the Package, if you do not accept this license.
488              
489             If your Modified Version has been derived from a Modified Version made
490             by someone other than you, you are nevertheless required to ensure that
491             your Modified Version complies with the requirements of this license.
492              
493             This license does not grant you the right to use any trademark, service
494             mark, tradename, or logo of the Copyright Holder.
495              
496             This license includes the non-exclusive, worldwide, free-of-charge
497             patent license to make, have made, use, offer to sell, sell, import and
498             otherwise transfer the Package with respect to any patent claims
499             licensable by the Copyright Holder that are necessarily infringed by the
500             Package. If you institute patent litigation (including a cross-claim or
501             counterclaim) against any party alleging that the Package constitutes
502             direct or contributory patent infringement, then this Artistic License
503             to you shall terminate on the date that such litigation is filed.
504              
505             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
506             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
507             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
508             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
509             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
510             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
511             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
512             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
513              
514              
515             =cut
516              
517             1; # End of Psipred