| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # -*-CPerl-*- | 
| 2 |  |  |  |  |  |  | # Last changed Time-stamp: <2014-10-02 19:17:12 mtw> | 
| 3 |  |  |  |  |  |  |  | 
| 4 |  |  |  |  |  |  | package Bio::ViennaNGS::AnnoC; | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 1 |  |  | 1 |  | 17370 | use Exporter; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 61 |  | 
| 7 | 1 |  |  | 1 |  | 1831 | use version; our $VERSION = qv('0.06'); | 
|  | 1 |  |  |  |  | 2150 |  | 
|  | 1 |  |  |  |  | 7 |  | 
| 8 | 1 |  |  | 1 |  | 93 | use strict; | 
|  | 1 |  |  |  |  | 7 |  | 
|  | 1 |  |  |  |  | 72 |  | 
| 9 | 1 |  |  | 1 |  | 7 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 35 |  | 
| 10 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 34 |  | 
| 11 | 1 |  |  | 1 |  | 9381 | use Data::Dumper; | 
|  | 1 |  |  |  |  | 10993 |  | 
|  | 1 |  |  |  |  | 69 |  | 
| 12 | 1 |  |  | 1 |  | 730 | use Bio::Tools::GFF; | 
|  | 1 |  |  |  |  | 104325 |  | 
|  | 1 |  |  |  |  | 32 |  | 
| 13 | 1 |  |  | 1 |  | 491 | use Bio::DB::Fasta; | 
|  | 1 |  |  |  |  | 13836 |  | 
|  | 1 |  |  |  |  | 63 |  | 
| 14 | 1 |  |  | 1 |  | 651 | use Path::Class; | 
|  | 1 |  |  |  |  | 17897 |  | 
|  | 1 |  |  |  |  | 56 |  | 
| 15 | 1 |  |  | 1 |  | 7 | use Carp; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 910 |  | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | our @ISA       = qw(Exporter); | 
| 18 |  |  |  |  |  |  | our @EXPORT_OK = qw(parse_gff feature_summary get_fasta_ids | 
| 19 |  |  |  |  |  |  | $feat $fstat $fastadb | 
| 20 |  |  |  |  |  |  | @fastaids | 
| 21 |  |  |  |  |  |  | %features %featsta); | 
| 22 |  |  |  |  |  |  | our @EXPORT    = (); | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# | 
| 25 |  |  |  |  |  |  | #^^^^^^^^^^ Variables ^^^^^^^^^^^# | 
| 26 |  |  |  |  |  |  | #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | our ($fastadb); | 
| 29 |  |  |  |  |  |  | our %features  = (); | 
| 30 |  |  |  |  |  |  | our %featstat  = (); | 
| 31 |  |  |  |  |  |  | our $feat      = \%features; | 
| 32 |  |  |  |  |  |  | our $fstat     = \%featstat; | 
| 33 |  |  |  |  |  |  | our @fastaids  = (); | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# | 
| 36 |  |  |  |  |  |  | #^^^^^^^^^^^ Subroutines ^^^^^^^^^^# | 
| 37 |  |  |  |  |  |  | #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | sub parse_gff { | 
| 40 | 0 |  |  | 0 | 1 |  | my $in_file = shift; | 
| 41 | 0 |  |  |  |  |  | my ($i,$gffio,$feature,$gbkey); | 
| 42 | 0 |  |  |  |  |  | my $this_function = (caller(0))[3]; | 
| 43 |  |  |  |  |  |  |  | 
| 44 | 0 |  |  |  |  |  | $gffio = Bio::Tools::GFF->new(-file        => $in_file, | 
| 45 |  |  |  |  |  |  | -gff_version  => 3, | 
| 46 |  |  |  |  |  |  | ); | 
| 47 | 0 |  |  |  |  |  | $gffio->ignore_sequence(1); | 
| 48 | 0 | 0 |  |  |  |  | if (my $header = $gffio->next_segment() ){ | 
| 49 | 0 |  |  |  |  |  | $featstat{accession}= $header->display_id(); | 
| 50 |  |  |  |  |  |  | } | 
| 51 |  |  |  |  |  |  | else{ | 
| 52 | 0 |  |  |  |  |  | carp "[$this_function]: Cannot parse GFF header\n"; | 
| 53 |  |  |  |  |  |  | } | 
| 54 |  |  |  |  |  |  |  | 
| 55 | 0 |  |  |  |  |  | while($feature = $gffio->next_feature()) { | 
| 56 | 0 |  |  |  |  |  | my ($uid,$feat_name); | 
| 57 | 0 |  |  |  |  |  | my @name = my @id = my @gbkeys = (); | 
| 58 |  |  |  |  |  |  |  | 
| 59 | 0 | 0 |  |  |  |  | next if ($feature->primary_tag() eq "exon"); | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | # 1) determine gbkey of the current feature | 
| 62 | 0 |  |  |  |  |  | @gbkeys = $feature->get_tag_values("gbkey"); | 
| 63 | 0 |  |  |  |  |  | $gbkey  = $gbkeys[0]; | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | # 2) get a unique ID for each feature | 
| 66 | 0 | 0 |  |  |  |  | if ($feature->has_tag('ID')){ | 
| 67 | 0 |  |  |  |  |  | @id = $feature->get_tag_values('ID'); | 
| 68 | 0 |  |  |  |  |  | $uid = $id[0]; # ID=id101 | 
| 69 |  |  |  |  |  |  | } | 
| 70 |  |  |  |  |  |  | else { | 
| 71 | 0 |  |  |  |  |  | croak "ERROR [$this_function] Feature '$gbkey' at pos.\ | 
| 72 |  |  |  |  |  |  | $feature->start does not have \'ID\' attribute\n"; | 
| 73 |  |  |  |  |  |  | } | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | # 3) assign parent's unique ID in case a parent record exists | 
| 76 | 0 | 0 |  |  |  |  | if ($feature->has_tag('Parent')){ | 
| 77 | 0 |  |  |  |  |  | @id = $feature->get_tag_values('Parent'); | 
| 78 | 0 |  |  |  |  |  | $uid = $id[0]; # ID=id101 | 
| 79 |  |  |  |  |  |  | } | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | # 4) find a name for the current feature, use 'Name' or 'ID' attribute | 
| 82 | 0 | 0 |  |  |  |  | if ($feature->has_tag('Name')){ | 
|  |  | 0 |  |  |  |  |  | 
| 83 | 0 |  |  |  |  |  | @name = $feature->get_tag_values('Name'); | 
| 84 | 0 |  |  |  |  |  | $feat_name = $name[0]; | 
| 85 |  |  |  |  |  |  | } | 
| 86 |  |  |  |  |  |  | elsif ($feature->has_tag('ID')){ | 
| 87 | 0 |  |  |  |  |  | @id = $feature->get_tag_values('ID'); | 
| 88 | 0 |  |  |  |  |  | $feat_name = $id[0]; # ID=id101, use ID as feature name | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  | else { | 
| 91 | 0 |  |  |  |  |  | croak "ERROR [$this_function] Cannot set name for feature \ | 
| 92 |  |  |  |  |  |  | $feature->gbkey at pos. $feature->start\n"; | 
| 93 |  |  |  |  |  |  | } | 
| 94 |  |  |  |  |  |  |  | 
| 95 | 0 | 0 |  |  |  |  | unless (exists $features{$uid}) { # gene / ribosome_entry_site / etc. | 
| 96 | 0 |  |  |  |  |  | $features{$uid}->{start}     = $feature->start; | 
| 97 | 0 |  |  |  |  |  | $features{$uid}->{end}       = $feature->end; | 
| 98 | 0 |  |  |  |  |  | $features{$uid}->{strand}    = $feature->strand; | 
| 99 | 0 |  |  |  |  |  | $features{$uid}->{length}    = $feature->length; | 
| 100 | 0 |  |  |  |  |  | $features{$uid}->{seqid}     = $feature->seq_id; | 
| 101 | 0 |  | 0 |  |  |  | $features{$uid}->{score}     = $feature->score || 0; | 
| 102 | 0 |  |  |  |  |  | $features{$uid}->{gbkey}     = $gbkey; | 
| 103 | 0 |  |  |  |  |  | $features{$uid}->{name}      = $feat_name; | 
| 104 | 0 |  |  |  |  |  | $features{$uid}->{uid}       = $uid; | 
| 105 |  |  |  |  |  |  | } | 
| 106 |  |  |  |  |  |  | else { # CDS / tRNA / rRNA / etc | 
| 107 | 0 |  |  |  |  |  | $features{$uid}->{gbkey} = $gbkey;  # gbkey for tRNA/ rRNA/ CDS etc | 
| 108 |  |  |  |  |  |  | } | 
| 109 |  |  |  |  |  |  | } | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | # finally generate some statistics on features present in this annotation | 
| 112 | 0 |  |  |  |  |  | $featstat{total} = 0; | 
| 113 | 0 |  |  |  |  |  | $featstat{origin} = "$this_function ".$VERSION; | 
| 114 | 0 |  |  |  |  |  | foreach my $ft (keys %features){ | 
| 115 | 0 |  |  |  |  |  | $featstat{total}++; | 
| 116 | 0 |  |  |  |  |  | my $key = $features{$ft}->{gbkey}; | 
| 117 | 0 | 0 |  |  |  |  | unless (exists $featstat{$key}){ | 
| 118 | 0 |  |  |  |  |  | $featstat{$key} = 0; | 
| 119 |  |  |  |  |  |  | } | 
| 120 | 0 |  |  |  |  |  | $featstat{$key} += 1; | 
| 121 |  |  |  |  |  |  | } | 
| 122 | 0 |  |  |  |  |  | $gffio->close(); | 
| 123 |  |  |  |  |  |  | # print Dumper (\%features); | 
| 124 |  |  |  |  |  |  | } | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  | # feature_summary($fsR,$dest) | 
| 127 |  |  |  |  |  |  | # Print summary of %featstat hash | 
| 128 |  |  |  |  |  |  | # | 
| 129 |  |  |  |  |  |  | # ARG1: reference to %featstat hash | 
| 130 |  |  |  |  |  |  | # ARG2: path for output file | 
| 131 |  |  |  |  |  |  | sub feature_summary { | 
| 132 | 0 |  |  | 0 | 1 |  | my ($fsR, $dest) = @_; | 
| 133 | 0 |  |  |  |  |  | my ($fn,$fh); | 
| 134 | 0 |  |  |  |  |  | my $this_function = (caller(0))[3]; | 
| 135 |  |  |  |  |  |  |  | 
| 136 | 0 | 0 |  |  |  |  | croak "ERROR [$this_function] $dest does not exist\n" | 
| 137 |  |  |  |  |  |  | unless (-d $dest); | 
| 138 |  |  |  |  |  |  |  | 
| 139 | 0 |  |  |  |  |  | $fn = dir($dest,$$fsR{accession}.".summary.txt"); | 
| 140 | 0 | 0 |  |  |  |  | open $fh, ">", $fn or croak $!; | 
| 141 |  |  |  |  |  |  |  | 
| 142 | 0 |  |  |  |  |  | print $fh "Accession\t $$fsR{accession}\n"; | 
| 143 | 0 |  |  |  |  |  | print $fh "Origin   \t $$fsR{origin}\n"; | 
| 144 | 0 |  |  |  |  |  | foreach my $ft (sort keys %$fsR){ | 
| 145 | 0 | 0 | 0 |  |  |  | next if ($ft =~ /total/ || $ft =~ /accession/ || $ft =~ /origin/); | 
|  |  |  | 0 |  |  |  |  | 
| 146 | 0 |  |  |  |  |  | print $fh "$ft\t$$fsR{$ft}\n"; | 
| 147 |  |  |  |  |  |  | } | 
| 148 | 0 |  |  |  |  |  | print $fh "Total\t$$fsR{total}\n"; | 
| 149 | 0 |  |  |  |  |  | close $fh; | 
| 150 |  |  |  |  |  |  | } | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | # get_fasta_ids($fa_in) | 
| 153 |  |  |  |  |  |  | # Returns an array with all fasta ids from a (multi)fasta file | 
| 154 |  |  |  |  |  |  | # | 
| 155 |  |  |  |  |  |  | # ARG1: fa_in | 
| 156 |  |  |  |  |  |  | sub get_fasta_ids { | 
| 157 | 0 |  |  | 0 | 1 |  | my $fa_in = shift; | 
| 158 | 0 | 0 |  |  |  |  | unless (defined $fa_in){ | 
| 159 | 0 |  |  |  |  |  | confess "Fasta input not available"; | 
| 160 |  |  |  |  |  |  | } | 
| 161 | 0 | 0 |  |  |  |  | $fastadb = Bio::DB::Fasta->new($fa_in) or die $!; | 
| 162 | 0 |  |  |  |  |  | @fastaids = $fastadb->ids; | 
| 163 | 0 |  |  |  |  |  | return @fastaids; | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | 1; | 
| 167 |  |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  | __END__ |