File Coverage

blib/lib/Bio/ToolBox/GeneTools.pm

Criterion	Covered	Total	%
statement	275	572	48.0
branch	122	362	33.7
condition	16	88	18.1
subroutine	30	38	78.9
pod	30	33	90.9
total	473	1093	43.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::GeneTools;
2							our $VERSION = '1.67';
3
4							=head1 NAME
5
6							Bio::ToolBox::GeneTools - SeqFeature agnostic methods for working with gene models
7
8							=head1 SYNOPSIS
9
10							use Bio::ToolBox::GeneTools qw(:all);
11
12							my $gene; # a SeqFeatureI compliant gene object obtained elsewhere
13							# for example, from Bio::DB::SeqFeature::Store database
14							# or parsed from a GFF3, GTF, or UCSC-style gene table using
15							# Bio::ToolBox parsers
16
17							if (is_coding($gene)) { # boolean test
18
19							# collect all exons from all transcripts in gene
20							my @exons = get_exons($gene);
21
22							# find just the alternate exons used only once
23							my @alternate_exons = get_alt_exons($gene);
24
25							# collect UTRs, which may not be defined in the original source
26							my @utrs;
27							foreach my $t (get_transcripts($gene)) {
28							my @u = get_utrs($t);
29							push @utrs, @u;
30							}
31							}
32
33
34							=head1 DESCRIPTION
35
36							This module provides numerous exportable functions for working with gene
37							SeqFeature models. This assumes that the gene models follow the BioPerl
38							L convention with nested SeqFeature objects representing the
39							gene, transcript, and exons. For example,
40
41							gene
42							transcript
43							exon
44							CDS
45
46							Depending upon how the SeqFeatures were generated or defined, subfeatures
47							may or may not be defined or be obvious. For example, UTRs or introns may
48							not be present. Furthermore, the C or type may not follow
49							Sequence Ontology terms. Regular expressions are deployed to handle
50							varying naming schemes and exceptions.
51
52							These functions should work with most or all L compliant
53							objects. It has been tested with L,
54							L, and L classes.
55
56							New SeqFeature objects that are generated use the same class for
57							simplicity and expediency.
58
59							=head1 METHODS
60
61							=head2 Function Import
62
63							None of the functions are exported by default. Specify which ones you want
64							when you import the module. Alternatively, use one of the tags below.
65
66							=over 4
67
68							=item :all
69
70							Import all of the methods.
71
72							=item :exon
73
74							Import all of the exon methods, including L, L,
75							L, L, and L.
76
77							=item :intron
78
79							Import all of the intron methods, including L, L,
80							L, L, and L.
81
82							=item :transcript
83
84							Import the transcript related methods, including L,
85							L, and L.
86
87							=item :cds
88
89							Import the CDS pertaining methods, including L, L,
90							L, L, L, and L.
91
92							=item :export
93
94							Import all of the export methods, including L, L,
95							L, and L;
96
97							=item :filter
98
99							Import all of the transcript filter methods, including L,
100							L, and L.
101
102							=back
103
104							=head2 Exon Methods
105
106							Functions to get a list of exons from a gene or transcript
107
108							=over 4
109
110							=item get_exons
111
112							my @exons = get_exons($gene);
113							my @exons = get_exons($transcript);
114
115							This will return an array or array reference of all the exon subfeatures in
116							the SeqFeature object, either gene or transcript. No discrimination whether
117							they are used once or more than once. Non-defined exons can be assembled from
118							CDS and/or UTR subfeatures. Exons are sorted by start coordinate.
119
120							=item get_alt_exons
121
122							my @alternate_exons = get_alt_exons($gene);
123
124							This will return an array or array reference of all the exon subfeatures for
125							a multi-transcript gene that are used only once in all of the transcripts.
126
127							=item get_common_exons
128
129							my @common_exons = get_common_exons($gene);
130
131							This will return an array or array reference of all the exon subfeatures for
132							a multi-transcript gene that are used in all of the transcripts.
133
134							=item get_uncommon_exons
135
136							my @uncommon_exons = get_uncommon_exons($gene);
137
138							This will return an array or array reference of all the exon subfeatures for
139							a multi-transcript gene that are used in some of the transcripts, i.e. more
140							than one but not all.
141
142							=item get_alt_common_exons
143
144							my %exon_hash = get_alt_common_exons($gene);
145
146							This will return a hash reference with several keys, including "common",
147							"uncommon", and each of the transcript IDs. Each key value is an array
148							reference with the exons for that category. The "common" will be all
149							common exons, "uncommon" will be uncommon exons (used more than once but
150							less than all), and each transcript ID will include their specific alternate
151							exons (used only once).
152
153							For genes with only a single transcript, all exons will be marked as "common"
154							for simplicity, although technically they could all be considered "alternate"
155							since they're only used once.
156
157							=back
158
159							=head2 Intron Methods
160
161							Functions to get a list of introns from a gene or transcript. Introns are
162							not usually defined in gene annotation files, but are inferred from the
163							exons and total gene or transcript length. In this case, new SeqFeature
164							elements are generated for each intron.
165
166							=over 4
167
168							=item get_introns
169
170							my @introns = get_introns($gene);
171							my @introns = get_introns($transcript);
172
173							This will return an array or array reference of all the intron subfeatures in
174							the SeqFeature object, either gene or transcript. No discrimination whether
175							they are used once or more than once. Non-defined introns can be assembled from
176							CDS andEor UTR subfeatures. Introns are sorted by start coordinate.
177
178							=item get_alt_introns
179
180							my @alternate_introns = get_alt_introns($gene);
181
182							This will return an array or array reference of all the intron subfeatures for
183							a multi-transcript gene that are used only once in all of the transcripts.
184
185							=item get_common_introns
186
187							my @common_introns = get_common_introns($gene);
188
189							This will return an array or array reference of all the intron subfeatures for
190							a multi-transcript gene that are used in all of the transcripts.
191
192							=item get_uncommon_introns
193
194							my @uncommon_introns = get_uncommon_introns($gene);
195
196							This will return an array or array reference of all the intron subfeatures for
197							a multi-transcript gene that are used in some of the transcripts, i.e. more
198							than one but not all.
199
200							=item get_alt_common_introns
201
202							my %intron_hash = get_alt_common_introns($gene);
203
204							This will return a hash reference with several keys, including "common",
205							"uncommon", and each of the transcript IDs. Each key value is an array
206							reference with the introns for that category. The "common" will be all
207							common introns, "uncommon" will be uncommon introns (used more than once but
208							less than all), and each transcript ID will include their specific alternate
209							introns (used only once).
210
211							For genes with only a single transcript, all introns will be marked as "common"
212							for simplicity, although technically they could all be considered "alternate"
213							since they're only used once.
214
215							=back
216
217							=head2 Transcript Methods
218
219							These methods work on transcripts, typically alternate transcripts from a
220							gene SeqFeature.
221
222							=over 4
223
224							=item get_transcripts
225
226							my @transcripts = get_transcripts($gene);
227
228							Returns an array or array reference of the transcripts associated with a
229							gene feature.
230
231							=item collapse_transcripts
232
233							my $new_transcript = collapse_transcripts($gene);
234							my $new_transcript = collapse_transcripts($transcript1, $transcript2);
235
236							This method will collapse all of the transcripts associated with a gene
237							SeqFeature into a single artificial transcript, merging exons as necessary
238							to maximize exon length and minimize introns. This is useful when
239							performing, for example, RNASeq analysis on genes. A single SeqFeature
240							transcript object is returned containing the merged exon subfeatures.
241
242							Pass either a gene or a list of transcripts to collapse.
243
244							=item get_transcript_length
245
246							my $length = get_transcript_length($transcript);
247
248							Calculates and returns the transcribed length of a transcript, i.e
249							the sum of its exon lengths. B If you pass a gene object, you
250							will get the maximum of all transcript exon lengths, which may not be
251							what you anticipate!
252
253							=back
254
255							=head2 CDS methods
256
257							These methods calculate values related to the coding sequence of the
258							mRNA transcripts.
259
260							=over 4
261
262							=item is_coding
263
264							if( is_coding($transcript) ) {
265							# do something
266							}
267
268							This method will return a boolean value (1 or 0) if the passed transcript object
269							appears to be a coding transcript. GFF and GTF files are not always immediately
270							clear about the type of transcript; there are (unfortunately) multiple ways
271							to encode the feature as a protein coding transcript: C,
272							C, GFF attribute, presence of CDS subfeatures, etc.
273							This method checks all of these possibilities. B: If you pass a
274							multi-transcript gene, only one transcript need to be coding to pass a true
275							value.
276
277							=item get_cds
278
279							my @cds = get_cds($transcript);
280
281							Returns the CDS subfeatures of the given transcript, if they are
282							defined. Returns either an array or array reference.
283
284							=item get_cdsStart
285
286							my $start = get_cdsStart($trancript);
287
288							Returns the start coordinate of the CDS for the given transcript.
289							Note that this is the leftmost (smallest) coordinate of the CDS
290							and not necessarily the coordinate of the start codon, similar to
291							what the UCSC gene tables report. Use the transcript strand to
292							determine the 5' end.
293
294							=item get_cdsEnd
295
296							my $end = get_cdsEnd($trancript);
297
298							Returns the stop coordinate of the CDS for the given transcript.
299							B that this is the rightmost (largest) coordinate of the CDS
300							and not necessarily the coordinate of the stop codon, similar to
301							what the UCSC gene tables report. Use the transcript strand to
302							determine which is the C<3'> and C<5'> end.
303
304							=item get_start_codon
305
306							my $start_codon = get_start_codon($trancript);
307
308							Returns a SeqFeature object representing the start codon. If one is
309							not explicitly defined in the hierarchy, then a new object is generated.
310
311							=item get_stop_codon
312
313							my $stop_codon = get_stop_codon($transcript);
314
315							Returns a SeqFeature object representing the stop codon. If one is
316							not defined in the hierarchy, then a new object is created. B that
317							this assumes that the stop codon is inclusive to the defined CDS, which is
318							the case with GFF3 and UCSC gene table derived features. On the other hand,
319							features derived from GTF is defined with the stop codon exclusive to the CDS.
320							This shouldn't matter with GTF, however, since GTF usually explicitly includes
321							stop codon features, whereas the other two formats do not.
322
323							=item get_transcript_cds_length
324
325							my $length = get_transcript_cds_length($transcript);
326
327							Calculates and returns the length of the coding sequence for a
328							transcript, i.e. the sum of the CDS lengths.
329
330							=item get_utrs
331
332							my @utrs = get_utrs($trancript);
333
334							Returns both C<5'> and C<3'> untranslated regions of the transcript. If these are
335							not defined in the SeqFeature subfeature hierarchy, then the coordinates will be
336							determined from from the exon and CDS subfeatures, if available, and new SeqFeature
337							objects generated. Non-coding transcripts will not return anything.
338
339							=item get_5p_utrs
340
341							my @5p_utrs = get_5p_utrs($trancript);
342
343							Returns only the C<5'> untranslated regions of the transcript.
344
345							=item get_3p_utrs($transcript)
346
347							my @3p_utrs = get_3p_utrs($trancript);
348
349							Returns only the C<3'> untranslated regions of the transcript.
350
351							=back
352
353							=head2 Export methods
354
355							These methods are used for exporting a gene andEor transcript model into
356							a text string based on the specified format.
357
358							=over 4
359
360							=item gff_string
361
362							my $string .= gff_string($gene, 1);
363							my $string .= gff_string($transcript, 1);
364
365							This is just a convenience method. SeqFeature objects based on
366							L, L, or L
367							have a C method, and this will simply call that method. SeqFeature
368							objects that do not have this method will, of course, cause the script to
369							terminate.
370
371							Pass the seqfeature object and a boolean value to recursively append all
372							subfeatures (e.g. exons) to the string. In most cases, this will generate a
373							GFF3-style string.
374
375							L also provides a simplified gff_string method.
376
377							=item gtf_string
378
379							my $string .= gtf_string($gene);
380							my $string .= gtf_string($transcript);
381
382							This will export a gene or transcript model as a series of GTF formatted
383							text lines, following the defined Gene Transfer Format (also known as GFF
384							version 2.5). It will ensure that each feature is properly tagged with the
385							C and C attributes.
386
387							This method will automatically recurse through all subfeatures.
388
389							=item ucsc_string
390
391							my $string = ucsc_string($gene);
392
393							This will export a gene or transcript model as a refFlat formatted Gene
394							Prediction line (11 columns). See L
395							for details. Multiple transcript genes are exported as multiple text lines
396							concatenated together.
397
398							=item bed_string
399
400							my $string = bed_string($gene);
401
402							This will export a gene or transcript model as a UCSC Bed formatted transcript
403							line (12 columns). See L
404							for details. Multiple transcript genes are exported as multiple text lines
405							concatenated together. Note that gene information is not preserved with Bed12
406							files; only the transcript name is used. The C value is set to 0.
407
408							=back
409
410							=head2 Filter methods
411
412							These methods are used to filter genes.
413
414							=over 4
415
416							=item filter_transcript_support_level
417
418							my $new_gene = filter_transcript_support_level($gene, 'best2');
419							my @good_transcripts = filter_transcript_support_level(\@transcripts);
420
421							This will filter a gene object for transcripts that match or exceed the
422							provided transcript support level. This assumes that the transcripts
423							contain the attribute tag 'transcript_support_level', which are present in
424							Ensembl provided GFF3 and GTF annotation files. The values are a digit (1-5),
425							or 'NA', where 1 is experimentally supported and 5 is entirely predicted
426							with no experimental evidence. See
427							L
428							for details.
429
430							Pass a gene SeqFeature object with one or more transcript subfeatures.
431							Alternatively, an array reference of transcripts could be passed as well.
432
433							A level may be provided as a second argument. The default is 'best'.
434
435							=over 4
436
437							=item best
438
439							Only the transcripts with the highest existing value will be retained.
440
441							=item bestEdigitE
442
443							All transcripts up to the indicated level are retained. For example,
444							'best3' would indicate that transcripts with support levels 1, 2, and 3
445							would be retained.
446
447							=item EdigitE
448
449							Only transcripts at the given level are retained.
450
451							=item NA
452
453							Only transcripts with 'NA' as the value are retained. These are typically
454							pseudogenes or single-exon transcripts.
455
456							=back
457
458							If none of the transcripts have the attribute, then all are returned
459							(nothing is filtered).
460
461							If a gene object was provided, a new gene object will be returned with
462							only the retained transcripts as subfeatures. If an array reference of
463							transcripts was provided, then an array reference of the filtered
464							transcripts is returned.
465
466							=item filter_transcript_gencode_basic
467
468							my $new_gene = filter_transcript_gencode_basic($gene);
469							my @good_transcripts = filter_transcript_gencode_basic(\@transcripts);
470
471							This will filter a gene object for transcripts for the Ensembl GENCODE
472							tag "basic", which indicates that a transcript is tagged as GENCODE Basic
473							transcript.
474
475							If a gene object was provided, a new gene object will be returned with
476							only the retained transcripts as subfeatures. If an array reference of
477							transcripts was provided, then an array reference of the filtered
478							transcripts is returned.
479
480							=item filter_transcript_biotype
481
482							my $new_gene = filter_transcript_gencode_basic($gene, $biotype);
483							my @good_transcripts = filter_transcript_gencode_basic(\@transcripts, 'miRNA');
484
485							This will filter a gene object for transcripts for specific biotype values
486							using the C or C attribute tags, commonly found
487							in Ensembl annotation.
488
489							If a gene object was provided, a new gene object will be returned with
490							only the retained transcripts as subfeatures. If an array reference of
491							transcripts was provided, then an array reference of the filtered
492							transcripts is returned.
493
494							=back
495
496							=head1 SEE ALSO
497
498							L, L,
499							L, L, L,
500							L, L, L
501
502							=cut
503
504	1			1		961	use strict;
	1					1
	1					26
505	1			1		3	use Carp qw(carp cluck croak confess);
	1					2
	1					5886
506							require Exporter;
507
508							### Export
509							our @ISA = qw(Exporter);
510							our @EXPORT = qw();
511							our @EXPORT_OK = qw(
512							get_exons
513							get_alt_exons
514							get_common_exons
515							get_uncommon_exons
516							get_alt_common_exons
517							get_introns
518							get_alt_introns
519							get_common_introns
520							get_uncommon_introns
521							get_alt_common_introns
522							get_transcripts
523							collapse_transcripts
524							get_transcript_length
525							is_coding
526							get_cds
527							get_cdsStart
528							get_cdsEnd
529							get_start_codon
530							get_stop_codon
531							get_transcript_cds_length
532							get_utrs
533							get_transcript_utr_length
534							get_5p_utrs
535							get_3p_utrs
536							get_transcript_5p_utr_length
537							get_transcript_3p_utr_length
538							gff_string
539							gtf_string
540							ucsc_string
541							bed_string
542							filter_transcript_support_level
543							filter_transcript_gencode_basic
544							filter_transcript_biotype
545							);
546							our %EXPORT_TAGS = (
547							all => \@EXPORT_OK,
548							exon => [ qw(
549							get_exons
550							get_alt_exons
551							get_common_exons
552							get_uncommon_exons
553							get_alt_common_exons
554							) ],
555							intron => [ qw(
556							get_introns
557							get_alt_introns
558							get_common_introns
559							get_uncommon_introns
560							get_alt_common_introns
561							) ],
562							transcript => [ qw(
563							get_transcripts
564							collapse_transcripts
565							get_transcript_length
566							) ],
567							cds => [ qw(
568							is_coding
569							get_cds
570							get_cdsStart
571							get_cdsEnd
572							get_start_codon
573							get_stop_codon
574							get_transcript_cds_length
575							) ],
576							utr => [ qw(
577							get_utrs
578							get_5p_utrs
579							get_3p_utrs
580							get_transcript_utr_length
581							get_transcript_5p_utr_length
582							get_transcript_3p_utr_length
583							) ],
584							export => [ qw(
585							gff_string
586							gtf_string
587							ucsc_string
588							bed_string
589							) ],
590							filter => [ qw(
591							filter_transcript_biotype
592							filter_transcript_gencode_basic
593							filter_transcript_support_level
594							)]
595							);
596
597
598							### The True Statement
599							1;
600
601
602
603							######## Exon Methods
604
605							sub get_exons {
606							# initialize
607	102			102	1	127	my $transcript = shift;
608	102	50				235	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
609	102					177	my @exons;
610							my @cdss;
611	102					0	my @transcripts;
612
613							# go through the subfeatures
614	102					183	foreach my $subfeat ($transcript->get_SeqFeatures) {
615	1372					1824	my $type = $subfeat->primary_tag;
616	1372	100				2892	if ($type =~ /exon/i) {
		50
		0
617	682					810	push @exons, $subfeat;
618							}
619							elsif ($type =~ /cds\|utr\|untranslated/i) {
620	690					858	push @cdss, $subfeat;
621							}
622							elsif ($type =~ /rna\|transcript/i) {
623	0					0	push @transcripts, $subfeat;
624							}
625							}
626
627							# check which array we'll use
628							# prefer to use actual exon subfeatures, but those may not be defined
629	102					153	my @list;
630	102	50				140	if (@exons) {
		0
		0
631	682					717	@list = map { $_->[0] }
632	1184					1193	sort { $a->[1] <=> $b->[1] }
633	102					140	map { [$_, $_->start] }
	682					898
634							@exons;
635							}
636							elsif (@cdss) {
637							# duplicate the CDSs as exons
638	0					0	foreach (@cdss) {
639	0					0	my $e = $_->duplicate;
640	0					0	$e->primary_tag('exon'); # reset tag
641	0					0	$e->phase('.'); # no phase
642	0					0	push @list, $e;
643							}
644
645							# make sure to merge adjacent exons
646	0					0	@list = map { $_->[0] } # must sort first
647	0					0	sort { $a->[1] <=> $b->[1] }
648	0					0	map { [$_, $_->start] }
	0					0
649							@list;
650	0					0	for (my $i = 0; $i < scalar @list; $i++) {
651	0	0				0	if (defined ($list[ $i + 1 ]) ) {
652	0	0				0	if ($list[$i+1]->start - $list[$i]->end <= 1) {
653							# need to merge
654	0					0	$list[$i]->end( $list[$i+1]->end );
655	0					0	splice(@list, $i + 1, 1); # remove the merged
656	0					0	$i--;
657							}
658							}
659							}
660							}
661							elsif (@transcripts) {
662	0					0	foreach my $t (@transcripts) {
663							# there are possibly duplicates in here if there are alternate transcripts
664							# should we remove them?
665	0					0	my @e = get_exons($t);
666	0					0	push @list, @e;
667							}
668	0					0	@list = map { $_->[0] }
669	0	0				0	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
670	0					0	map { [$_, $_->start, $_->end] }
	0					0
671							@list;
672							}
673							else {
674							# nothing found!
675	0					0	return;
676							}
677
678	102	50				304	return wantarray ? @list : \@list;
679							}
680
681							sub get_alt_exons {
682	1			1	1	5	my $ac_exons = get_alt_common_exons(@_);
683	1					2	my @alts;
684	1					4	foreach my $k (keys %$ac_exons) {
685	15	100				20	next if $k eq 'common';
686	14	100				16	next if $k eq 'uncommon';
687	13					11	push @alts, @{ $ac_exons->{$k} };
	13					16
688							}
689	1	50				2	if (@alts) {
690							# re-sort in genomic order
691	19					20	@alts = map {$_->[1]}
692	60					69	sort {$a->[0] <=> $b->[0]}
693	1					2	map { [$_->start, $_] } @alts;
	19					26
694							}
695	1	50				16	return wantarray ? @alts : \@alts;
696							}
697
698							sub get_common_exons {
699	1			1	1	508	my $ac_exons = get_alt_common_exons(@_);
700	1	50				5	return wantarray ? @{ $ac_exons->{common} } : $ac_exons->{common};
	1					27
701							}
702
703							sub get_uncommon_exons {
704	1			1	1	745	my $ac_exons = get_alt_common_exons(@_);
705	1	50				5	return wantarray ? @{ $ac_exons->{uncommon} } : $ac_exons->{uncommon};
	1					17
706							}
707
708							sub get_alt_common_exons {
709	3			3	1	10	return _get_alt_common_things(1, @_);
710							}
711
712
713							######## Intron Methods
714
715							sub get_introns {
716	42			42	1	1476	my $transcript = shift;
717	42	50				141	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
718	42					46	my @introns;
719
720							# find the exons and/or CDSs
721	42					71	my @exons = get_exons($transcript);
722	42	50				77	return unless @exons;
723	42	50				78	return if (scalar(@exons) == 1);
724
725							# identify the last exon index position
726	42					60	my $last = scalar(@exons) - 1;
727
728							# forward strand
729	42	50				79	if ($transcript->strand >= 0) {
730							# each intron is created based on the previous exon
731	42					97	for (my $i = 0; $i < $last; $i++) {
732	234					243	my $e = $exons[$i];
733	234					365	my $i = $e->new(
734							-seq_id => $e->seq_id,
735							-start => $e->end + 1,
736							-end => $exons[$i + 1]->start - 1, # up to start of next exon
737							-strand => $transcript->strand,
738							-primary_tag => 'intron',
739							-source_tag => $transcript->source_tag,
740							-primary_id => $transcript->display_name . ".intron$i",
741							-display_name => $transcript->display_name . ".intron$i",
742							);
743	234					603	push @introns, $i;
744							}
745							}
746
747							# reverse strand
748							else {
749							# each intron is created based on the previous exon
750							# ordering from 5' to 3' end direction for convenience in naming
751	0					0	for (my $i = $last; $i > 0; $i--) {
752	0					0	my $e = $exons[$i];
753	0					0	my $i = $e->new(
754							-seq_id => $e->seq_id,
755							-start => $exons[$i - 1]->end + 1, # end of next exon
756							-end => $e->start - 1,
757							-strand => $transcript->strand,
758							-primary_tag => 'intron',
759							-source_tag => $transcript->source_tag,
760							-primary_id => $transcript->display_name . ".intron$i",
761							-display_name => $transcript->display_name . ".intron$i",
762							);
763	0					0	push @introns, $i;
764							}
765
766							# reorder the introns based on start position
767	0	0				0	if (@introns) {
768	0					0	@introns = map { $_->[0] }
769	0	0				0	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
770	0					0	map { [$_, $_->start, $_->end] }
	0					0
771							@introns;
772							}
773							}
774
775							# finished
776	42	50				121	return wantarray ? @introns : \@introns;
777							}
778
779							sub get_alt_introns {
780	1			1	1	4	my $ac_introns = get_alt_common_introns(@_);
781	1					3	my @alts;
782	1					6	foreach my $k (keys %$ac_introns) {
783	15	100				20	next if $k eq 'common';
784	14	100				17	next if $k eq 'uncommon';
785	13					13	push @alts, @{ $ac_introns->{$k} };
	13					17
786							}
787	1	50				3	if (@alts) {
788							# re-sort in genomic order
789	8					10	@alts = map {$_->[1]}
790	16					18	sort {$a->[0] <=> $b->[0]}
791	1					3	map { [$_->start, $_] } @alts;
	8					13
792							}
793	1	50				16	return wantarray ? @alts : \@alts;
794							}
795
796							sub get_common_introns {
797	1			1	1	4	my $ac_introns = get_alt_common_introns(@_);
798	1	50				4	return wantarray ? @{ $ac_introns->{common} } : $ac_introns->{common};
	1					18
799							}
800
801							sub get_uncommon_introns {
802	1			1	1	757	my $ac_introns = get_alt_common_introns(@_);
803	1	50				3	return wantarray ? @{ $ac_introns->{uncommon} } : $ac_introns->{uncommon};
	1					12
804							}
805
806							sub get_alt_common_introns {
807	3			3	1	10	return _get_alt_common_things(0, @_);
808							}
809
810							sub _get_alt_common_things {
811							# internal subroutine to get either exons or introns
812	6			6		12	my $do_exon = shift; # true for exon, false for intron
813	6					11	my @transcripts;
814	6	50				17	return unless @_;
815	6	50				17	if (scalar @_ == 1) {
		0
816							# someone passed a gene, get the transcripts
817	6	50				31	confess "not a SeqFeature object!" unless ref($_[0]) =~ /seqfeature/i;
818	6					15	@transcripts = get_transcripts($_[0]);
819							}
820							elsif (scalar @_ > 1) {
821							# presume these are transcripts?
822	0					0	@transcripts = @_;
823							}
824
825							# hash of transcript to things
826	6					22	my %tx2things = (
827							common => [],
828							uncommon => []
829							);
830
831							# no transcripts provided?
832	6	50				15	return \%tx2things unless @transcripts;
833
834							# only one transcript provided?
835	6	50				15	if (scalar @transcripts == 1) {
836							# all exons are common by definition
837							# my $name = $transcripts[0]->display_name;
838	0	0				0	my @things = $do_exon ? get_exons($transcripts[0]) : get_introns($transcripts[0]);
839	0					0	$tx2things{common} = \@things;
840	0					0	return \%tx2things;
841							}
842
843							# get things and put them in has based on coordinates
844	6					9	my %pos2things;
845	6					12	foreach my $t (@transcripts) {
846	78	100				158	my @things = $do_exon ? get_exons($t) : get_introns($t);
847	78					119	foreach my $e (@things) {
848	471					741	my $new_e = $e->duplicate;
849	471					728	$pos2things{$e->start}{$e->end}{$t->display_name} = $new_e;
850							}
851	78					138	$tx2things{ $t->display_name } = [];
852							}
853
854							# put things into categories based on commonality
855							# associate things with unique transcripts, common, or uncommon sets
856	6					15	my $trx_number = scalar @transcripts;
857	6					43	foreach my $s (sort {$a <=> $b} keys %pos2things) { # sort on start
	416					410
858	126					115	foreach my $e (sort {$a <=> $b} keys %{ $pos2things{$s} }) { # sort on stop
	47					78
	126					248
859	165					184	my @names = keys %{ $pos2things{$s}{$e} };
	165					322
860	165	100				274	if (scalar @names == 1) {
		50
861							# only 1 thing, must be an alternate
862	81					74	push @{ $tx2things{$names[0]} }, $pos2things{$s}{$e}{$names[0]};
	81					189
863							}
864							elsif (scalar @names == $trx_number) {
865							# common to all transcripts, take the first one as example
866	0					0	push @{ $tx2things{common} }, $pos2things{$s}{$e}{$names[0]};
	0					0
867							}
868							else {
869							# common to some but not all transcripts, so uncommon
870	84					72	push @{ $tx2things{uncommon} }, $pos2things{$s}{$e}{$names[0]};
	84					155
871							}
872							}
873							}
874	6					299	return \%tx2things;
875							}
876
877
878
879							######## Transcript Methods
880
881							sub get_transcripts {
882	9			9	1	364	my $gene = shift;
883	9	50				23	return unless $gene;
884	9	50				36	confess "not a SeqFeature object!" unless ref($gene) =~ /seqfeature/i;
885	9	50				25	return $gene if ($gene->primary_tag =~ /rna\|transcript/i);
886	9					18	my @transcripts;
887							my @exons;
888	9					0	my @other;
889	9					25	foreach my $subf ($gene->get_SeqFeatures) {
890	106	50				140	if ($subf->primary_tag =~ /rna\|transcript\|\bprocessed/i) {
		0
891	106					148	push @transcripts, $subf;
892							}
893							elsif ($subf->primary_tag =~ /^(?:cds\|exon\|\w+codon)$/i) {
894	0					0	push @exons, $subf;
895							}
896							else {
897							# wierdo subfeature types like unprocessed_pseudogene
898	0					0	push @other, $subf;
899							}
900							}
901	9	50	33			49	if (not @transcripts and @exons) {
		50	33
			33
902							# some weirdly formatted annotation files skip the transcript
903							# looking at you SGD
904	0					0	my $transcript = $gene->new(
905							-seq_id => $gene->seq_id,
906							-start => $gene->start,
907							-end => $gene->end,
908							-strand => $gene->strand,
909							-primary_tag => 'transcript',
910							-source => $gene->source_tag,
911							-name => $gene->display_name,
912							-segments => \@exons,
913							);
914	0					0	push @transcripts, $transcript;
915							}
916							elsif (not @transcripts and not @exons and @other) {
917							# well, what choice do we have?
918							# we can assume these are transcripts, because what else could they be?
919	0					0	@transcripts = @other;
920							}
921	106					111	@transcripts = map { $_->[0] }
922	225	50				282	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
923	9					21	map { [$_, $_->start, $_->length] }
	106					154
924							@transcripts;
925	9	50				40	return wantarray ? @transcripts : \@transcripts;
926							}
927
928
929							sub collapse_transcripts {
930	1			1	1	2	my @transcripts;
931	1	50				4	return unless @_;
932	1					2	my $example = $_[0]; # parent transcript seqfeature to model new transcript on
933	1	50				5	if (scalar @_ == 1) {
		50
934							# someone passed a gene, get the transcripts
935	0					0	@transcripts = get_transcripts($_[0]);
936	0	0				0	return unless @transcripts;
937	0	0				0	return $transcripts[0] if scalar @transcripts == 1;
938							}
939							elsif (scalar @_ > 1) {
940	1					4	@transcripts = @_;
941							}
942
943							# get all useable exons to collapse
944	1					3	my @exons;
945	1					3	foreach my $t (@transcripts) {
946	13					17	my @e = get_exons($t);
947	13					30	push @exons, @e;
948							}
949
950							# check that we have exons - weirdo files may just have CDS!!!????
951	1	50				5	unless (@exons) {
952	0					0	foreach my $t (@transcripts) {
953	0					0	my @e = get_cds($t);
954	0					0	push @exons, @e;
955							}
956							}
957	1	50				3	return unless (@exons);
958
959							# sort all the exons
960	85					84	my @sorted = map { $_->[0] }
961	414	50				489	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
962	1					3	map { [$_, $_->start, $_->end] }
	85					114
963							@exons;
964
965							# build new exons from the original - don't keep cruft
966	1					10	my $next = shift @sorted;
967	1					2	my @new;
968	1					6	$new[0] = $next->new(
969							-seq_id => $next->seq_id,
970							-start => $next->start,
971							-end => $next->end,
972							-strand => $next->strand,
973							-primary_tag => 'exon',
974							);
975
976							# work through remaining exons, adding and merging as necessary
977	1					4	while (@sorted) {
978	84					97	$next = shift @sorted;
979	84					113	my ($ns, $ne) = ($next->start, $next->end); # new start & end
980	84					115	my ($os, $oe) = ($new[-1]->start, $new[-1]->end); # old start & end
981	84	100	100			278	if ($ns == $os and $ne > $oe) {
		50	100
		100	66
982							# same beginning, further end
983	5					8	$new[-1]->end($ne);
984							}
985							elsif ($ns > $os and $ns < $oe and $ne > $oe) {
986							# overlapping start, further end
987	0					0	$new[-1]->end($ne);
988							}
989							elsif ($ns > $oe) {
990							# completely new exon
991	13					24	push @new, $next->new(
992							-seq_id => $next->seq_id,
993							-start => $ns,
994							-end => $ne,
995							-strand => $next->strand,
996							-primary_tag => 'exon',
997							);
998							}
999							# all other possibilities we can skip
1000							}
1001
1002							# return the assembled transcript
1003	1					5	return $example->new(
1004							-seq_id => $example->seq_id,
1005							-start => $new[0]->start,
1006							-end => $new[-1]->end,
1007							-strand => $example->strand,
1008							-primary_tag => 'transcript',
1009							-source => $example->source_tag,
1010							-name => $example->display_name,
1011							-segments => \@new,
1012							);
1013							}
1014
1015							sub get_transcript_length {
1016	4			4	1	508	my $transcript = shift;
1017	4	50				26	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1018	4	50				14	if ($transcript->primary_tag =~ /gene$/i) {
1019							# someone passed a gene object!!!!
1020	0					0	my @lengths;
1021	0					0	foreach my $t (get_transcripts($transcript)) {
1022	0					0	push @lengths, get_transcript_length($t);
1023							}
1024							# return the longest transcript length
1025	0					0	return (sort {$b <=> $a} @lengths)[0];
	0					0
1026							}
1027	4					8	my $total = 0;
1028	4					15	foreach my $e (get_exons($transcript)) {
1029	33					53	$total += $e->length;
1030							}
1031	4					17	return $total;
1032							}
1033
1034
1035
1036							######## CDS Methods
1037
1038							sub is_coding {
1039	3			3	1	461	my $transcript = shift;
1040	3	50				11	return unless $transcript;
1041	3	50				36	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1042	3	50				8	if ($transcript->primary_tag =~ /gene$/i) {
1043							# someone passed a gene, check its subfeatures
1044	0					0	my $code_potential = 0;
1045	0					0	foreach ($transcript->get_SeqFeatures) {
1046	0					0	$code_potential += is_coding($_);
1047							}
1048	0					0	return $code_potential;
1049							}
1050	3	100				8	return 1 if $transcript->primary_tag =~ /mrna/i; # assumption
1051	2	50				9	return 1 if $transcript->source =~ /protein.?coding/i;
1052	2	50				8	if ($transcript->has_tag('transcript_biotype')) {
		50
		0
1053							# ensembl type GTFs
1054	0					0	my ($biotype) = $transcript->get_tag_values('transcript_biotype');
1055	0	0				0	return $biotype =~ /protein.?coding/i ? 1 : 0;
1056							}
1057							elsif ($transcript->has_tag('biotype')) {
1058							# ensembl type GFFs
1059	2					7	my ($biotype) = $transcript->get_tag_values('biotype');
1060	2	50				12	return $biotype =~ /protein.?coding/i ? 1 : 0;
1061							}
1062							elsif ($transcript->has_tag('gene_biotype')) {
1063							# ensembl type GTFs
1064							# must be careful here, gene_biotype of course pertains to gene,
1065							# and not necessarily this particular transcript
1066	0					0	my ($biotype) = $transcript->get_tag_values('gene_biotype');
1067	0	0				0	return 1 if $biotype =~ /protein.?coding/i;
1068							}
1069	0					0	foreach ($transcript->get_SeqFeatures) {
1070							# old fashioned way
1071	0	0				0	return 1 if $_->primary_tag eq 'CDS';
1072							}
1073	0					0	return 0;
1074							}
1075
1076							sub get_cds {
1077	13			13	1	1420	my $transcript = shift;
1078	13	50				30	return unless $transcript;
1079	13	50				39	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1080	13					16	my @cds;
1081	13					30	foreach my $subfeat ($transcript->get_SeqFeatures) {
1082	170	100				222	push @cds, $subfeat if $subfeat->primary_tag eq 'CDS';
1083							}
1084	13	100				32	return unless @cds;
1085	43					48	@cds = map { $_->[0] }
1086	70					73	sort { $a->[1] <=> $b->[1] }
1087	7					15	map { [$_, $_->start] }
	43					65
1088							@cds;
1089	7	100				27	return wantarray ? @cds : \@cds;
1090							}
1091
1092							sub get_cdsStart {
1093	4			4	1	2039	my $transcript = shift;
1094	4	50				23	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1095	4					11	my $cds = get_cds($transcript);
1096	4	100				16	return unless $cds;
1097	2	50				7	if ($transcript->strand >= 0) {
1098	2					5	return $cds->[0]->start;
1099							}
1100							else {
1101							# stop codons may or may not be not included
1102	0					0	my $codon = get_stop_codon($transcript);
1103	0	0				0	if ($codon) {
1104	0	0				0	return $codon->start < $cds->[0]->start ?
1105							$codon->start : $cds->[0]->start;
1106							}
1107							else {
1108	0					0	return $cds->[0]->start;
1109							}
1110							}
1111							}
1112
1113							sub get_cdsEnd {
1114	4			4	1	9	my $transcript = shift;
1115	4	50				22	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1116	4					9	my $cds = get_cds($transcript);
1117	4	100				12	return unless $cds;
1118	2	50				7	if ($transcript->strand >= 0) {
1119	2					9	my $codon = get_stop_codon($transcript);
1120	2	50				6	if ($codon) {
1121	2	50				5	return $codon->end > $cds->[-1]->end ? $codon->end : $cds->[-1]->end;
1122							}
1123							else {
1124	0					0	return $cds->[-1]->end;
1125							}
1126							}
1127							else {
1128	0					0	return $cds->[-1]->end;
1129							}
1130							}
1131
1132							sub get_transcript_cds_length {
1133	4			4	1	9	my $transcript = shift;
1134	4	50				22	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1135	4					6	my $total = 0;
1136	4					12	foreach my $subf ($transcript->get_SeqFeatures) {
1137	53	100				97	next unless $subf->primary_tag eq 'CDS';
1138	13					24	$total += $subf->length;
1139							}
1140	4					16	return $total;
1141							}
1142
1143							sub get_start_codon {
1144	0			0	1	0	my $transcript = shift;
1145	0	0				0	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1146	0					0	my $start_codon;
1147
1148							# look for existing one
1149	0					0	foreach my $subfeat ($transcript->get_SeqFeatures) {
1150	0	0				0	$start_codon = $subfeat if $subfeat->primary_tag =~ /start.?codon/i;
1151							}
1152	0	0				0	return $start_codon if $start_codon;
1153
1154							# otherwise we have to build one
1155	0					0	my $cdss = get_cds($transcript);
1156	0	0				0	return unless $cdss;
1157	0	0				0	if ($transcript->strand >= 0) {
1158	0					0	$start_codon = $transcript->new(
1159							-seq_id => $transcript->seq_id,
1160							-source => $transcript->source,
1161							-primary_tag => 'start_codon',
1162							-start => $cdss->[0]->start,
1163							-end => $cdss->[0]->start + 2,
1164							-strand => 1,
1165							-phase => 0,
1166							-primary_id => $transcript->primary_id . '.start_codon',
1167							);
1168							}
1169							else {
1170	0					0	$start_codon = $transcript->new(
1171							-seq_id => $transcript->seq_id,
1172							-source => $transcript->source,
1173							-primary_tag => 'start_codon',
1174							-start => $cdss->[-1]->end - 2,
1175							-end => $cdss->[-1]->end,
1176							-strand => -1,
1177							-phase => 0,
1178							-primary_id => $transcript->primary_id . '.start_codon',
1179							);
1180							}
1181	0					0	return $start_codon;
1182							}
1183
1184							sub get_stop_codon {
1185	2			2	1	3	my $transcript = shift;
1186	2	50				10	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1187	2					3	my $stop_codon;
1188
1189							# look for existing one
1190	2					6	foreach my $subfeat ($transcript->get_SeqFeatures) {
1191	37	50				44	$stop_codon = $subfeat if $subfeat->primary_tag =~ /stop.?codon/i;
1192							}
1193	2	50				5	return $stop_codon if $stop_codon;
1194
1195							# otherwise we have to build one
1196							# this entirely presumes that the stop codon is inclusive to the last cds
1197							# this is the case with GFF3 and UCSC tables, but not GTF
1198	2					5	my $cdss = get_cds($transcript);
1199	2	50				4	return unless $cdss;
1200	2	50				5	if ($transcript->strand >= 0) {
1201	2					7	$stop_codon = $transcript->new(
1202							-seq_id => $transcript->seq_id,
1203							-source => $transcript->source,
1204							-primary_tag => 'stop_codon',
1205							-start => $cdss->[-1]->end - 2,
1206							-end => $cdss->[-1]->end,
1207							-strand => 1,
1208							-phase => 0,
1209							-primary_id => $transcript->primary_id . '.stop_codon',
1210							);
1211							}
1212							else {
1213	0					0	$stop_codon = $transcript->new(
1214							-seq_id => $transcript->seq_id,
1215							-source => $transcript->source,
1216							-primary_tag => 'stop_codon',
1217							-start => $cdss->[0]->start,
1218							-end => $cdss->[0]->start + 2,
1219							-strand => -1,
1220							-phase => 0,
1221							-primary_id => $transcript->primary_id . '.stop_codon',
1222							);
1223							}
1224	2					6	return $stop_codon;
1225							}
1226
1227							sub get_utrs {
1228	13			13	1	21	my $transcript = shift;
1229	13	50				36	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1230
1231							# collect the various types of subfeatures
1232	13					39	my @exons;
1233							my @cdss;
1234	13					0	my @utrs;
1235	13					0	my @transcripts;
1236	13					30	foreach my $subfeat ($transcript->get_SeqFeatures) {
1237	224					309	my $type = $subfeat->primary_tag;
1238	224	100				477	if ($type =~ /exon/i) {
		100
		50
		0
1239	104					128	push @exons, $subfeat;
1240							}
1241							elsif ($type =~ /cds/i) {
1242	78					96	push @cdss, $subfeat;
1243							}
1244							elsif ($type =~ /utr\|untranslated/i) {
1245	42					57	push @utrs, $subfeat;
1246							}
1247							elsif ($type =~ /rna\|transcript/i) {
1248	0					0	push @transcripts, $subfeat;
1249							}
1250							}
1251
1252							# collect the utrs into final list
1253	13					18	my @list;
1254	13	100	33			31	if (@utrs) {
		50
		50
1255							# good, we don't have to do any work
1256	12					20	@list = @utrs;
1257							}
1258							elsif (@transcripts) {
1259							# someone must've passed us a gene
1260	0					0	foreach my $t (@transcripts) {
1261	0					0	my @u = get_utrs($t);
1262	0					0	push @list, @u;
1263							}
1264							}
1265							elsif (@exons and @cdss) {
1266							# calculate the utrs ourselves
1267
1268	0					0	@exons = map { $_->[0] }
1269	0					0	sort { $a->[1] <=> $b->[1] }
1270	0					0	map { [$_, $_->start] }
	0					0
1271							@exons;
1272	0					0	@cdss = map { $_->[0] }
1273	0					0	sort { $a->[1] <=> $b->[1] }
1274	0					0	map { [$_, $_->start] }
	0					0
1275							@cdss;
1276	0					0	my $firstCDS = $cdss[0];
1277	0					0	my $lastCDS = $cdss[-1];
1278	0					0	while (@exons) {
1279	0					0	my $exon = shift @exons;
1280	0	0	0			0	if ($exon->end < $firstCDS->start) {
		0
		0
		0
		0
1281							# whole exon is UTR
1282	0					0	my $utr = $exon->duplicate;
1283	0	0				0	$utr->primary_tag(
1284							$transcript->strand >= 0 ? 'five_prime_UTR' : 'three_prime_UTR' );
1285	0					0	$utr->display_name( $exon->display_name . '.utr' );
1286	0					0	push @list, $utr;
1287							}
1288							elsif ($exon->overlaps($firstCDS)) {
1289							# partial UTR on left side
1290	0					0	my $pieces = $exon->subtract($firstCDS); # array ref of pieces
1291	0	0				0	next unless $pieces;
1292	0					0	my $utr = $pieces->[0]; # we will want the first one if there are two
1293	0	0				0	$utr->primary_tag(
1294							$transcript->strand >= 0 ? 'five_prime_UTR' : 'three_prime_UTR' );
1295	0					0	$utr->display_name($exon->display_name . '.utr');
1296	0					0	$utr->strand($exon->strand);
1297	0					0	$utr->source($exon->strand);
1298	0					0	push @list, $utr;
1299							}
1300							elsif ($exon->start > $firstCDS->end and $exon->end < $lastCDS->start) {
1301							# CDS exon
1302	0					0	next;
1303							}
1304							elsif ($exon->overlaps($lastCDS)) {
1305							# partial UTR
1306	0					0	my $pieces = $exon->subtract($lastCDS); # array ref of pieces
1307	0	0				0	next unless $pieces;
1308	0					0	my $utr = $pieces->[-1]; # we will want the second one if there are two
1309	0	0				0	$utr->primary_tag(
1310							$transcript->strand >= 0 ? 'three_prime_UTR' : 'five_prime_UTR' );
1311	0					0	$utr->display_name($exon->display_name . '.utr');
1312	0					0	$utr->strand($exon->strand);
1313	0					0	$utr->source($exon->strand);
1314	0					0	push @list, $utr;
1315							}
1316							elsif ($exon->start > $lastCDS->end) {
1317							# whole exon is UTR
1318	0					0	my $utr = $exon->duplicate;
1319	0	0				0	$utr->primary_tag(
1320							$transcript->strand >= 0 ? 'three_prime_UTR' : 'five_prime_UTR' );
1321	0					0	$utr->display_name( $exon->display_name . '.utr' );
1322	0					0	push @list, $utr;
1323							}
1324							else {
1325							# geometric error
1326	0					0	croak " programmer geometric error!";
1327							}
1328							}
1329							}
1330							else {
1331							# nothing usable found to identify UTRs
1332	1					4	return;
1333							}
1334
1335							# we have our list
1336	12	100				33	return wantarray ? @list : \@list;
1337							}
1338
1339							sub get_transcript_utr_length {
1340	2			2	0	4	my $transcript = shift;
1341	2	50				13	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1342	2					4	my $utrs = get_utrs($transcript);
1343	2					4	my $total = 0;
1344	2					4	foreach my $utr (@$utrs) {
1345	7					11	$total += $utr->length;
1346							}
1347	2					8	return $total;
1348							}
1349
1350							sub get_5p_utrs {
1351	4			4	1	470	my $transcript = shift;
1352	4	50				23	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1353
1354							# get all UTRs
1355	4					13	my $utrs = get_utrs($transcript);
1356	4	50				11	return unless scalar(@$utrs);
1357
1358	4					10	my @fivers = grep { $_->primary_tag =~ /5\|five/i } @$utrs;
	14					22
1359	4	100				15	return wantarray ? @fivers : \@fivers;
1360							}
1361
1362							sub get_3p_utrs {
1363	4			4	1	7	my $transcript = shift;
1364	4	50				21	confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1365
1366							# get all UTRs
1367	4					8	my $utrs = get_utrs($transcript);
1368	4	50				10	return unless scalar(@$utrs);
1369
1370	4					7	my @threes = grep { $_->primary_tag =~ /3\|three/i } @$utrs;
	14					847
1371	4	100				14	return wantarray ? @threes : \@threes;
1372							}
1373
1374							sub get_transcript_5p_utr_length {
1375	2			2	0	502	my $transcript = shift;
1376	2					8	my $utrs = get_5p_utrs($transcript);
1377	2					4	my $total = 0;
1378	2					6	foreach my $utr (@$utrs) {
1379	3					9	$total += $utr->length;
1380							}
1381	2					8	return $total;
1382							}
1383
1384							sub get_transcript_3p_utr_length {
1385	2			2	0	6	my $transcript = shift;
1386	2					4	my $utrs = get_3p_utrs($transcript);
1387	2					2	my $total = 0;
1388	2					5	foreach my $utr (@$utrs) {
1389	4					10	$total += $utr->length;
1390							}
1391	2					6	return $total;
1392							}
1393
1394
1395							#### Export methods
1396
1397							sub gff_string {
1398							# Bio::ToolBox::SeqFeature and Bio::SeqFeature::Lite objects have this method
1399							# otherwise this will die
1400	0			0	1	0	return shift->gff_string(@_);
1401							}
1402
1403							sub gtf_string {
1404	0			0	1	0	my $feature = shift;
1405	0		0			0	my $gene = shift \|\| undef; # only present when recursing
1406	0	0				0	confess "not a SeqFeature object!" unless ref($feature) =~ /seqfeature/i;
1407
1408							# process a gene
1409	0	0	0			0	if ($feature->primary_tag =~ /gene$/i and not defined $gene) {
1410	0					0	my $string;
1411	0					0	foreach my $t (get_transcripts($feature)) {
1412	0					0	$string .= gtf_string($t, $feature);
1413							}
1414	0					0	return $string;
1415							}
1416
1417							# check that we have transcribed feature with exons
1418	0					0	my @exons = get_exons($feature);
1419	0	0				0	return unless @exons; # no exon subfeatures? must not be a transcript....
1420
1421							# mandatory identifiers
1422	0					0	my ($gene_id, $gene_name, $gene_biotype);
1423	0	0				0	if ($gene) {
1424	0		0			0	$gene_id = $gene->primary_id \|\| $gene->display_name;
1425	0					0	$gene_name = $gene->display_name;
1426	0		0			0	($gene_biotype) = $gene->get_tag_values('gene_biotype') \|\|
1427							$gene->get_tag_values('biotype') \|\| undef;
1428							}
1429							else {
1430							# these attributes might still be present for transcripts
1431	0		0			0	($gene_id) = $feature->get_tag_values('gene_id') \|\| undef;
1432	0		0			0	($gene_name) = $feature->get_tag_values('gene_name') \|\| undef;
1433	0		0			0	($gene_biotype) = $feature->get_tag_values('gene_biotype') \|\| undef;
1434							}
1435	0		0			0	my $trx_id = $feature->primary_id \|\| $feature->display_name;
1436	0					0	my $trx_name = $feature->display_name;
1437	0					0	my $group = sprintf(
1438							"gene_id \"%s\"; transcript_id \"%s\"; gene_name \"%s\"; transcript_name \"%s\";",
1439							$gene_id, $trx_id, $gene_name, $trx_name);
1440
1441							# add additional transcript attributes that might be interesting to keep
1442	0	0				0	if ($gene_biotype) {
1443	0					0	$group .= " gene_biotype \"$gene_biotype\";";
1444							}
1445	0		0			0	my ($biotype) = $feature->get_tag_values('transcript_biotype') \|\|
1446							$feature->get_tag_values('biotype');
1447	0	0				0	if ($biotype) {
1448	0					0	$group .= " transcript_biotype \"$biotype\";" ;
1449							}
1450	0					0	my ($tsl) = $feature->get_tag_values('transcript_support_level');
1451	0	0				0	if ($tsl) {
1452	0					0	$group .= " transcript_support_level \"$tsl\";";
1453							}
1454
1455							# skip transcript as it is technically not part of the GTF standard....
1456
1457							# convert exon subfeatures collected above
1458	0					0	my $string;
1459	0					0	my @cds = get_cds($feature);
1460	0					0	push @cds, get_stop_codon($feature);
1461	0					0	push @cds, get_start_codon($feature);
1462	0					0	@exons = map { $_->[0] }
1463	0	0				0	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1464	0					0	map { [$_, $_->start, $_->end] } ( @exons, @cds );
	0					0
1465	0					0	foreach my $subf (@exons) {
1466	0	0	0			0	$string .= join("\t", (
		0	0
		0
1467							$subf->seq_id \|\| '.',
1468							$subf->source_tag \|\| '.',
1469							$subf->primary_tag,
1470							$subf->start,
1471							$subf->end,
1472							defined $subf->score ? $subf->score : '.',
1473							$subf->strand < 0 ? '-' : '+',
1474							defined $subf->phase ? $subf->phase : '.',
1475							"$group\n"
1476							) );
1477							}
1478
1479							# finished
1480	0					0	return $string;
1481							}
1482
1483							sub ucsc_string {
1484	0			0	1	0	my $feature = shift;
1485	0	0				0	confess "not a SeqFeature object!" unless ref($feature) =~ /seqfeature/i;
1486	0					0	my @ucsc_list;
1487
1488							# process according to type
1489	0	0				0	if ($feature->primary_tag =~ /gene$/i) {
		0
1490							# a gene object, we will need to process it's transcript subfeatures
1491	0					0	foreach my $transcript (get_transcripts($feature)) {
1492	0					0	my $ucsc = _process_ucsc_transcript($transcript, $feature);
1493	0	0				0	push @ucsc_list, $ucsc if $ucsc;
1494							}
1495							}
1496							elsif ($feature->primary_tag =~ /rna\|transcript/i) {
1497							# some sort of RNA transcript
1498	0					0	my $ucsc = _process_ucsc_transcript($feature);
1499	0	0				0	push @ucsc_list, $ucsc if $ucsc;
1500							}
1501							else {
1502	0					0	return;
1503							}
1504
1505							# return strings
1506	0					0	my $string;
1507	0					0	foreach my $ucsc (@ucsc_list) {
1508							$string .= sprintf("%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s\t%s\n",
1509							$ucsc->{'name2'},
1510							$ucsc->{'name'},
1511							$ucsc->{'chr'},
1512							$ucsc->{'strand'},
1513							$ucsc->{'txStart'},
1514							$ucsc->{'txEnd'},
1515							$ucsc->{'cdsStart'},
1516							$ucsc->{'cdsEnd'},
1517							$ucsc->{'exonCount'},
1518	0					0	join(",", @{$ucsc->{'exonStarts'}} ) . ',',
1519	0					0	join(",", @{$ucsc->{'exonEnds'}} ) . ',',
	0					0
1520							);
1521							}
1522	0					0	return $string;
1523							}
1524
1525							sub bed_string {
1526	0			0	1	0	my $feature = shift;
1527	0	0				0	confess "not a SeqFeature object!" unless ref($feature) =~ /seqfeature/i;
1528	0					0	my @ucsc_list;
1529
1530							# process according to type
1531	0	0				0	if ($feature->primary_tag =~ /gene$/i) {
		0
1532							# a gene object, we will need to process it's transcript subfeatures
1533	0					0	foreach my $transcript (get_transcripts($feature)) {
1534	0					0	my $ucsc = _process_ucsc_transcript($transcript, $feature);
1535	0	0				0	push @ucsc_list, $ucsc if $ucsc;
1536							}
1537							}
1538							elsif ($feature->primary_tag =~ /rna\|transcript/i) {
1539							# some sort of RNA transcript
1540	0					0	my $ucsc = _process_ucsc_transcript($feature);
1541	0	0				0	push @ucsc_list, $ucsc if $ucsc;
1542							}
1543							else {
1544	0					0	return;
1545							}
1546
1547							# return strings
1548	0					0	my $string;
1549	0					0	foreach my $ucsc (@ucsc_list) {
1550							# exon sizes
1551	0					0	my @sizes;
1552	0					0	for (my $i = 0; $i < scalar( @{$ucsc->{'exonStarts'}} ); $i++) {
	0					0
1553	0					0	push @sizes, $ucsc->{'exonEnds'}->[$i] - $ucsc->{'exonStarts'}->[$i];
1554							}
1555							$string .= sprintf("%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
1556							$ucsc->{'chr'},
1557							$ucsc->{'txStart'},
1558							$ucsc->{'txEnd'},
1559							$ucsc->{'name'},
1560							$ucsc->{'score'},
1561							$ucsc->{'strand'},
1562							$ucsc->{'cdsStart'},
1563							$ucsc->{'cdsEnd'},
1564							0,
1565							$ucsc->{'exonCount'},
1566							join(",", @sizes),
1567	0					0	join(",", map {$_ - $ucsc->{'txStart'} } @{$ucsc->{'exonStarts'}} ),
	0					0
	0					0
1568							);
1569							}
1570	0					0	return $string;
1571							}
1572
1573
1574							#### filter methods
1575
1576							sub filter_transcript_support_level {
1577	0			0	1	0	my $gene = shift;
1578	0	0				0	return unless $gene;
1579	0		0			0	my $min_tsl = shift \|\| 'best';
1580	0					0	my @list = qw(1 2 3 4 5 NA Missing);
1581
1582							# get transcripts
1583	0					0	my @transcripts;
1584	0	0				0	if (ref($gene) =~ /seqfeature/i) {
		0
1585	0					0	@transcripts = get_transcripts($gene);
1586							}
1587							elsif (ref($gene) eq 'ARRAY') {
1588	0					0	@transcripts = @$gene;
1589							}
1590							else {
1591	0					0	return;
1592							}
1593	0	0				0	return unless @transcripts;
1594
1595							# categorize transcripts
1596	0					0	my %results = map { $_ => [] } @list;
	0					0
1597	0					0	foreach my $t (@transcripts) {
1598	0					0	my ($tsl) = $t->get_tag_values('transcript_support_level');
1599	0		0			0	$tsl \|\|= 'Missing'; # default in case nothing is present
1600	0					0	push @{ $results{$tsl} }, $t;
	0					0
1601							}
1602
1603							# take appropriate transcripts
1604	0					0	my @keepers;
1605	0	0				0	if ($min_tsl eq 'best') {
		0
		0
		0
1606							# progress through all levels and keep only the highest set
1607	0					0	foreach my $tsl (@list) {
1608	0	0				0	if (scalar @{ $results{$tsl} }) {
	0					0
1609	0					0	@keepers = @{ $results{$tsl} };
	0					0
1610	0					0	last;
1611							}
1612							}
1613							}
1614							elsif ($min_tsl =~ /^best(\d)$/) {
1615							# progress through all levels and take all up to specified max
1616	0					0	my $max = $1;
1617	0					0	foreach my $tsl (1 .. 5) {
1618	0	0				0	if (scalar @{ $results{$tsl} }) {
	0					0
1619	0	0	0			0	if ($tsl <= $max) {
		0
1620	0					0	push @keepers, @{ $results{$tsl} };
	0					0
1621							}
1622							elsif (not @keepers and $tsl > $max) {
1623							# go ahead and take it if we have no other option
1624	0					0	push @keepers, @{ $results{$tsl} };
	0					0
1625							}
1626							}
1627							}
1628							# still take the NA and Missing if nothing else
1629	0	0	0			0	if (scalar @keepers == 0 and scalar @{ $results{'NA'} }) {
	0	0	0			0
1630	0					0	@keepers = @{ $results{'NA'} };
	0					0
1631							}
1632	0					0	elsif (scalar @keepers == 0 and scalar @{ $results{'Missing'} }) {
1633	0					0	@keepers = @{ $results{'Missing'} };
	0					0
1634							}
1635							}
1636							elsif ($min_tsl =~ /^\d$/) {
1637							# take only the specified level
1638	0					0	@keepers = @{ $results{$min_tsl} };
	0					0
1639							}
1640							elsif ($min_tsl eq 'NA') {
1641							# take only the NA guys
1642	0					0	@keepers = @{ $results{'NA'} };
	0					0
1643							}
1644							else {
1645	0					0	confess "unrecognized minimum TSL value '$min_tsl' Check the documentation!";
1646							}
1647	0	0				0	@keepers = @{ $results{'Missing'} } unless @keepers;
	0					0
1648
1649							# return
1650	0					0	return _return_filtered_transcripts($gene, \@keepers);
1651							}
1652
1653							sub filter_transcript_gencode_basic {
1654	0			0	1	0	my $gene = shift;
1655	0	0				0	return unless $gene;
1656
1657							# get transcripts
1658	0					0	my @transcripts;
1659	0	0				0	if (ref($gene) =~ /seqfeature/i) {
		0
1660	0					0	@transcripts = get_transcripts($gene);
1661							}
1662							elsif (ref($gene) eq 'ARRAY') {
1663	0					0	@transcripts = @$gene;
1664							}
1665							else {
1666	0					0	return;
1667							}
1668	0	0				0	return unless @transcripts;
1669
1670							# take appropriate transcripts
1671	0					0	my @keepers;
1672	0					0	foreach my $t (@transcripts) {
1673	0					0	my ($basic) = $t->get_tag_values('tag');
1674	0	0	0			0	if ($basic and $basic eq 'basic') {
1675	0					0	push @keepers, $t;
1676							}
1677							}
1678
1679							# return
1680	0					0	return _return_filtered_transcripts($gene, \@keepers);
1681							}
1682
1683
1684							sub filter_transcript_biotype {
1685	1			1	1	550	my $gene = shift;
1686	1	50				3	return unless $gene;
1687	1					2	my $check = shift;
1688	1	50				2	confess "no biotype value to check provided!" unless $check;
1689
1690							# get transcripts
1691	1					2	my @transcripts;
1692	1	50				6	if (ref($gene) =~ /seqfeature/i) {
		0
1693	1					4	@transcripts = get_transcripts($gene);
1694							}
1695							elsif (ref($gene) eq 'ARRAY') {
1696	0					0	@transcripts = @$gene;
1697							}
1698							else {
1699	0					0	return;
1700							}
1701	1	50				3	return unless @transcripts;
1702
1703							# take appropriate transcripts
1704	1					2	my @keepers;
1705	1					3	foreach my $t (@transcripts) {
1706	13		33			23	my ($value) = $t->get_tag_values('transcript_biotype') \|\|
1707							$t->get_tag_values('biotype');
1708	13		33			22	$value \|\|= $t->primary_tag;
1709	13	100	66			44	if ($value and $value =~ /$check/i) {
1710	2					4	push @keepers, $t;
1711							}
1712							}
1713
1714							# return
1715	1					4	return _return_filtered_transcripts($gene, \@keepers);
1716							}
1717
1718
1719
1720
1721							#### internal methods
1722
1723							sub _process_ucsc_transcript {
1724	0			0		0	my $transcript = shift;
1725	0		0			0	my $gene = shift \|\| undef;
1726
1727							# initialize ucsc hash
1728	0	0	0			0	my $ucsc = {
			0
1729							'name' => $transcript->display_name \|\| $transcript->primary_id,
1730							'name2' => undef,
1731							'chr' => $transcript->seq_id,
1732							'strand' => $transcript->strand < 0 ? '-' : '+',
1733							'txStart' => $transcript->start - 1,
1734							'txEnd' => $transcript->end,
1735							'cdsStart' => undef,
1736							'cdsEnd' => undef,
1737							'exonCount' => 0,
1738							'exonStarts' => [],
1739							'exonEnds' => [],
1740							'score' => $transcript->score \|\| 1000,
1741							};
1742
1743							# determine gene name
1744	0	0				0	if ($gene) {
1745							# use provided gene name
1746	0		0			0	$ucsc->{'name2'} = $gene->display_name \|\| $gene->primary_id;
1747							}
1748							else {
1749							# reuse the name
1750	0					0	$ucsc->{'name2'} = $ucsc->{'name'};
1751							}
1752
1753							# record CDS points
1754	0	0				0	if (is_coding($transcript)) {
1755	0					0	my $start = get_cdsStart($transcript);
1756	0					0	my $stop = get_cdsEnd($transcript);
1757	0	0	0			0	if ($start and $stop) {
1758	0					0	$ucsc->{cdsStart} = $start - 1;
1759	0					0	$ucsc->{cdsEnd} = $stop;
1760							}
1761							else {
1762							# if we don't have cds start/stop, then assign to transcript end
1763							# as if it is a noncoding transcript, regardless of primary_tag
1764	0					0	$ucsc->{cdsStart} = $transcript->end;
1765	0					0	$ucsc->{cdsEnd} = $transcript->end;
1766							}
1767							}
1768							else {
1769							# non-coding transcript sets the cds points to the transcript end
1770	0					0	$ucsc->{cdsStart} = $transcript->end;
1771	0					0	$ucsc->{cdsEnd} = $transcript->end;
1772							}
1773
1774							# record the exons
1775	0					0	foreach my $exon (get_exons($transcript)) {
1776	0					0	push @{ $ucsc->{'exonStarts'} }, $exon->start - 1;
	0					0
1777	0					0	push @{ $ucsc->{'exonEnds'} }, $exon->end;
	0					0
1778	0					0	$ucsc->{'exonCount'} += 1;
1779							}
1780
1781	0					0	return $ucsc;
1782							}
1783
1784							sub _return_filtered_transcripts {
1785	1			1		3	my ($gene, $keepers) = @_;
1786
1787	1	50				4	if (ref($gene) =~ /seqfeature/i) {
1788							# first check if we were only given a transcript
1789	1	50				2	if ($gene->primary_tag =~ /transcript\|rna/i) {
1790							# we must have been given a single transcript to check, so return it
1791	0		0			0	$keepers->[0] \|\|= undef;
1792	0					0	return $keepers->[0];
1793							}
1794
1795							# we can't delete subfeatures, so we're forced to create a new
1796							# parent gene and reattach the filtered transcripts
1797	1					3	my %attributes = $gene->attributes;
1798	1					5	return $gene->new(
1799							-seq_id => $gene->seq_id,
1800							-start => $gene->start,
1801							-end => $gene->end,
1802							-strand => $gene->strand,
1803							-primary_tag => $gene->primary_tag,
1804							-source => $gene->source_tag,
1805							-name => $gene->display_name,
1806							-id => $gene->primary_id,
1807							-attributes => \%attributes,
1808							-segments => $keepers,
1809							);
1810							}
1811							else {
1812	0						return $keepers;
1813							}
1814							}
1815
1816
1817
1818							__END__