File Coverage

blib/lib/Bio/ToolBox/Data/Feature.pm

Criterion	Covered	Total	%
statement	310	766	40.4
branch	144	494	29.1
condition	51	310	16.4
subroutine	31	51	60.7
pod	35	37	94.5
total	571	1658	34.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::Data::Feature;
2							our $VERSION = '1.691';
3
4							=head1 NAME
5
6							Bio::ToolBox::Data::Feature - Objects representing rows in a data table
7
8							=head1 DESCRIPTION
9
10							A Bio::ToolBox::Data::Feature is an object representing a row in the
11							data table. Usually, this in turn represents an annotated feature or
12							segment in the genome. As such, this object provides convenient
13							methods for accessing and manipulating the values in a row, as well as
14							methods for working with the represented genomic feature.
15
16							This class should NOT be used directly by the user. Rather, Feature
17							objects are generated from a Bio::ToolBox::Data::Iterator object
18							(generated itself from the L
19							function in Bio::ToolBox::Data), or the L
20							function in Bio::ToolBox::Data. Please see the respective documentation
21							for more information.
22
23							Example of working with a stream object.
24
25							my $Data = Bio::ToolBox::Data->new(file => $file);
26
27							# stream method
28							my $stream = $Data->row_stream;
29							while (my $row = $stream->next_row) {
30							# each $row is a Bio::ToolBox::Data::Feature object
31							# representing the row in the data table
32							my $value = $row->value($index);
33							# do something with $value
34							}
35
36							# iterate method
37							$Data->iterate( sub {
38							my $row = shift;
39							my $number = $row->value($index);
40							my $log_number = log($number);
41							$row->value($index, $log_number);
42							} );
43
44
45							=head1 METHODS
46
47							=head2 General information methods
48
49							=over 4
50
51							=item row_index
52
53							Returns the index position of the current data row within the
54							data table. Useful for knowing where you are at within the data
55							table.
56
57							=item feature_type
58
59							Returns one of three specific values describing the contents
60							of the data table inferred by the presence of specific column
61							names. This provides a clue as to whether the table features
62							represent genomic regions (defined by coordinate positions) or
63							named database features. The return values include:
64
65							=over 4
66
67							=item * coordinate: Table includes at least chromosome and start
68
69							=item * named: Table includes name, type, and/or Primary_ID
70
71							=item * unknown: unrecognized
72
73							=back
74
75							=item column_name
76
77							Returns the column name for the given index.
78
79							item data
80
81							Returns the parent L object, in case you may
82							have lost it by going out of scope.
83
84							=back
85
86							=head2 Methods to access row feature attributes
87
88							These methods return the corresponding value, if present in the
89							data table, based on the column header name. If the row represents
90							a named database object, try calling the L method first.
91							This will retrieve the database SeqFeature object, and the attributes
92							can then be retrieved using the methods below or on the actual
93							database SeqFeature object.
94
95							In cases where there is a table column and a corresponding SeqFeature
96							object, for example a start column and a parsed SeqFeature object, the
97							table value takes precedence and is returned. You can always obtain the
98							SeqFeature's value separately and directly.
99
100							These methods do not set attribute values. If you need to change the
101							values in a table, use the L method below.
102
103							=over 4
104
105							=item seq_id
106
107							The name of the chromosome the feature is on.
108
109							=item start
110
111							=item end
112
113							=item stop
114
115							The coordinates of the feature or segment. Coordinates from known
116							0-based file formats, e.g. BED, are returned as 1-based. Coordinates
117							must be integers to be returned. Zero or negative start coordinates
118							are assumed to be accidents or poor programming and transformed to 1.
119							Use the L method if you don't want this to happen.
120
121							=item strand
122
123							The strand of the feature or segment. Returns -1, 0, or 1. Default is 0,
124							or unstranded.
125
126							=item midpoint
127
128							The calculated midpoint position of the feature.
129
130							=item peak
131
132							For features in a C file, this will report the peak coordinate,
133							transformed into a genomic coordinate.
134
135							=item name
136
137							=item display_name
138
139							The name of the feature.
140
141							=item coordinate
142
143							Returns a coordinate string formatted as C. This uses the
144							start coordinate as listed in the source file and does not convert 0-based
145							start values to 1-based values. This may confound downstream applications.
146
147							=item type
148
149							The type of feature. Typically either C or C.
150							In a GFF3 file, this represents columns 3 and 2, respectively. In annotation
151							databases such as L, the type is used to restrict
152							to one of many different types of features, e.g. gene, mRNA, or exon.
153
154							=item id
155
156							=item primary_id
157
158							Here, this represents the C in the database. Note that this number
159							is generally unique to a specific database, and not portable between databases.
160
161							=item length
162
163							The length of the feature or segment.
164
165							=item score
166
167							Returns the value of the Score column, if one is available. Typically
168							associated with defined file formats, such as GFF files (6th column),
169							BED and related Peak files (5th column), and bedGraph (4th column).
170
171							=back
172
173							=head2 Accessing and setting values in the row.
174
175							=over 4
176
177							=item value
178
179							# retrieve a value
180							my $v = $row->value($index);
181							# set a value
182							$row->value($index, $v + 1);
183
184							Returns or sets the value at a specific column index in the
185							current data row. Null values return a '.', symbolizing an
186							internal null value.
187
188							=item row_values
189
190							Returns an array or array reference representing all the values
191							in the current data row.
192
193							=back
194
195							=head2 Special feature attributes
196
197							GFF and VCF files have special attributes in the form of key =E value pairs.
198							These are stored as specially formatted, character-delimited lists in
199							certain columns. These methods will parse this information and return as
200							a convenient hash reference. The keys and values of this hash may be
201							changed, deleted, or added to as desired. To write the changes back to
202							the file, use the L to properly write the attributes
203							back to the file with the proper formatting.
204
205							=over 4
206
207							=item attributes
208
209							Generic method that calls either L or L
210							depending on the data table format.
211
212							=item gff_attributes
213
214							Parses the 9th column of GFF files. URL-escaped characters are converted
215							back to text. Returns a hash reference of key =E value pairs.
216
217							=item vcf_attributes
218
219							Parses the C (8th column) and all sample columns (10th and higher
220							columns) in a version 4 VCF file. The Sample columns use the C
221							column (9th column) as keys. The returned hash reference has two levels:
222							The first level keys are both the column names and index (0-based). The
223							second level keys are the individual attribute keys to each value.
224							For example:
225
226							my $attr = $row->vcf_attributes;
227
228							# access by column name
229							my $genotype = $attr->{sample1}{GT};
230							my $depth = $attr->{INFO}{ADP};
231
232							# access by 0-based column index
233							my $genotype = $attr->{9}{GT};
234							my $depth = $attr->{7}{ADP}
235
236							=item rewrite_attributes
237
238							Generic method that either calls L or
239							L depending on the data table format.
240
241							=item rewrite_gff_attributes
242
243							Rewrites the GFF attributes column (the 9th column) based on the
244							contents of the attributes hash that was previously generated with
245							the L method. Useful when you have modified the
246							contents of the attributes hash.
247
248							=item rewrite_vcf_attributes
249
250							Rewrite the VCF attributes for the C (8th column), C (9th
251							column), and sample columns (10th and higher columns) based on the
252							contents of the attributes hash that was previously generated with
253							the L method. Useful when you have modified the
254							contents of the attributes hash.
255
256							=back
257
258							=head2 Convenience Methods to database functions
259
260							The next three functions are convenience methods for using the
261							attributes in the current data row to interact with databases.
262							They are wrappers to methods in the L
263							module.
264
265							=over 4
266
267							=item seqfeature
268
269							=item feature
270
271							Returns a SeqFeature object representing the feature or item in
272							the current row. If the SeqFeature object is stored in the parent
273							C<$Data> object (usually from parsing an annotation file), it is
274							immediately returned. Otherwise, the SeqFeature
275							object is retrieved from the database using the name and
276							type values in the current Data table row. The SeqFeature object
277							is requested from the database named in the general metadata. If
278							an alternate database is desired, you should change it first using
279							the C<$Data>-Edatabase() method. If the feature name or type is not
280							present in the table, then nothing is returned.
281
282							See L and L for more
283							information about working with these objects. See L
284							about working with database features.
285
286							This method normally only works with "named" feature types in a
287							L Data table. If your Data table has coordinate
288							information, i.e. chromosome, start, and stop columns, then it will
289							likely be recognized as a "coordinate" feature_type and not work.
290
291							Pass a true value to this method to force the seqfeature lookup. This
292							will still require the presence of Name, ID, and/or Type columns to
293							perform the database lookup. The L method feature()
294							is used to determine the type if a Type column is not present.
295
296							=item segment
297
298							Returns a database Segment object corresponding to the coordinates
299							defined in the Data table row. If a named feature and type are
300							present instead of coordinates, then the feature is first automatically
301							retrieved and a Segment returned based on its coordinates. The
302							database named in the general metadata is used to establish the
303							Segment object. If a different database is desired, it should be
304							changed first using the general L method.
305
306							See L and L for more information
307							about working with Segment objects.
308
309							=item get_features
310
311							my @overlap_features = $row->get_features(type => $type);
312
313							Returns seqfeature objects from a database that overlap the Feature
314							or interval in the current Data table row. This is essentially a
315							convenience wrapper for a Bio::DB style I method using the
316							coordinates of the Feature. Optionally pass an array of key value pairs
317							to specify alternate coordinates if so desired. Potential keys
318							include
319
320							=over 4
321
322							=item seq_id
323
324							=item start
325
326							=item end
327
328							=item type
329
330							The type of database features to retrieve.
331
332							=item db
333
334							An alternate database object to collect from.
335
336							=back
337
338							=item get_sequence
339
340							Fetches genomic sequence based on the coordinates of the current seqfeature
341							or interval in the current Feature. This requires a database that
342							contains the genomic sequence, either the database specified in the
343							Data table metadata or an external indexed genomic fasta file.
344
345							If the Feature represents a transcript or gene, then a concatenated
346							sequence of the selected subfeatures may be generated and returned. B
347							that redundant or overlapping subfeatures are B merged, and
348							unexpected results may be obtained.
349
350							The sequence is returned as simple string. If the feature is on the reverse
351							strand, then the reverse complement sequence is automatically returned.
352
353							Pass an array of key value pairs to specify alternate coordinates if so
354							desired. Potential keys include
355
356							=over 4
357
358							=item subfeature
359
360							Pass a text string representing the type of subfeature from which to collect
361							the sequence. Acceptable values include
362
363							=over 4
364
365							=item * exon
366
367							=item * cds
368
369							=item * 5p_utr
370
371							=item * 3p_utr
372
373							=item * intron
374
375							=back
376
377							=item seq_id
378
379							=item start
380
381							=item end
382
383							=item strand
384
385							=item extend
386
387							Indicate additional basepairs of sequence added to both sides
388
389							=item db
390
391							The fasta file or database from which to fetch the sequence
392
393							=back
394
395							=back
396
397							=head2 Data collection
398
399							The following methods allow for data collection from various
400							sources, including bam, bigwig, bigbed, useq, Bio::DB databases, etc.
401
402							=over 4
403
404							=item calculate_reference($position)
405
406							Calculates and returns the absolute genomic coordinate for a relative
407							reference position taking into account feature orientation (strand).
408							This is not explicitly data collection, but often used in conjunction
409							with such. Provide an integer representing the relative position point:
410
411							=over 4
412
413							=item * C<3> representing C<3'> end coordinate
414
415							=item * C<4> representing mid point coordinate
416
417							=item * C<5> representing C<5'> end coordinate
418
419							=item * C<9> representing peak summit in F formatted files
420
421							=back
422
423							If necessary, an array or array reference may be provided as an
424							alternative parameter with keys including C, C,
425							C, and C if alternate or adjusted
426							coordinates should be used instead of the given row feature coordinates.
427
428							=item get_score
429
430							my $score = $row->get_score(
431							dataset => 'scores.bw',
432							method => 'max',
433							);
434
435							This method collects a single score over the feature or interval.
436							Usually a mathematical or statistical value is employed to derive the
437							single score. Pass an array of key value pairs to control data collection.
438							Keys include the following:
439
440							=over 4
441
442							=item db
443
444							=item ddb
445
446							Specify a Bio::DB database from which to collect the data. The default
447							value is the database specified in the Data table metadata, if present.
448							Examples include a L or L
449							database.
450
451							=item dataset
452
453							Specify the name of the dataset. If a database was specified, then this
454							value would be the C or C feature found in the
455							database. Otherwise, the name of a data file, such as a bam, bigWig,
456							bigBed, or USeq file, would be provided here. This options is required!
457
458							=item method
459
460							Specify the mathematical or statistical method combining multiple scores
461							over the interval into one value. Options include the following:
462
463							=over 4
464
465							=item * mean
466
467							=item * sum
468
469							=item * min
470
471							=item * max
472
473							=item * median
474
475							=item * count
476
477							Count all overlapping items.
478
479							=item * pcount
480
481							Precisely count only containing (not overlapping) items.
482
483							=item * ncount
484
485							Count overlapping unique names only.
486
487							=item * range
488
489							The difference between minimum and maximum values.
490
491							=item * stddev
492
493							Standard deviation.
494
495							=back
496
497							=item strandedness
498
499							Specify what strand from which the data should be taken, with respect
500							to the Feature strand. Three options are available. Only really relevant
501							for data sources that support strand.
502
503							=over 4
504
505							=item * sense
506
507							The same strand as the Feature.
508
509							=item * antisense
510
511							The opposite strand as the Feature.
512
513							=item * all
514
515							Strand is ignored, all is taken (default).
516
517							=back
518
519							=item subfeature
520
521							Specify the subfeature type from which to collect the scores. Typically
522							a SeqFeature object representing a transcript is provided, and the
523							indicated subfeatures are collected from object. Pass the name of the
524							subfeature to use. Accepted values include the following.
525
526							=over 4
527
528							=item * exon
529
530							=item * cds
531
532							=item * 5p_utr
533
534							=item * 3p_utr
535
536							=item * intron
537
538							=back
539
540							=item extend
541
542							Specify the number of basepairs that the Data table Feature's
543							coordinates should be extended in both directions. Ignored
544							when used with the subfeature option.
545
546							=item seq_id
547
548							=item chromo
549
550							=item start
551
552							=item end
553
554							=item stop
555
556							=item strand
557
558							Optionally specify zero or more alternate coordinates to use.
559							By default, these are obtained from the Data table Feature.
560
561							=back
562
563							=item get_relative_point_position_scores
564
565							while (my $row = $stream->next_row) {
566							my $pos2score = $row->get_relative_point_position_scores(
567							'ddb' => '/path/to/BigWigSet/',
568							'dataset' => 'MyData',
569							'position' => 5,
570							'extend' => 1000,
571							);
572							}
573
574							This method collects indexed position scores centered around a
575							specific reference point. The returned data is a hash of
576							relative positions (example -20, -10, 1, 10, 20) and their score
577							values. Pass an array of key value pairs to control data collection.
578							Keys include the following:
579
580							=over 4
581
582							=item db
583
584							=item ddb
585
586							Specify a Bio::DB database from which to collect the data. The default
587							value is the database specified in the Data table metadata, if present.
588							Examples include a L or L
589							database.
590
591							=item dataset
592
593							Specify the name of the dataset. If a database was specified, then this
594							value would be the C or C feature found in the
595							database. Otherwise, the name of a data file, such as a bam, bigWig,
596							bigBed, or USeq file, would be provided here. This options is required!
597
598							=item position
599
600							Indicate the position of the reference point relative to the Data table
601							Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
602							midpoint (get it? it's between 5 and 3). Default is 5.
603
604							=item extend
605
606							Indicate the number of base pairs to extend from the reference coordinate.
607							This option is required!
608
609							=item coordinate
610
611							Optionally provide the real chromosomal coordinate as the reference point.
612
613							=item absolute
614
615							Boolean option to indicate that the returned hash of positions and scores
616							should not be transformed into relative positions but kept as absolute
617							chromosomal coordinates.
618
619							=item avoid
620
621							Provide a C or C database feature type to avoid overlapping
622							scores. Each found score is checked for overlapping features and is
623							discarded if found to do so. The database should be set to use this.
624
625							=item strandedness
626
627							Specify what strand from which the data should be taken, with respect
628							to the Feature strand. Three options are available. Only really relevant
629							for data sources that support strand.
630
631							=over 4
632
633							=item * sense
634
635							The same strand as the Feature.
636
637							=item * antisense
638
639							The opposite strand as the Feature.
640
641							=item * all
642
643							Strand is ignored, all is taken (default).
644
645							=back
646
647							=item method
648
649							Only required when counting objects.
650
651							=over 4
652
653							=item * count
654
655							Count all overlapping items.
656
657							=item * pcount
658
659							Precisely count only containing (not overlapping) items.
660
661							=item * ncount
662
663							Count overlapping unique names only.
664
665							=back
666
667							=back
668
669							=item get_region_position_scores
670
671							while (my $row = $stream->next_row) {
672							my $pos2score = $row->get_relative_point_position_scores(
673							'ddb' => '/path/to/BigWigSet/',
674							'dataset' => 'MyData',
675							'position' => 5,
676							'extend' => 1000,
677							);
678							}
679
680							This method collects indexed position scores across a defined
681							region or interval. The returned data is a hash of positions and
682							their score values. The positions are by default relative to a
683							region coordinate, usually to the 5' end. Pass an array of key value
684							pairs to control data collection. Keys include the following:
685
686							=over 4
687
688							=item db
689
690							=item ddb
691
692							Specify a Bio::DB database from which to collect the data. The default
693							value is the database specified in the Data table metadata, if present.
694							Examples include a L or L
695							database.
696
697							=item dataset
698
699							Specify the name of the dataset. If a database was specified, then this
700							value would be the C or C feature found in the
701							database. Otherwise, the name of a data file, such as a bam, bigWig,
702							bigBed, or USeq file, would be provided here. This options is required!
703
704							=item subfeature
705
706							Specify the subfeature type from which to collect the scores. Typically
707							a SeqFeature object representing a transcript is provided, and the
708							indicated subfeatures are collected from object. When converting to
709							relative coordinates, the coordinates will be relative to the length of
710							the sum of the subfeatures, i.e. the length of the introns will be ignored.
711
712							Pass the name of the subfeature to use. Accepted values include the following.
713
714							=over 4
715
716							=item * exon
717
718							=item * cds
719
720							=item * 5p_utr
721
722							=item * 3p_utr
723
724							=item * intron
725
726							=back
727
728							=item extend
729
730							Specify the number of basepairs that the Data table Feature's
731							coordinates should be extended in both directions.
732
733							=item seq_id
734
735							=item chromo
736
737							=item start
738
739							=item end
740
741							=item stop
742
743							=item strand
744
745							Optionally specify zero or more alternate coordinates to use.
746							By default, these are obtained from the Data table Feature.
747
748							=item position
749
750							Indicate the position of the reference point relative to the Data table
751							Feature. 5 is the 5' coordinate, 3 is the 3' coordinate, and 4 is the
752							midpoint (get it? it's between 5 and 3). Default is 5.
753
754							=item coordinate
755
756							Optionally provide the real chromosomal coordinate as the reference point.
757
758							=item absolute
759
760							Boolean option to indicate that the returned hash of positions and scores
761							should not be transformed into relative positions but kept as absolute
762							chromosomal coordinates.
763
764							=item avoid
765
766							Provide a C or C database feature type to avoid overlapping
767							scores. Each found score is checked for overlapping features and is
768							discarded if found to do so. The database should be set to use this.
769
770							=item strandedness
771
772							Specify what strand from which the data should be taken, with respect
773							to the Feature strand. Three options are available. Only really relevant
774							for data sources that support strand.
775
776							=over 4
777
778							=item * sense
779
780							The same strand as the Feature.
781
782							=item * antisense
783
784							The opposite strand as the Feature.
785
786							=item * all
787
788							Strand is ignored, all is taken (default).
789
790							=back
791
792							=item method
793
794							Only required when counting objects.
795
796							=over 4
797
798							=item * count
799
800							Count all overlapping items.
801
802							=item * pcount
803
804							Precisely count only containing (not overlapping) items.
805
806							=item * ncount
807
808							Count overlapping unique names only.
809
810							=back
811
812							=back
813
814							=item fetch_alignments
815
816							my $sam = $Data->open_database('/path/to/file.bam');
817							my $alignment_data = { mapq => [] };
818							my $callback = sub {
819							my ($a, $data) = @_;
820							push @{ $data->{mapq} }, $a->qual;
821							};
822							while (my $row = $stream->next_row) {
823							$row->fetch_alignments(
824							'db' => $sam,
825							'data' => $alignment_data,
826							'callback' => $callback,
827							);
828							}
829
830							This function allows you to iterate over alignments in a Bam file,
831							allowing custom information to be collected based on a callback
832							code reference that is provided.
833
834							Three parameters are required: C, C, and C. A
835							true value (1) is returned upon success.
836
837							=over 4
838
839							=item db
840
841							Provide an opened, high-level, Bam database object.
842
843							=item callback
844
845							Provide a code callback reference to use when iterating over the
846							alignments. Two objects are passed to this code function: the
847							alignment object and the data structure that is provided. See the
848							Bam adapter documentation for details on low-level C through
849							the Bam index object for details.
850
851							=item data
852
853							This is a reference to a C data object for storing information.
854							It is passed to the callback function along with the alignment. Three
855							new key =E value pairs are automatically added: C, C,
856							and C. These correspond to the values for the current queried
857							interval. Coordinates are automatically transformed to 0-base coordinate
858							system to match low level alignment objects.
859
860							=item subfeature
861
862							If the feature has subfeatures, such as exons, introns, etc., pass
863							the name of the subfeature to restrict iteration only over the
864							indicated subfeatures. The C object will inherit the coordinates
865							for each subfeatures. Allowed subfeatures include the following:
866
867							=over 4
868
869							=item * exon
870
871							=item * cds
872
873							=item * 5p_utr
874
875							=item * 3p_utr
876
877							=item * intron
878
879							=back
880
881							=item start
882
883							=item stop
884
885							=item end
886
887							Provide alternate, custom start and stop coordinates for the row
888							feature. Ignored with subfeatures.
889
890							=back
891
892							=back
893
894							=head2 Feature Export
895
896							These methods allow the feature to be exported in industry standard
897							formats, including the BED format and the GFF format. Both methods
898							return a formatted tab-delimited text string suitable for printing to
899							file. The string does not include a line ending character.
900
901							These methods rely on coordinates being present in the source table.
902							If the row feature represents a database item, the L method
903							should be called prior to these methods, allowing the feature to be
904							retrieved from the database and coordinates obtained.
905
906							=over 4
907
908							=item bed_string
909
910							Returns a BED formatted string. By default, a 6-element string is
911							generated, unless otherwise specified. Pass an array of key values
912							to control how the string is generated. The following arguments
913							are supported.
914
915							=over 4
916
917							=item bed
918
919							Specify the number of BED elements to include. The number of elements
920							correspond to the number of columns in the BED file specification. A
921							minimum of 3 (chromosome, start, stop) is required, and maximum of 6
922							is allowed (chromosome, start, stop, name, score, strand).
923
924							=item chromo
925
926							=item seq_id
927
928							Provide a text string of an alternative chromosome or sequence name.
929
930							=item start
931
932							=item stop
933
934							=item end
935
936							Provide alternative integers for the start and stop coordinates.
937							Note that start values are automatically converted to 0-base
938							by subtracting 1.
939
940							=item strand
941
942							Provide alternate an alternative strand value.
943
944							=item name
945
946							Provide an alternate or missing name value to be used as text in the 4th
947							column. If no name is provided or available, a default name is generated.
948
949							=item score
950
951							Provide a numerical value to be included as the score. BED files typically
952							use integer values ranging from 1..1000.
953
954							=back
955
956							=item gff_string
957
958							Returns a GFF3 formatted string. Pass an array of key values
959							to control how the string is generated. The following arguments
960							are supported.
961
962							=over 4
963
964							=item chromo
965
966							=item seq_id
967
968							=item start
969
970							=item stop
971
972							=item end
973
974							=item strand
975
976							Provide alternate values from those defined or missing in the current
977							row Feature.
978
979							=item source
980
981							Provide a text string to be used as the source_tag value in the 2nd
982							column. The default value is null ".".
983
984							=item primary_tag
985
986							Provide a text string to be used as the primary_tag value in the 3rd
987							column. The default value is null ".".
988
989							=item type
990
991							Provide a text string. This can be either a "primary_tag:source_tag" value
992							as used by GFF based BioPerl databases, or "primary_tag" alone.
993
994							=item score
995
996							Provide a numerical value to be included as the score. The default
997							value is null ".".
998
999							=item name
1000
1001							Provide alternate or missing name value to be used as the display_name.
1002							If no name is provided or available, a default name is generated.
1003
1004							=item attributes
1005
1006							Provide an anonymous array reference of one or more row Feature indices
1007							to be used as GFF attributes. The name of the column is used as the GFF
1008							attribute key.
1009
1010							=back
1011
1012							=back
1013
1014							=cut
1015
1016	3			3		16	use strict;
	3					4
	3					87
1017	3			3		12	use Carp qw(carp cluck croak confess);
	3					5
	3					180
1018	3			3		13	use Module::Load;
	3					17
	3					22
1019	3					145	use Bio::ToolBox::db_helper qw(
1020							get_db_feature
1021							get_segment_score
1022							calculate_score
1023							get_genomic_sequence
1024							low_level_bam_fetch
1025	3			3		134	);
	3					6
1026	3			3		13	use Bio::ToolBox::db_helper::constants;
	3					6
	3					19848
1027
1028							my $GENETOOL_LOADED = 0;
1029							1;
1030
1031
1032
1033							### Initialization
1034
1035							sub new {
1036							# this should ONLY be called from Bio::ToolBox::Data* iterators
1037	353			353	0	374	my $class = shift;
1038	353					581	my %self = @_;
1039							# we trust that new is called properly with data and index values
1040	353					776	return bless \%self, $class;
1041							}
1042
1043
1044							### Set and retrieve values
1045
1046							sub data {
1047	0			0	1	0	return shift->{data};
1048							}
1049
1050							sub column_name {
1051	0			0	1	0	my ($self, $column) = @_;
1052	0	0				0	return unless defined $column;
1053	0					0	return $self->{data}->name($column);
1054							}
1055
1056							sub feature_type {
1057	24			24	1	30	my $self = shift;
1058	24	50				38	carp "feature_type is a read only method" if @_;
1059	24					60	return $self->{data}->feature_type;
1060							}
1061
1062							sub row_index {
1063	65			65	1	63	my $self = shift;
1064	65	50				100	carp "row_index is a read only method" if @_;
1065	65					137	return $self->{'index'};
1066							}
1067
1068							sub line_number {
1069	0			0	0	0	my $self = shift;
1070	0	0				0	carp "line_number is a read only method" if @_;
1071	0	0				0	if (exists $self->{data}->{line_count}) {
		0
1072	0					0	return $self->{data}->{line_count};
1073							}
1074							elsif (exists $self->{data}->{header_line_count}) {
1075	0					0	return $self->{data}->{header_line_count} + $self->row_index;
1076							}
1077							else {
1078	0					0	return;
1079							}
1080							}
1081
1082							sub row_values {
1083	196			196	1	252	my $self = shift;
1084	196	50				242	carp "row_values is a read only method" if @_;
1085	196					197	my $row = $self->{'index'};
1086	196					174	my @data = @{ $self->{data}->{data_table}->[$row] };
	196					373
1087	196	100				486	return wantarray ? @data : \@data;
1088							}
1089
1090							sub value {
1091	341			341	1	1378	my ($self, $column, $value) = @_;
1092	341	50				455	return unless defined $column;
1093	341					382	my $row = $self->{'index'};
1094
1095	341	100				406	if (defined $value) {
1096							# set a value
1097	85					104	$self->{data}->{data_table}->[$row][$column] = $value;
1098							}
1099	341					473	my $v = $self->{data}->{data_table}->[$row][$column];
1100	341	100				721	return length($v) ? $v : '.'; # internal null value, inherited from GFF definition
1101							}
1102
1103							sub seq_id {
1104	21			21	1	300	my $self = shift;
1105	21	100				44	if ($_[0]) {
1106	1					4	my $i = $self->{data}->chromo_column;
1107							# update only if we have an actual column
1108	1	50				3	if (defined $i) {
		0
1109	1					4	$self->value($i, $_[0]);
1110	1					2	$self->{seqid} = $_[0];
1111							}
1112							elsif (exists $self->{feature}) {
1113	0					0	carp "Unable to update seq_id for parsed SeqFeature objects";
1114							}
1115							else {
1116	0					0	carp "No Chromosome column to update!";
1117							}
1118							}
1119
1120							# return cached value
1121	21	100				71	return $self->{seqid} if exists $self->{seqid};
1122
1123							# get seq_id value
1124	6					10	my $c;
1125	6					19	my $i = $self->{data}->chromo_column;
1126	6					30	my $j = $self->{data}->coord_column;
1127	6	50				23	if (defined $i) {
		0
		0
		0
1128							# collect from table
1129	6					15	$c = $self->value($i);
1130							}
1131							elsif (exists $self->{feature}) {
1132							# seqfeature
1133	0					0	$c = $self->{feature}->seq_id;
1134							}
1135							elsif (defined $j) {
1136	0					0	$self->_from_coordinate_string($j);
1137	0					0	return $self->{seqid};
1138							}
1139							elsif ($self->feature_type eq 'named') {
1140	0					0	my $f = $self->seqfeature;
1141	0	0				0	if ($f) {
1142	0					0	$c = $f->seq_id;
1143							}
1144							}
1145	6	50				22	$self->{seqid} = $c if defined $c;
1146	6					18	return $c;
1147							}
1148
1149							sub start {
1150	119			119	1	1038	my $self = shift;
1151	119	100				179	if ($_[0]) {
1152	5					16	my $i = $self->{data}->start_column;
1153							# update only if we have an actual column
1154	5	50				34	my $d = $_[0] =~ /^\d+$/ ? 1 : 0; # looks like an integer
1155	5	100	66			25	if (defined $i and $d) {
		50
		50
1156	1	50				4	if ($self->{data}->interbase) {
1157							# compensate for 0-based, assuming we're always working with 1-based
1158	1					4	$self->value($i, $_[0] - 1);
1159	1					2	$self->{start} = $_[0];
1160							}
1161							else {
1162	0					0	$self->value($i, $_[0]);
1163	0					0	$self->{start} = $_[0];
1164							}
1165							}
1166							elsif (not $d) {
1167	0					0	carp "Start coordinate value is not an integer";
1168							}
1169							elsif (exists $self->{feature}) {
1170	4					480	carp "Unable to update Start coordinate for parsed SeqFeature objects";
1171							}
1172							else {
1173	0					0	carp "No Start coordinate column to update!";
1174							}
1175							}
1176
1177							# return cached value
1178	119	100				496	return $self->{start} if exists $self->{start};
1179
1180							# get start value
1181	92					85	my $s;
1182	92					157	my $i = $self->{data}->start_column;
1183	92					168	my $j = $self->{data}->coord_column;
1184	92	100				130	if (defined $i) {
		50
		0
		0
1185							# collect from table
1186	87					107	$s = $self->value($i);
1187	87	100				150	if ($self->{data}->interbase) {
1188							# compensate for 0-based index
1189	6					18	$s += 1;
1190							}
1191							}
1192							elsif (exists $self->{feature}) {
1193							# seqfeature
1194	5					17	$s = $self->{feature}->start;
1195							}
1196							elsif (defined $j) {
1197	0					0	$self->_from_coordinate_string($j);
1198	0					0	return $self->{start};
1199							}
1200							elsif ($self->feature_type eq 'named') {
1201	0					0	my $f = $self->seqfeature;
1202	0	0				0	if ($f) {
1203	0					0	$s = $f->start;
1204							}
1205							}
1206	92	50				149	$self->{start} = $s if defined $s;
1207	92					194	return $s;
1208							}
1209
1210							*stop = \&end;
1211							sub end {
1212	38			38	1	66	my $self = shift;
1213	38	100				73	if ($_[0]) {
1214	2					8	my $i = $self->{data}->stop_column;
1215							# update only if we have an actual column
1216	2	50				14	my $d = $_[0] =~ /^\d+$/ ? 1 : 0;
1217	2	50	33			10	if (defined $i and $d) {
		0
		0
1218	2					7	$self->value($i, $_[0]);
1219	2					3	$self->{end} = $_[0];
1220							}
1221							elsif (not $d) {
1222	0					0	carp "End coordinate value is not an integer";
1223							}
1224							elsif (exists $self->{feature}) {
1225	0					0	carp "Unable to update End coordinate for parsed SeqFeature objects";
1226							}
1227							else {
1228	0					0	carp "No End coordinate column to update!";
1229							}
1230							}
1231
1232							# return cached value
1233	38	100				113	return $self->{end} if exists $self->{end};
1234
1235							# get end value
1236	11					13	my $e;
1237	11					43	my $i = $self->{data}->stop_column;
1238	11					27	my $j = $self->{data}->coord_column;
1239	11	100				29	if (defined $i) {
		50
		0
		0
1240							# collect from table
1241	10					23	$e = $self->value($i);
1242							}
1243							elsif (exists $self->{feature}) {
1244							# seqfeature
1245	1					4	$e = $self->{feature}->end;
1246							}
1247							elsif (defined $j) {
1248	0					0	$self->_from_coordinate_string($j);
1249	0					0	return $self->{end};
1250							}
1251							elsif ($self->feature_type eq 'named') {
1252	0					0	my $f = $self->seqfeature;
1253	0	0				0	if ($f) {
1254	0					0	$e = $f->end;
1255							}
1256							}
1257	11	50				35	$self->{end} = $e if defined $e;
1258	11					32	return $e;
1259							}
1260
1261							sub strand {
1262	30			30	1	49	my $self = shift;
1263	30	100				55	if ($_[0]) {
1264							# update only if we have an actual column
1265	1					5	my $i = $self->{data}->strand_column;
1266	1	50				4	if (defined $i) {
		50
1267	0					0	$self->value($i, $_[0]);
1268	0					0	$self->{strand} = $self->_strand($_[0]);
1269							}
1270							elsif (exists $self->{feature}) {
1271	0					0	carp "Unable to update Strand for parsed SeqFeature objects";
1272							}
1273							else {
1274	1					123	carp "No Strand column to update!";
1275							}
1276							}
1277
1278							# return cached value
1279	30	100				161	return $self->{strand} if exists $self->{strand};
1280
1281							# get strand value
1282	8					10	my $s = 0;
1283	8	100				36	if (my $i = $self->{data}->strand_column) {
		100
		50
1284							# collect from table
1285	2					6	$s = $self->_strand( $self->value($i) );
1286							}
1287							elsif (exists $self->{feature}) {
1288							# seqfeature
1289	5					15	$s = $self->{feature}->strand;
1290							}
1291							elsif ($self->feature_type eq 'named') {
1292	0					0	my $f = $self->seqfeature;
1293	0	0				0	if ($f) {
1294	0					0	$s = $f->strand;
1295							}
1296							}
1297	8					15	$self->{strand} = $s;
1298	8					23	return $s;
1299							}
1300
1301							sub _strand {
1302	2			2		4	my $self = shift;
1303	2					3	my $str = shift;
1304	2	50	33			31	if ($str eq '1' or $str eq '-1' or $str eq '0') {
		50	33
		50
		0
		0
		0
1305	0					0	return $str;
1306							}
1307							elsif ($str eq '+') {
1308	0					0	return 1;
1309							}
1310							elsif ($str eq '-') {
1311	2					6	return -1;
1312							}
1313							elsif ($str eq '.') {
1314	0					0	return 0;
1315							}
1316							elsif ($str eq '2') {
1317							# R packages use 1 for + and 2 for -
1318	0					0	return -1;
1319							}
1320							elsif ($str eq '*') {
1321							# R packages use * for .
1322	0					0	return 0;
1323							}
1324							else {
1325	0					0	return 0;
1326							}
1327							}
1328
1329							sub _from_coordinate_string {
1330	0			0		0	my ($self, $i) = shift;
1331	0					0	my ($chr, $start, $end, $str) = split /(?:\-\|\.\.\|\s)/, $self->value($i);
1332	0	0				0	$self->{seqid} = $chr unless exists $self->{seqid};
1333	0	0				0	$self->{start} = $start unless exists $self->{start};
1334							# we assume this is a 1-based coordinate
1335	0	0				0	$self->{end} = $end unless exists $self->{end};
1336	0	0	0			0	if (defined $str and not exists $self->{strand}) {
1337							# you never know, the strand may be added to the string
1338	0					0	$self->{strand} = $self->_strand($str);
1339							}
1340							}
1341
1342							sub peak {
1343	4			4	1	305	my $self = shift;
1344	4	100				15	if ($self->{data}->format eq 'narrowPeak') {
1345	3	100	66			15	if (exists $self->{feature} and $self->{feature}->has_tag('peak')) {
1346	2					6	return $self->{feature}->get_tag_values('peak') + $self->{feature}->start;
1347							}
1348							else {
1349	1					4	return $self->value(1) + $self->value(9) + 1;
1350							}
1351							}
1352							else {
1353	1					3	return $self->midpoint;
1354							}
1355							}
1356
1357							sub midpoint {
1358	5			5	1	10	my $self = shift;
1359	5					14	my $s = $self->start;
1360	5					15	my $e = $self->end;
1361	5	50	33			19	if ($s and $e) {
1362	5					28	return int( ($s + $e) / 2 );
1363							}
1364							else {
1365	0					0	return undef;
1366							}
1367							}
1368
1369							*name = \&display_name;
1370							sub display_name {
1371	76			76	1	1246	my $self = shift;
1372	76	100				113	if ($_[0]) {
1373							# update only if we have an actual column
1374	1					4	my $i = $self->{data}->name_column;
1375	1	50				4	if (defined $i) {
		0
1376	1					4	return $self->value($i, $_[0]);
1377							}
1378							elsif (exists $self->{feature}) {
1379	0					0	carp "Unable to update display_name for parsed SeqFeature objects";
1380							}
1381							else {
1382	0					0	carp "No Name column to update!";
1383							}
1384							}
1385							# seqfeature
1386	75	100				140	if (exists $self->{feature}) {
1387	5					17	return $self->{feature}->display_name;
1388							}
1389							# collect from table
1390	70					146	my $i = $self->{data}->name_column;
1391	70	50				99	if (defined $i) {
		0
1392	70					107	return $self->value($i);
1393							}
1394							elsif (my $att = $self->gff_attributes) {
1395							return $att->{Name} \|\| $att->{ID} \|\| $att->{transcript_name} \|\|
1396	0		0			0	$att->{gene_name} \|\| undef;
1397							}
1398							}
1399
1400							sub coordinate {
1401	3			3	1	7	my $self = shift;
1402	3	50				9	carp "name is a read only method" if @_;
1403							# to avoid auto-converting start0 coordinates, which might confuse people or programs,
1404							# we will take the start value as is when it's available, otherwise calculate start
1405	3					12	my $start_i = $self->{data}->start_column;
1406							my $coord = sprintf("%s:%d", $self->seq_id,
1407							defined $start_i ? $self->value($start_i) :
1408	3	0				10	exists $self->{feature} ? $self->{feature}->start : 0);
		50
1409	3					8	my $end = $self->end;
1410	3	50				10	$coord .= "-$end" if $end;
1411	3	50				16	return CORE::length($coord) > 2 ? $coord : undef;
1412							}
1413
1414							sub type {
1415	9			9	1	16	my $self = shift;
1416	9	100				24	if ($_[0]) {
1417							# update only if we have an actual column
1418	1					4	my $i = $self->{data}->type_column;
1419	1	50				6	if (defined $i) {
		50
1420	0					0	return $self->value($i, $_[0]);
1421							}
1422							elsif (exists $self->{feature}) {
1423	0					0	carp "Unable to update primary_tag for parsed SeqFeature objects";
1424							}
1425							else {
1426	1					52	carp "No Type column to update!";
1427							}
1428							}
1429							# collect from table
1430	9					85	my $i = $self->{data}->type_column;
1431	9	50				18	if (defined $i) {
1432	0					0	return $self->value($i);
1433							}
1434							# seqfeature
1435	9	100				20	if (exists $self->{feature}) {
1436	4					15	return $self->{feature}->primary_tag;
1437							}
1438							# general metadata
1439	5	50				11	if ($self->{data}->feature) {
1440	5					10	return $self->{data}->feature;
1441							}
1442	0					0	return undef;
1443							}
1444
1445							*id = \&primary_id;
1446							sub primary_id {
1447	66			66	1	71	my $self = shift;
1448	66	50				88	carp "id is a read only method" if @_;
1449	66					130	my $i = $self->{data}->id_column;
1450	66	50				118	my $v = $self->value($i) if defined $i;
1451	66	50	33			156	if (defined $v and $v ne '.') {
1452	66					135	return $v;
1453							}
1454	0	0				0	return $self->{feature}->primary_id if exists $self->{feature};
1455	0	0				0	if (my $att = $self->gff_attributes) {
1456	0		0			0	return $att->{ID} \|\| $att->{Name} \|\| $att->{transcript_id};
1457							}
1458	0					0	return undef;
1459							}
1460
1461							sub length {
1462	2			2	1	4	my $self = shift;
1463	2	50				5	carp "length is a read only method" if @_;
1464	2	50				7	if ($self->{data}->vcf) {
1465							# special case for vcf files, measure the length of the ALT allele
1466	0					0	return CORE::length($self->value(4));
1467							}
1468	2					5	my $s = $self->start;
1469	2					4	my $e = $self->end;
1470	2	50	33			9	if (defined $s and defined $e) {
		0
1471	2					7	return $e - $s + 1;
1472							}
1473							elsif (defined $s) {
1474	0					0	return 1;
1475							}
1476							else {
1477	0					0	return undef;
1478							}
1479							}
1480
1481							sub score {
1482	0			0	1	0	my $self = shift;
1483	0					0	my $c = $self->{data}->score_column;
1484	0	0				0	return defined $c ? $self->value($c) : undef;
1485							}
1486
1487							sub attributes {
1488	0			0	1	0	my $self = shift;
1489	0	0				0	return $self->gff_attributes if ($self->{data}->gff);
1490	0	0				0	return $self->vcf_attributes if ($self->{data}->vcf);
1491	0					0	return;
1492							}
1493
1494							sub gff_attributes {
1495	1			1	1	2	my $self = shift;
1496	1	50				5	return unless ($self->{data}->gff);
1497	1	50				4	return $self->{attributes} if (exists $self->{attributes});
1498	1					2	$self->{attributes} = {};
1499	1					3	foreach my $g (split(/\s;\s/, $self->value(8))) {
1500	3					19	my ($tag, $value) = split /\s+\|=/, $g;
1501	3	50	33			9	next unless ($tag and $value);
1502							# unescape URL encoded values, borrowed from Bio::DB::GFF
1503	3					5	$value =~ tr/+/ /;
1504	3					7	$value =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
	11					25
1505	3					9	$self->{attributes}->{$tag} = $value;
1506							}
1507	1					3	return $self->{attributes};
1508							}
1509
1510							sub vcf_attributes {
1511	0			0	1	0	my $self = shift;
1512	0	0				0	return unless ($self->{data}->vcf);
1513	0	0				0	return $self->{attributes} if (exists $self->{attributes});
1514	0					0	$self->{attributes} = {};
1515
1516							# INFO attributes
1517	0					0	my %info;
1518	0	0				0	if ($self->{data}->name(7) eq 'INFO') {
1519	0	0				0	%info = map {$_->[0] => defined $_->[1] ? $_->[1] : undef}
1520							# some tags are simple and have no value, eg SOMATIC
1521	0					0	map { [split(/=/, $_)] }
	0					0
1522							split(/;/, $self->value(7));
1523							}
1524	0					0	$self->{attributes}->{INFO} = \%info;
1525	0					0	$self->{attributes}->{7} = \%info;
1526
1527							# Sample attributes
1528	0	0				0	if ($self->{data}->number_columns > 8) {
1529	0					0	my @formatKeys = split /:/, $self->value(8);
1530	0					0	foreach my $i (9 .. $self->{data}->last_column) {
1531	0					0	my $name = $self->{data}->name($i);
1532	0					0	my @sampleVals = split /:/, $self->value($i);
1533							my %sample = map {
1534	0	0				0	$formatKeys[$_] => defined $sampleVals[$_] ? $sampleVals[$_] : undef }
	0					0
1535							(0 .. $#formatKeys);
1536	0					0	$self->{attributes}->{$name} = \%sample;
1537	0					0	$self->{attributes}->{$i} = \%sample;
1538							}
1539							}
1540	0					0	return $self->{attributes};
1541							}
1542
1543							sub rewrite_attributes {
1544	0			0	1	0	my $self = shift;
1545	0	0				0	return $self->rewrite_gff_attributes if ($self->{data}->gff);
1546	0	0				0	return $self->rewrite_vcf_attributes if ($self->{data}->vcf);
1547	0					0	return;
1548							}
1549
1550							sub rewrite_gff_attributes {
1551	1			1	1	773	my $self = shift;
1552	1	50				3	return unless ($self->{data}->gff);
1553	1	50				4	return unless exists $self->{attributes};
1554	1					3	my @pairs; # of key=value items
1555	1	50				3	if (exists $self->{attributes}{ID}) {
1556							# I assume this does not need to be escaped!
1557	1					3	push @pairs, 'ID=' . $self->{attributes}{ID};
1558							}
1559	1	50				3	if (exists $self->{attributes}{Name}) {
1560	1					3	my $name = $self->{attributes}{Name};
1561	1					3	$name =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0					0
1562	1					2	push @pairs, "Name=$name";
1563							}
1564	1					1	foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes} }) {
	3					8
	1					7
1565	3	100				6	next if $key eq 'ID';
1566	2	100				4	next if $key eq 'Name';
1567	1					2	my $value = $self->{attributes}{$key};
1568	1					3	$key =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0					0
1569	1					3	$value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	8					21
1570	1					4	push @pairs, "$key=$value";
1571							}
1572	1					5	$self->value(8, join("; ", @pairs));
1573	1					4	return 1;
1574							}
1575
1576							sub rewrite_vcf_attributes {
1577	0			0	1	0	my $self = shift;
1578	0	0				0	return unless ($self->{data}->vcf);
1579	0	0				0	return unless exists $self->{attributes};
1580
1581							# INFO
1582							my $info = join(';',
1583							map {
1584							defined $self->{attributes}->{INFO}{$_} ?
1585	0	0				0	join('=', $_, $self->{attributes}->{INFO}{$_}) : $_
1586							}
1587	0					0	sort {$a cmp $b}
1588	0					0	keys %{$self->{attributes}->{INFO}}
	0					0
1589							);
1590	0		0			0	$info \|\|= '.'; # sometimes we have nothing left
1591	0					0	$self->value(7, $info);
1592
1593							# FORMAT
1594	0					0	my @order;
1595	0	0				0	push @order, 'GT' if exists $self->{attributes}{9}{GT};
1596	0					0	foreach my $key (sort {$a cmp $b} keys %{ $self->{attributes}{9} } ) {
	0					0
	0					0
1597	0	0				0	next if $key eq 'GT';
1598	0					0	push @order, $key;
1599							}
1600	0	0				0	if (@order) {
1601	0					0	$self->value(8, join(':', @order));
1602							}
1603							else {
1604	0					0	$self->value(8, '.');
1605							}
1606
1607							# SAMPLES
1608	0					0	foreach my $i (9 .. $self->{data}->last_column) {
1609	0	0				0	if (@order) {
1610							$self->value($i, join(":",
1611	0					0	map { $self->{attributes}{$i}{$_} } @order
	0					0
1612							) );
1613							}
1614							else {
1615	0					0	$self->value($i, '.');
1616							}
1617							}
1618	0					0	return 1;
1619							}
1620
1621							### Data collection convenience methods
1622
1623							*feature = \&seqfeature;
1624							sub seqfeature {
1625	4			4	1	4	my $self = shift;
1626	4		50			11	my $force = shift \|\| 0;
1627	4	50				10	carp "feature is a read only method" if @_;
1628	4	50				10	return $self->{feature} if exists $self->{feature};
1629							# normally this is only for named features in a data table
1630							# skip this for coordinate features like bed files
1631	4	50	33			8	return unless $self->feature_type eq 'named' or $force;
1632
1633							# retrieve from main Data store
1634	0					0	my $f = $self->{data}->get_seqfeature( $self->{'index'} );
1635	0	0				0	if ($f) {
1636	0					0	$self->{feature} = $f;
1637	0					0	return $f;
1638							}
1639
1640							# retrieve the feature from the database
1641	0	0				0	return unless $self->{data}->database;
1642							$f = get_db_feature(
1643							'db' => $self->{data}->open_meta_database,
1644							'id' => $self->id \|\| undef,
1645							'name' => $self->name \|\| undef,
1646							'type' => $self->type \|\| $self->{data}->feature,
1647	0		0			0	);
			0
			0
1648	0	0				0	return unless $f;
1649	0					0	$self->{feature} = $f;
1650	0					0	return $f;
1651							}
1652
1653							sub segment {
1654	1			1	1	2	my $self = shift;
1655	1	50				4	carp "segment is a read only method" if @_;
1656	1	50				5	return unless $self->{data}->database;
1657	1	50				4	if ($self->feature_type eq 'coordinate') {
		0
1658	1					3	my $chromo = $self->seq_id;
1659	1					4	my $start = $self->start;
1660	1		33			5	my $stop = $self->end \|\| $start;
1661	1					4	my $db = $self->{data}->open_meta_database;
1662	1	50				8	return $db ? $db->segment($chromo, $start, $stop) : undef;
1663							}
1664							elsif ($self->feature_type eq 'named') {
1665	0					0	my $f = $self->feature;
1666	0	0				0	return $f ? $f->segment : undef;
1667							}
1668							else {
1669	0					0	return undef;
1670							}
1671							}
1672
1673							sub get_features {
1674	0			0	1	0	my $self = shift;
1675	0					0	my %args = @_;
1676	0		0			0	my $db = $args{db} \|\| $self->{data}->open_meta_database \|\| undef;
1677	0	0				0	carp "no database defined to get features!" unless defined $db;
1678	0	0				0	return unless $db->can('features');
1679
1680							# convert the argument style for most bioperl db APIs
1681	0					0	my %opts;
1682	0		0			0	$opts{-seq_id} = $args{chromo} \|\| $self->seq_id;
1683	0		0			0	$opts{-start} = $args{start} \|\| $self->start;
1684	0		0			0	$opts{-end} = $args{end} \|\| $self->end;
1685	0		0			0	$opts{-type} = $args{type} \|\| $self->type;
1686
1687	0					0	return $db->features(%opts);
1688							}
1689
1690							sub get_sequence {
1691	0			0	1	0	my $self = shift;
1692	0					0	my %args = @_;
1693	0		0			0	my $db = $args{db} \|\| $args{database} \|\| $self->{data}->open_meta_database \|\| undef;
1694							# this will fail immediately if user doesn't provide valid database
1695
1696							# get sequence over subfeatures
1697	0		0			0	$args{subfeature} \|\|= undef;
1698	0	0	0			0	if ($self->feature_type eq 'named' and $args{subfeature}) {
1699							# this is more complicated so we have a dedicated method
1700	0					0	return $self->_get_subfeature_sequence($db, \%args);
1701							}
1702
1703							# get coordinates
1704	0		0			0	my $seqid = $args{seq_id} \|\| $args{chromo} \|\| $self->seq_id;
1705	0		0			0	my $start = $args{start} \|\| $self->start;
1706	0		0			0	my $stop = $args{stop} \|\| $args{end} \|\| $self->end;
1707	0					0	my $strand = $self->strand;
1708	0	0				0	if (exists $args{strand}) {
1709							# user supplied strand, gotta check it
1710	0	0				0	$strand = $args{strand} =~ /\-\|r/i ? -1 : 1;
1711							}
1712	0	0	0			0	if (exists $args{extend} and $args{extend}) {
1713	0					0	$start -= $args{extend};
1714	0	0				0	$start = 1 if $start <= 0;
1715	0					0	$stop += $args{extend};
1716							}
1717	0	0	0			0	return unless (defined $seqid and defined $start and defined $stop);
			0
1718
1719							# retrieve and return sequence
1720	0					0	my $seq = get_genomic_sequence($db, $seqid, $start, $stop);
1721	0	0				0	if ($strand == -1) {
1722	0					0	$seq =~ tr/gatcGATC/ctagCTAG/;
1723	0					0	$seq = reverse $seq;
1724							}
1725	0					0	return $seq;
1726							}
1727
1728							sub _get_subfeature_sequence {
1729	0			0		0	my ($self, $db, $args) = @_;
1730
1731							# get the subfeatures
1732	0					0	my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1733	0	0				0	unless (@$subfeatures) {
1734	0					0	carp "no subfeatures available! Returning parent sequence!";
1735							# just return the parent
1736	0					0	undef $args->{subfeature};
1737	0					0	return $self->get_sequence(@$args);
1738							}
1739
1740							# sort subfeatures
1741							# this should be done by GeneTools in most cases but just to be sure
1742							# note that this does NOT merge redundant or overlapping exons!!!!
1743	0					0	my @sorted = map { $_->[0] }
1744	0	0				0	sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1745	0					0	map { [$_, $_->start, $_->end] }
	0					0
1746							@$subfeatures;
1747
1748							# collect sequence
1749	0					0	my $sequence;
1750	0					0	foreach my $subf (@sorted) {
1751	0					0	my $seq = get_genomic_sequence($db, $subf->seq_id, $subf->start, $subf->stop);
1752	0					0	$sequence .= $seq;
1753							}
1754
1755							# flip the sequence
1756	0	0				0	if ($self->strand == -1) {
1757	0					0	$sequence =~ tr/gatcGATC/ctagCTAG/;
1758	0					0	$sequence = reverse $sequence;
1759							}
1760	0					0	return $sequence;
1761							}
1762
1763							sub _get_subfeatures {
1764	0			0		0	my $self = shift;
1765	0					0	my $subf = lc shift;
1766
1767							# load GeneTools
1768	0	0				0	unless ($GENETOOL_LOADED) {
1769	0					0	load('Bio::ToolBox::GeneTools', qw(get_exons get_cds get_5p_utrs get_3p_utrs
1770							get_introns));
1771	0	0				0	if ($@) {
1772	0					0	croak "missing required modules! $@";
1773							}
1774							else {
1775	0					0	$GENETOOL_LOADED = 1;
1776							}
1777							}
1778
1779							# feature
1780	0					0	my $feature = $self->seqfeature;
1781	0	0				0	return unless ($feature);
1782
1783							# get the subfeatures
1784	0					0	my @subfeatures;
1785	0	0				0	if ($subf eq 'exon') {
		0
		0
		0
		0
1786	0					0	@subfeatures = get_exons($feature);
1787							}
1788							elsif ($subf eq 'cds') {
1789	0					0	@subfeatures = get_cds($feature);
1790							}
1791							elsif ($subf eq '5p_utr') {
1792	0					0	@subfeatures = get_5p_utrs($feature);
1793							}
1794							elsif ($subf eq '3p_utr') {
1795	0					0	@subfeatures = get_3p_utrs($feature);
1796							}
1797							elsif ($subf eq 'intron') {
1798	0					0	@subfeatures = get_introns($feature);
1799							}
1800							else {
1801	0					0	croak "unrecognized subfeature parameter '$subf'!";
1802							}
1803
1804	0					0	return \@subfeatures;
1805							}
1806
1807							sub get_score {
1808	7			7	1	5766	my $self = shift;
1809	7					27	my %args = @_; # passed arguments to this method
1810
1811							# verify the dataset for the user, cannot trust whether it has been done or not
1812	7		50			42	my $db = $args{ddb} \|\| $args{db} \|\| $self->{data}->open_meta_database \|\| undef;
1813	7					35	$args{dataset} = $self->{data}->verify_dataset($args{dataset}, $db);
1814	7	50				17	unless ($args{dataset}) {
1815	0					0	croak "provided dataset was unrecognized format or otherwise could not be verified!";
1816							}
1817
1818							# get positioned scores over subfeatures only
1819	7		50			33	$args{subfeature} \|\|= q();
1820	7	0	33			17	if ($self->feature_type eq 'named' and $args{subfeature}) {
1821							# this is more complicated so we have a dedicated method
1822	0					0	return $self->_get_subfeature_scores($db, \%args);
1823							}
1824
1825							# build parameter array to pass on to the adapter
1826	7					10	my @params;
1827
1828							# verify coordinates based on type of feature
1829	7	50				10	if ($self->feature_type eq 'coordinate') {
		0
1830							# coordinates are already in the table, use those
1831	7		33			23	$params[CHR] = $args{seq_id} \|\| $self->seq_id;
1832	7		33			38	$params[STRT] = $args{start} \|\| $self->start;
1833	7		33			34	$params[STOP] = $args{stop} \|\| $args{end} \|\| $self->end;
1834							$params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1835	7	50	33			28	$self->strand;
1836							}
1837							elsif ($self->feature_type eq 'named') {
1838							# must retrieve feature from the database first
1839	0					0	my $f = $self->seqfeature;
1840	0	0				0	return unless $f;
1841	0		0			0	$params[CHR] = $args{seq_id} \|\| $f->seq_id;
1842	0		0			0	$params[STRT] = $args{start} \|\| $f->start;
1843	0		0			0	$params[STOP] = $args{stop} \|\| $args{end} \|\| $f->end;
1844							$params[STR] = (exists $args{strand} and defined $args{strand}) ? $args{strand} :
1845	0	0	0			0	$f->strand;
1846							}
1847							else {
1848	0					0	croak "data table does not have identifiable coordinate or feature identification columns for score collection";
1849							}
1850
1851							# adjust coordinates as necessary
1852	7	0	33			17	if (exists $args{extend} and $args{extend}) {
1853	0					0	$params[STRT] -= $args{extend};
1854	0					0	$params[STOP] += $args{extend};
1855							}
1856
1857							# check coordinates
1858	7	50				13	$params[STRT] = 1 if $params[STRT] <= 0;
1859	7	50				20	if ($params[STOP] < $params[STRT]) {
1860							# coordinates are flipped, reverse strand
1861	0	0				0	return if ($params[STOP] <= 0);
1862	0					0	my $stop = $params[STRT];
1863	0					0	$params[STRT] = $params[STOP];
1864	0					0	$params[STOP] = $stop;
1865	0					0	$params[STR] = -1;
1866							}
1867	7	50	33			25	return unless ($params[CHR] and defined $params[STRT]);
1868
1869							# score attributes
1870	7		50			16	$params[METH] = $args{'method'} \|\| 'mean';
1871	7		100			32	$params[STND] = $args{strandedness} \|\| $args{stranded} \|\| 'all';
1872
1873							# other parameters
1874	7					8	$params[DB] = $db;
1875	7					8	$params[RETT] = 0; # return type should be a calculated value
1876	7					8	$params[DATA] = $args{dataset};
1877
1878							# get the score
1879	7					20	return get_segment_score(@params);
1880							}
1881
1882							sub _get_subfeature_scores {
1883	0			0		0	my ($self, $db, $args) = @_;
1884
1885							# get the subfeatures
1886	0					0	my $subfeatures = $self->_get_subfeatures($args->{subfeature});
1887	0	0				0	unless (@$subfeatures) {
1888	0					0	carp "no subfeatures available! Returning parent score data!";
1889							# just return the parent
1890	0					0	undef $args->{subfeature};
1891	0	0				0	delete $args->{exon} if exists $args->{exon};
1892	0					0	return $self->get_score(@$args);
1893							}
1894
1895							# collect over each subfeature
1896	0					0	my @scores;
1897	0					0	foreach my $exon (@$subfeatures) {
1898	0					0	my @params; # parameters to pass on to adapter
1899	0					0	$params[CHR] = $exon->seq_id;
1900	0					0	$params[STRT] = $exon->start;
1901	0					0	$params[STOP] = $exon->end;
1902	0	0				0	$params[STR] = defined $args->{strand} ? $args->{strand} : $exon->strand;
1903	0		0			0	$params[STND] = $args->{strandedness} \|\| $args->{stranded} \|\| 'all';
1904	0		0			0	$params[METH] = $args->{method} \|\| 'mean';
1905	0					0	$params[RETT] = 1; # return type should be an array reference of scores
1906	0					0	$params[DB] = $db;
1907	0					0	$params[DATA] = $args->{dataset};
1908
1909	0					0	my $exon_scores = get_segment_score(@params);
1910	0	0				0	push @scores, @$exon_scores if defined $exon_scores;
1911							}
1912
1913							# combine all the scores based on the requested method
1914	0					0	return calculate_score($args->{method}, \@scores);
1915							}
1916
1917							sub get_relative_point_position_scores {
1918	1			1	1	1832	my $self = shift;
1919	1					5	my %args = @_;
1920
1921							# get the database and verify the dataset
1922	1		33			12	my $ddb = $args{ddb} \|\| $args{db} \|\| $self->{data}->open_meta_database;
1923	1					5	$args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1924	1	50				3	unless ($args{dataset}) {
1925	0					0	croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1926							}
1927
1928							# assign some defaults
1929	1		50			6	$args{strandedness} \|\|= $args{stranded} \|\| 'all';
			33
1930	1		50			4	$args{position} \|\|= 5;
1931	1		50			5	$args{coordinate} \|\|= undef;
1932	1		50			4	$args{avoid} \|\|= undef;
1933	1		50			5	$args{'method'} \|\|= 'mean'; # in most cases this doesn't do anything
1934	1	50				3	unless ($args{extend}) {
1935	0					0	croak "must provide an extend value!";
1936							}
1937	1	50	33			4	$args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1938
1939							# determine reference coordinate
1940	1	50				3	unless (defined $args{coordinate}) {
1941	1					4	$args{coordinate} = $self->calculate_reference(\%args);
1942							}
1943
1944							# build parameter array to pass on to the adapter
1945	1					1	my @params;
1946	1					4	$params[CHR] = $self->seq_id;
1947	1					4	$params[STRT] = $args{coordinate} - $args{extend};
1948	1	50				4	$params[STRT] = 1 if $params[STRT] < 1; # sanity check
1949	1					14	$params[STOP] = $args{coordinate} + $args{extend};
1950	1	50				6	$params[STR] = defined $args{strand} ? $args{strand} : $self->strand;
1951	1					2	$params[STND] = $args{strandedness};
1952	1					2	$params[METH] = $args{'method'};
1953	1					2	$params[RETT] = 2; # return type should be a hash reference of positioned scores
1954	1					2	$params[DB] = $ddb;
1955	1					2	$params[DATA] = $args{dataset};
1956
1957							# Data collection
1958	1					5	my $pos2data = get_segment_score(@params);
1959
1960							# Avoid positions
1961	1	50				4	if ($args{avoid}) {
1962	0					0	$self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
1963							}
1964
1965							# covert to relative positions
1966	1	50				3	if ($args{absolute}) {
1967							# do not convert to relative positions
1968	0	0				0	return wantarray ? %$pos2data : $pos2data;
1969							}
1970							else {
1971							# return the collected dataset hash
1972							return $self->_convert_to_relative_positions($pos2data,
1973	1					4	$args{coordinate}, $params[STR]);
1974							}
1975							}
1976
1977							sub get_region_position_scores {
1978	4			4	1	7665	my $self = shift;
1979	4					12	my %args = @_;
1980
1981							# get the database and verify the dataset
1982	4		33			45	my $ddb = $args{ddb} \|\| $args{db} \|\| $self->{data}->open_meta_database;
1983	4					16	$args{dataset} = $self->{data}->verify_dataset($args{dataset}, $ddb);
1984	4	50				13	unless ($args{dataset}) {
1985	0					0	croak "provided dataset was unrecognized format or otherwise could not be verified!\n";
1986							}
1987
1988							# assign some defaults here, in case we get passed on to subfeature method
1989	4		50			25	$args{strandedness} \|\|= $args{stranded} \|\| 'all';
			33
1990	4		50			16	$args{extend} \|\|= 0;
1991	4		50			15	$args{position} \|\|= 5;
1992	4		100			10	$args{'method'} \|\|= 'mean'; # in most cases this doesn't do anything
1993	4	50	33			20	$args{avoid} = undef unless ($args{db} or $self->{data}->open_meta_database);
1994
1995							# get positioned scores over subfeatures only
1996	4		50			16	$args{subfeature} \|\|= q();
1997	4	0	33			12	if ($self->feature_type eq 'named' and $args{subfeature}) {
1998							# this is more complicated so we have a dedicated method
1999	0					0	return $self->_get_subfeature_position_scores(\%args, $ddb);
2000							}
2001
2002							# Assign coordinates
2003							# build parameter array to pass on to the adapter
2004	4					4	my @params;
2005	4		33			14	my $feature = $self->seqfeature \|\| $self;
2006	4		33			30	$params[CHR] = $args{chromo} \|\| $args{seq_id} \|\| $feature->seq_id;
2007	4		33			15	$params[STRT] = $args{start} \|\| $feature->start;
2008	4		33			20	$params[STOP] = $args{stop} \|\| $args{end} \|\| $feature->end;
2009	4	50				12	$params[STR] = defined $args{strand} ? $args{strand} : $feature->strand;
2010	4	50				12	if ($args{extend}) {
2011	0					0	$params[STRT] -= $args{extend};
2012	0					0	$params[STOP] += $args{extend};
2013	0	0				0	$params[STRT] = 1 if $params[STRT] < 1; # sanity check
2014							}
2015	4					7	$params[STND] = $args{strandedness};
2016	4					8	$params[METH] = $args{method};
2017	4					5	$params[RETT] = 2; # return type should be a hash reference of positioned scores
2018	4					8	$params[DB] = $ddb;
2019	4					4	$params[DATA] = $args{dataset};
2020
2021							# Data collection
2022	4					14	my $pos2data = get_segment_score(@params);
2023
2024							# Avoid positions
2025	4	50				23	if ($args{avoid}) {
2026	0					0	$self->_avoid_positions($pos2data, \%args, $params[CHR], $params[STRT], $params[STOP]);
2027							}
2028
2029							# covert to relative positions
2030	4	50				10	if ($args{absolute}) {
2031							# do not convert to relative positions
2032	0	0				0	return wantarray ? %$pos2data : $pos2data;
2033							}
2034							else {
2035							# return data converted to relative positions
2036	4	50				10	unless (defined $args{coordinate}) {
2037	4					18	$args{coordinate} = $self->calculate_reference(\%args);
2038							}
2039							return $self->_convert_to_relative_positions($pos2data,
2040	4					14	$args{coordinate}, $params[STR]);
2041							}
2042							}
2043
2044							sub _get_subfeature_position_scores {
2045	0			0		0	my ($self, $args, $ddb) = @_;
2046
2047							# get the subfeatures
2048	0					0	my $subfeatures = $self->_get_subfeatures($args->{subfeature});
2049	0	0				0	unless (@$subfeatures) {
2050	0					0	carp "no subfeatures available! Returning parent score data!";
2051							# just return the parent
2052	0					0	undef $args->{subfeature};
2053	0	0				0	delete $args->{exon} if exists $args->{exon};
2054	0					0	return $self->get_sequence(@$args);
2055							}
2056
2057							# reset the practical start and stop to the actual subfeatures' final start and stop
2058							# we can no longer rely on the feature start and stop, consider CDS
2059							# these subfeatures should already be genomic sorted by GeneTools
2060	0					0	my $practical_start = $subfeatures->[0]->start;
2061	0					0	my $practical_stop = $subfeatures->[-1]->end;
2062
2063							# collect over each exon
2064							# we will adjust the positions of each reported score so that
2065							# it will appear as if all the exons are adjacent to each other
2066							# and no introns exist
2067	0					0	my $pos2data = {};
2068	0					0	my $namecheck = {}; # to check unique names when using ncount method....
2069	0					0	my $current_end = $practical_start;
2070	0					0	my $adjustment = 0;
2071	0	0				0	my $fstrand = defined $args->{strand} ? $args->{strand} : $self->strand;
2072	0					0	foreach my $exon (@$subfeatures) {
2073
2074	0					0	my @params; # parameters to pass on to adapter
2075	0					0	$params[CHR] = $exon->seq_id;
2076	0					0	$params[STRT] = $exon->start;
2077	0					0	$params[STOP] = $exon->end;
2078	0					0	$params[STR] = $fstrand;
2079	0					0	$params[STND] = $args->{strandedness};
2080	0					0	$params[METH] = $args->{method};
2081	0					0	$params[RETT] = 2; # return type should be a hash reference of positioned scores
2082	0					0	$params[DB] = $ddb;
2083	0					0	$params[DATA] = $args->{dataset};
2084
2085							# collect scores
2086	0					0	my $exon_scores = get_segment_score(@params);
2087
2088							# adjust the scores
2089	0					0	$adjustment = $params[STRT] - $current_end;
2090							$self->_process_exon_scores($exon_scores, $pos2data, $adjustment, $params[STRT],
2091	0					0	$params[STOP], $namecheck, $args->{method});
2092
2093							# reset
2094	0					0	$current_end += $exon->length;
2095							}
2096
2097							# collect extensions if requested
2098	0	0				0	if ($args->{extend}) {
2099							# left side
2100	0					0	my @params; # parameters to pass on to adapter
2101	0					0	$params[CHR] = $self->seq_id;
2102	0					0	$params[STRT] = $practical_start - $args->{extend};
2103	0					0	$params[STOP] = $practical_start - 1;
2104	0					0	$params[STR] = $fstrand;
2105	0					0	$params[STND] = $args->{strandedness};
2106	0					0	$params[METH] = $args->{method};
2107	0					0	$params[RETT] = 2; # return type should be a hash reference of positioned scores
2108	0					0	$params[DB] = $ddb;
2109	0					0	$params[DATA] = $args->{dataset};
2110
2111	0					0	my $ext_scores = get_segment_score(@params);
2112
2113							# no adjustment should be needed
2114							$self->_process_exon_scores($ext_scores, $pos2data, 0, $params[STRT],
2115	0					0	$params[STOP], $namecheck, $args->{method});
2116
2117
2118							# right side
2119							# we can reuse our parameter array
2120	0					0	$params[STRT] = $practical_stop + 1;
2121	0					0	$params[STOP] = $practical_stop + $args->{extend};
2122	0					0	$ext_scores = get_segment_score(@params);
2123
2124							# the adjustment should be the same as the last exon
2125							$self->_process_exon_scores($ext_scores, $pos2data, $adjustment, $params[STRT],
2126	0					0	$params[STOP], $namecheck, $args->{method});
2127							}
2128
2129							# covert to relative positions
2130	0	0				0	if ($args->{absolute}) {
2131							# do not convert to relative positions
2132	0	0				0	return wantarray ? %$pos2data : $pos2data;
2133							}
2134							else {
2135							# return data converted to relative positions
2136							# can no longer use original coordinates, but instead the new shifted coordinates
2137	0					0	$args->{practical_start} = $practical_start;
2138	0					0	$args->{practical_stop} = $current_end;
2139	0					0	my $reference = $self->calculate_reference($args);
2140	0					0	return $self->_convert_to_relative_positions($pos2data,
2141							$reference, $fstrand);
2142							}
2143							}
2144
2145							sub calculate_reference {
2146							# my ($self, $args) = @_;
2147							# I need position
2148							# optionally alternate start, stop, and/or strand
2149	10			10	1	20	my $self = shift;
2150	10					15	my $args;
2151	10	50				21	if (scalar @_ == 1) {
2152							# single variable passed
2153	10	100				26	if (ref($_[0]) eq 'HASH') {
2154	6					11	$args = shift @_;
2155							}
2156							else {
2157							# assumed to be the position
2158	4					9	$args = {
2159							position => $_[0]
2160							};
2161							}
2162							}
2163							else {
2164							# multiple variables
2165	0					0	$args = { @_ };
2166							}
2167
2168							# calculate reference coordinate
2169	10					13	my $coordinate;
2170	10	50				34	my $strand = defined $args->{strand} ? $args->{strand} : $self->strand;
2171	10	100				35	if ($args->{position} == 5) {
		100
		100
		50
2172	6	100				16	if ($strand >= 0) {
2173	1		33			6	$coordinate = $args->{practical_start} \|\| $self->start;
2174							}
2175							else {
2176	5		33			28	$coordinate = $args->{practical_stop} \|\| $self->end;
2177							}
2178							}
2179							elsif ($args->{position} == 3) {
2180	1	50				4	if ($strand >= 0) {
2181	1		33			5	$coordinate = $args->{practical_stop} \|\| $self->end;
2182							}
2183							else {
2184	0		0			0	$coordinate = $args->{practical_start} \|\| $self->start;
2185							}
2186							}
2187							elsif ($args->{position} == 4) {
2188							# strand doesn't matter here
2189							# but practical coordinates do
2190	2	100				5	if (exists $args->{practical_start}) {
2191							$coordinate = int(
2192	1					3	( ($args->{practical_start} + $args->{practical_stop}) / 2 ) + 0.5
2193							);
2194							}
2195							else {
2196	1					53	$coordinate = $self->midpoint;
2197							}
2198							}
2199							elsif ($args->{position} == 9) {
2200							# narrowPeak coordinate
2201	1					3	$coordinate = $self->peak;
2202							}
2203							else {
2204	0					0	confess "position must be one of 5, 3, 4, or 9";
2205							}
2206	10					30	return $coordinate;
2207							}
2208
2209							sub _avoid_positions {
2210	0			0		0	my ($self, $pos2data, $args, $seqid, $start, $stop) = @_;
2211
2212							# first check the list of avoid types
2213	0	0				0	if (ref $args->{avoid} eq 'ARRAY') {
		0
		0
2214							# we have types, presume they're ok
2215							}
2216							elsif ($args->{avoid} eq '1') {
2217							# old style boolean value
2218	0	0				0	if (defined $args->{type}) {
2219	0					0	$args->{avoid} = [ $args->{type} ];
2220							}
2221							else {
2222							# no type provided, we can't avoid that which is not defined!
2223							# this is an error, but won't complain as we never did before
2224	0					0	$args->{avoid} = $self->type;
2225							}
2226							}
2227							elsif ($args->{avoid} =~ /w+/i) {
2228							# someone passed a string, a feature type perhaps?
2229	0					0	$args->{avoid} = [ $args->{avoid} ];
2230							}
2231
2232							### Check for conflicting features
2233	0		0			0	my $db = $args->{db} \|\| $self->{data}->open_meta_database;
2234							my @overlap_features = $self->get_features(
2235							seq_id => $seqid,
2236							start => $start,
2237							end => $stop,
2238							type => $args->{avoid},
2239	0					0	);
2240
2241							# get the overlapping features of the same type
2242	0	0				0	if (@overlap_features) {
2243	0					0	my $primary = $self->primary_id;
2244							# there are one or more feature of the type in this region
2245							# one of them is likely the one we're working with
2246							# but not necessarily - user may be looking outside original feature
2247							# the others are not what we want and therefore need to be
2248							# avoided
2249	0					0	foreach my $feat (@overlap_features) {
2250							# skip the one we want
2251	0	0				0	next if ($feat->primary_id eq $primary);
2252							# now eliminate those scores which overlap this feature
2253	0					0	my $start = $feat->start;
2254	0					0	my $stop = $feat->end;
2255	0					0	foreach my $position (keys %$pos2data) {
2256							# delete the scored position if it overlaps with
2257							# the offending feature
2258	0	0	0			0	if (
2259							$position >= $start and
2260							$position <= $stop
2261							) {
2262	0					0	delete $pos2data->{$position};
2263							}
2264							}
2265							}
2266							}
2267							}
2268
2269							sub _convert_to_relative_positions {
2270	5			5		13	my ($self, $pos2data, $position, $strand) = @_;
2271
2272	5					7	my %relative_pos2data;
2273	5	50				16	if ($strand >= 0) {
		50
2274	0					0	foreach my $p (keys %$pos2data) {
2275	0					0	$relative_pos2data{ $p - $position } = $pos2data->{$p};
2276							}
2277							}
2278							elsif ($strand < 0) {
2279	5					26	foreach my $p (keys %$pos2data) {
2280	223					329	$relative_pos2data{ $position - $p } = $pos2data->{$p};
2281							}
2282							}
2283	5	50				131	return wantarray ? %relative_pos2data : \%relative_pos2data;
2284							}
2285
2286							sub _process_exon_scores {
2287	0			0			my ($self, $exon_scores, $pos2data, $adjustment, $start, $end,
2288							$namecheck, $method) = @_;
2289
2290							# ncount method
2291	0	0					if ($method eq 'ncount') {
2292							# we need to check both names and adjust position
2293	0						foreach my $p (keys %$exon_scores) {
2294	0	0	0				next unless ($p >= $start and $p <= $end);
2295	0						foreach my $n (@{ $exon_scores->{$p} }) {
	0
2296	0	0					if (exists $namecheck->{$n} ) {
2297	0						$namecheck->{$n}++;
2298	0						next;
2299							}
2300							else {
2301	0						$namecheck->{$n} = 1;
2302	0						my $a = $p - $adjustment;
2303	0		0				$pos2data->{$a} \|\|= [];
2304	0						push @{ $pos2data->{$a} }, $n;
	0
2305							}
2306							}
2307							}
2308							}
2309							else {
2310							# just adjust scores
2311	0						foreach my $p (keys %$exon_scores) {
2312	0	0	0				next unless ($p >= $start and $p <= $end);
2313	0						$pos2data->{ $p - $adjustment } = $exon_scores->{$p};
2314							}
2315							}
2316							}
2317
2318							sub fetch_alignments {
2319	0			0	1		my $self = shift;
2320	0						my %args = @_;
2321	0		0				$args{db} \|\|= $args{dataset} \|\| undef;
			0
2322	0		0				$args{data} \|\|= undef;
2323	0		0				$args{callback} \|\|= undef;
2324	0		0				$args{subfeature} \|\|= q();
2325
2326							# verify - trusting that these are valid, else they will fail lower down in the code
2327	0	0					unless ($args{db}) {
2328	0						croak "must provide a Bam object database to fetch alignments!\n";
2329							}
2330	0	0	0				unless ($args{data} and ref($args{data}) eq 'HASH') {
2331	0						croak "must provide a data HASH for the fetch callback!\n";
2332							}
2333	0	0					unless ($args{callback}) {
2334	0						croak "must provide a callback code reference!\n";
2335							}
2336
2337							# array of features to iterate, probably just one or subfeatures
2338	0						my @intervals;
2339	0	0	0				if ($self->feature_type eq 'named' and $args{subfeature}) {
2340							# we have subfeatures to iterate over
2341
2342							# get the subfeatures
2343	0						my $subfeatures = $self->_get_subfeatures($args{subfeature});
2344	0	0					if (@$subfeatures) {
2345	0						foreach my $sf (@$subfeatures) {
2346	0						push @intervals, [
2347							$sf->start - 1,
2348							$sf->end
2349							];
2350							}
2351							}
2352							else {
2353							# zere subfeatures? just take the parent then
2354							push @intervals, [
2355							($args{start} \|\| $self->start) - 1,
2356	0		0				$args{stop} \|\| $args{end} \|\| $self->end
			0
2357							];
2358							}
2359							}
2360							else {
2361							# take feature as is
2362							push @intervals, [
2363							($args{start} \|\| $self->start) - 1,
2364	0		0				$args{stop} \|\| $args{end} \|\| $self->end
			0
2365							];
2366							}
2367
2368							# get the target id for the chromosome
2369							# this will fail if the user didn't provide a real bam object!!!
2370	0						my ($tid, undef, undef) = $args{db}->header->parse_region($self->seq_id);
2371	0	0					return unless defined $tid;
2372
2373							# now iterate over the intervals
2374	0						foreach my $i (@intervals) {
2375	0						$args{data}->{start} = $i->[0];
2376	0						$args{data}->{end} = $i->[1];
2377	0						$args{data}->{strand} = $self->strand;
2378							low_level_bam_fetch(
2379							$args{db},
2380							$tid,
2381							$i->[0],
2382							$i->[1],
2383							$args{callback},
2384							$args{data}
2385	0						);
2386							}
2387
2388							# nothing to return since we're using a data reference
2389	0						return 1;
2390							}
2391
2392
2393
2394							### String export
2395
2396							sub bed_string {
2397	0			0	1		my $self = shift;
2398	0						my %args = @_;
2399	0		0				$args{bed} \|\|= 6; # number of bed columns
2400	0	0					croak "bed count must be an integer!" unless $args{bed} =~ /^\d+$/;
2401	0	0					croak "bed count must be at least 3!" unless $args{bed} >= 3;
2402
2403							# coordinate information
2404	0						$self->seqfeature; # retrieve the seqfeature object first
2405	0		0				my $chr = $args{chromo} \|\| $args{seq_id} \|\| $self->seq_id;
2406	0		0				my $start = $args{start} \|\| $self->start;
2407	0		0				my $stop = $args{stop} \|\| $args{end} \|\| $self->stop \|\|
2408							$start + $self->length - 1 \|\| $start;
2409	0	0	0				if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
			0
			0
2410	0						carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2411	0						return;
2412							}
2413	0	0					if ($start > $stop) {
2414							# reversed coordinates? old school way of setting reverse strand
2415	0						my $s = $start;
2416	0						$start = $stop;
2417	0						$stop = $s;
2418	0						$args{strand} = -1; # this will override any user provided data?
2419							}
2420	0						$start -= 1; # 0-based coordinates
2421	0						my $string = "$chr\t$start\t$stop";
2422
2423							# additional information
2424	0	0					if ($args{bed} >= 4) {
2425	0		0				my $name = $args{name} \|\| $self->name \|\| 'Feature_' . $self->line_number;
2426	0						$string .= "\t$name";
2427							}
2428	0	0					if ($args{bed} >= 5) {
2429	0	0					my $score = exists $args{score} ? $args{score} : $self->score;
2430	0	0					$score = 1 unless defined $score;
2431	0						$string .= "\t$score";
2432							}
2433	0	0					if ($args{bed} >= 6) {
2434	0						my $s;
2435	0	0	0				if (exists $args{strand} and defined $args{strand}) {
2436	0						$s = $self->_strand($args{strand});
2437							}
2438							else {
2439	0						$s = $self->strand;
2440							}
2441	0	0					$string .= sprintf("\t%s", $s == -1 ? '-' : '+')
2442							}
2443							# we could go on with other columns, but there's no guarantee that additional
2444							# information is available, and we would have to implement user provided data
2445
2446							# done
2447	0						return $string;
2448							}
2449
2450							sub gff_string {
2451	0			0	1		my $self = shift;
2452	0						my %args = @_;
2453
2454							# coordinate information
2455	0						$self->seqfeature; # retrieve the seqfeature object first
2456	0		0				my $chr = $args{chromo} \|\| $args{seq_id} \|\| $self->seq_id;
2457	0		0				my $start = $args{start} \|\| $self->start;
2458	0		0				my $stop = $args{stop} \|\| $args{end} \|\| $self->stop \|\|
2459							$start + $self->length - 1 \|\| $start;
2460	0	0	0				if ($chr eq '.' or not CORE::length($chr) or $start eq '.' or not CORE::length($start)) {
			0
			0
2461	0						carp sprintf("no valid seq_id or start for data line %d", $self->line_number);
2462	0						return;
2463							}
2464	0	0					if ($start > $stop) {
2465							# reversed coordinates? old school way of setting reverse strand
2466	0						my $s = $start;
2467	0						$start = $stop;
2468	0						$stop = $s;
2469	0						$args{strand} = -1; # this will override any user provided data?
2470							}
2471	0						my $strand;
2472	0	0	0				if (exists $args{strand} and defined $args{strand}) {
2473	0						$strand = $self->_strand($args{strand});
2474							}
2475							else {
2476	0						$strand = $self->strand;
2477							}
2478	0	0					$strand = $strand == -1 ? '-' : $strand == 1 ? '+' : '.';
		0
2479
2480							# type information
2481	0		0				my $type = $args{type} \|\| $self->type \|\| undef;
2482	0						my ($source, $primary_tag);
2483	0	0	0				if (defined $type and $type =~ /:/) {
2484	0						($primary_tag, $source) = split /:/, $type;
2485							}
2486	0	0					unless ($source) {
2487	0		0				$source = $args{source} \|\| '.';
2488							}
2489	0	0					unless ($primary_tag) {
2490	0	0	0				$primary_tag = $args{primary_tag} \|\| defined $type ? $type : '.';
2491							}
2492
2493							# score
2494	0	0					my $score = exists $args{score} ? $args{score} : $self->score;
2495	0	0					$score = '.' unless defined $score;
2496	0						my $phase = '.'; # do not even bother!!!!
2497
2498							# attributes
2499	0		0				my $name = $args{name} \|\| $self->name \|\| 'Feature_' . $self->line_number;
2500	0						my $attributes = "Name=$name";
2501	0		0				my $id = $args{id} \|\| sprintf("%08d", $self->line_number);
2502	0						$attributes .= ";ID=$id";
2503	0	0	0				if (exists $args{attributes} and ref($args{attributes}) eq 'ARRAY') {
2504	0						foreach my $i (@{$args{attributes}}) {
	0
2505	0						my $k = $self->{data}->name($i);
2506	0						$k =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0
2507	0						my $v = $self->value($i);
2508	0						$v =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
	0
2509	0						$attributes .= ";$k=$v";
2510							}
2511							}
2512
2513							# done
2514	0						my $string = join("\t", $chr, $source, $primary_tag, $start, $stop, $score,
2515							$strand, $phase, $attributes);
2516	0						return $string;
2517							}
2518
2519
2520							__END__