line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
############################################################################### |
2
|
|
|
|
|
|
|
# # |
3
|
|
|
|
|
|
|
# Copyright © 2012-2013 -- IRB/INSERM # |
4
|
|
|
|
|
|
|
# (Institut de Recherche en Biothérapie / # |
5
|
|
|
|
|
|
|
# Institut National de la Santé et de la # |
6
|
|
|
|
|
|
|
# Recherche Médicale) # |
7
|
|
|
|
|
|
|
# # |
8
|
|
|
|
|
|
|
# Auteurs/Authors: Jerôme AUDOUX # |
9
|
|
|
|
|
|
|
# Nicolas PHILIPPE # |
10
|
|
|
|
|
|
|
# # |
11
|
|
|
|
|
|
|
# ------------------------------------------------------------------------- # |
12
|
|
|
|
|
|
|
# # |
13
|
|
|
|
|
|
|
# Ce fichier fait partie de la suite CracTools qui contient plusieurs pipeline# |
14
|
|
|
|
|
|
|
# intégrés permettant de traiter les évênements biologiques présents dans du # |
15
|
|
|
|
|
|
|
# RNA-Seq. Les CracTools travaillent à partir d'un fichier SAM de CRAC et d'un# |
16
|
|
|
|
|
|
|
# fichier d'annotation au format GFF3. # |
17
|
|
|
|
|
|
|
# # |
18
|
|
|
|
|
|
|
# Ce logiciel est régi par la licence CeCILL soumise au droit français et # |
19
|
|
|
|
|
|
|
# respectant les principes de diffusion des logiciels libres. Vous pouvez # |
20
|
|
|
|
|
|
|
# utiliser, modifier et/ou redistribuer ce programme sous les conditions de # |
21
|
|
|
|
|
|
|
# la licence CeCILL telle que diffusée par le CEA, le CNRS et l'INRIA sur # |
22
|
|
|
|
|
|
|
# le site "http://www.cecill.info". # |
23
|
|
|
|
|
|
|
# # |
24
|
|
|
|
|
|
|
# En contrepartie de l'accessibilité au code source et des droits de copie, # |
25
|
|
|
|
|
|
|
# de modification et de redistribution accordés par cette licence, il n'est # |
26
|
|
|
|
|
|
|
# offert aux utilisateurs qu'une garantie limitée. Pour les mêmes raisons, # |
27
|
|
|
|
|
|
|
# seule une responsabilité restreinte pèse sur l'auteur du programme, le # |
28
|
|
|
|
|
|
|
# titulaire des droits patrimoniaux et les concédants successifs. # |
29
|
|
|
|
|
|
|
# # |
30
|
|
|
|
|
|
|
# À cet égard l'attention de l'utilisateur est attirée sur les risques # |
31
|
|
|
|
|
|
|
# associés au chargement, à l'utilisation, à la modification et/ou au # |
32
|
|
|
|
|
|
|
# développement et à la reproduction du logiciel par l'utilisateur étant # |
33
|
|
|
|
|
|
|
# donné sa spécificité de logiciel libre, qui peut le rendre complexe à # |
34
|
|
|
|
|
|
|
# manipuler et qui le réserve donc à des développeurs et des professionnels # |
35
|
|
|
|
|
|
|
# avertis possédant des connaissances informatiques approfondies. Les # |
36
|
|
|
|
|
|
|
# utilisateurs sont donc invités à charger et tester l'adéquation du # |
37
|
|
|
|
|
|
|
# logiciel à leurs besoins dans des conditions permettant d'assurer la # |
38
|
|
|
|
|
|
|
# sécurité de leurs systêmes et ou de leurs données et, plus généralement, # |
39
|
|
|
|
|
|
|
# à l'utiliser et l'exploiter dans les mêmes conditions de sécurité. # |
40
|
|
|
|
|
|
|
# # |
41
|
|
|
|
|
|
|
# Le fait que vous puissiez accéder à cet en-tête signifie que vous avez # |
42
|
|
|
|
|
|
|
# pris connaissance de la licence CeCILL, et que vous en avez accepté les # |
43
|
|
|
|
|
|
|
# termes. # |
44
|
|
|
|
|
|
|
# # |
45
|
|
|
|
|
|
|
# ------------------------------------------------------------------------- # |
46
|
|
|
|
|
|
|
# # |
47
|
|
|
|
|
|
|
# This file is part of the CracTools which provide several integrated # |
48
|
|
|
|
|
|
|
# pipeline to analyze biological events present in RNA-Seq data. CracTools # |
49
|
|
|
|
|
|
|
# work on a SAM file generated by CRAC and an annotation file in GFF3 format.# |
50
|
|
|
|
|
|
|
# # |
51
|
|
|
|
|
|
|
# This software is governed by the CeCILL license under French law and # |
52
|
|
|
|
|
|
|
# abiding by the rules of distribution of free software. You can use, # |
53
|
|
|
|
|
|
|
# modify and/ or redistribute the software under the terms of the CeCILL # |
54
|
|
|
|
|
|
|
# license as circulated by CEA, CNRS and INRIA at the following URL # |
55
|
|
|
|
|
|
|
# "http://www.cecill.info". # |
56
|
|
|
|
|
|
|
# # |
57
|
|
|
|
|
|
|
# As a counterpart to the access to the source code and rights to copy, # |
58
|
|
|
|
|
|
|
# modify and redistribute granted by the license, users are provided only # |
59
|
|
|
|
|
|
|
# with a limited warranty and the software's author, the holder of the # |
60
|
|
|
|
|
|
|
# economic rights, and the successive licensors have only limited # |
61
|
|
|
|
|
|
|
# liability. # |
62
|
|
|
|
|
|
|
# # |
63
|
|
|
|
|
|
|
# In this respect, the user's attention is drawn to the risks associated # |
64
|
|
|
|
|
|
|
# with loading, using, modifying and/or developing or reproducing the # |
65
|
|
|
|
|
|
|
# software by the user in light of its specific status of free software, # |
66
|
|
|
|
|
|
|
# that may mean that it is complicated to manipulate, and that also # |
67
|
|
|
|
|
|
|
# therefore means that it is reserved for developers and experienced # |
68
|
|
|
|
|
|
|
# professionals having in-depth computer knowledge. Users are therefore # |
69
|
|
|
|
|
|
|
# encouraged to load and test the software's suitability as regards their # |
70
|
|
|
|
|
|
|
# requirements in conditions enabling the security of their systems and/or # |
71
|
|
|
|
|
|
|
# data to be ensured and, more generally, to use and operate it in the same # |
72
|
|
|
|
|
|
|
# conditions as regards security. # |
73
|
|
|
|
|
|
|
# # |
74
|
|
|
|
|
|
|
# The fact that you are presently reading this means that you have had # |
75
|
|
|
|
|
|
|
# knowledge of the CeCILL license and that you accept its terms. # |
76
|
|
|
|
|
|
|
# # |
77
|
|
|
|
|
|
|
############################################################################### |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=head1 NAME |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
CracTools::GFF::Query - Query GFF files easily. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head1 SYNOPSIS |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
Usage: |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
use CracTools::GFF::Query; |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Creating the reader |
90
|
|
|
|
|
|
|
my $gffQuery = CracTools::GFF::Query->new('annotations.gff'); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
my @annotations = @{ $gffQuery->fetchByLocation('1',298345,1) }; |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
foreach my $gff_line (@annotations) { |
95
|
|
|
|
|
|
|
my $annotation = CracTools::GFF::Annotation->new($gff_line); |
96
|
|
|
|
|
|
|
print "Gene_id : ",$annotation->getAttribute('gene_id'),"\n"; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head1 DESCRIPTION |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
CracTools::GFF::Query is a tool to query GFF files without building a database. |
102
|
|
|
|
|
|
|
It is memory efficient and designed to run fast. |
103
|
|
|
|
|
|
|
You can easily retrieve GFF data from a specific region or position. |
104
|
|
|
|
|
|
|
This tool can be used with CracTools::GFF::Annotation in order to parse GFF lines |
105
|
|
|
|
|
|
|
into a nice usable Perl Object. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=cut |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
package CracTools::GFF::Query; |
110
|
|
|
|
|
|
|
|
111
|
2
|
|
|
2
|
|
37223
|
use strict; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
94
|
|
112
|
2
|
|
|
2
|
|
13
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
73
|
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
#use Storable; # for persistency |
115
|
|
|
|
|
|
|
|
116
|
2
|
|
|
2
|
|
2042
|
use Set::IntervalTree; |
|
2
|
|
|
|
|
46904
|
|
|
2
|
|
|
|
|
173
|
|
117
|
2
|
|
|
2
|
|
23
|
use Fcntl qw( SEEK_SET ); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
111
|
|
118
|
2
|
|
|
2
|
|
13
|
use Carp; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
2116
|
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head1 METHODS |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 new |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Arg [1] : String - GFF file |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
Example : my $gffQuery = CracTools::GFF::Query->new('annotations.gff'); |
127
|
|
|
|
|
|
|
Description : Create a new GFF Query object |
128
|
|
|
|
|
|
|
ReturnType : CracTools::GFF::Query |
129
|
|
|
|
|
|
|
Exceptions : Dies if the GFF file argument is missing or if the file cannot be opened |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=cut |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Constructor: remembers the GFF file path, then lets _init() index the file
# before the freshly blessed object is handed back to the caller.
sub new {
    my ($class, $gff_file) = @_;

    my %fields = ( GFF_FILE => $gff_file );
    my $self   = bless \%fields, $class;

    $self->_init();

    return $self;
}
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head2 fetchByRegion |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Arg [1] : String $seq_region_name |
150
|
|
|
|
|
|
|
The name of the sequence region that the slice will be |
151
|
|
|
|
|
|
|
created on. |
152
|
|
|
|
|
|
|
Arg [2] : int $start |
153
|
|
|
|
|
|
|
The start of the slice on the sequence region |
154
|
|
|
|
|
|
|
Arg [3] : int $end |
155
|
|
|
|
|
|
|
The end of the slice on the sequence region |
156
|
|
|
|
|
|
|
Arg [4] : int $strand |
157
|
|
|
|
|
|
|
The orientation of the slice on the sequence region |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Example : my $annotations = $gffQuery->fetchByRegion('1',298345,309209,1); |
160
|
|
|
|
|
|
|
Description : Retrieves GFF lines that belong to the region. |
161
|
|
|
|
|
|
|
ReturnType : Reference to an Array of strings |
162
|
|
|
|
|
|
|
Exceptions : none |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Return a reference to an array holding the raw GFF lines indexed for the
# given ($chr, $strand) pair that the interval tree reports for the
# [$pos_start, $pos_end] region. The array is empty when nothing is indexed
# for that pair.
sub fetchByRegion {
    my ($self, $chr, $pos_start, $pos_end, $strand) = @_;

    my @gff_lines;
    my $interval_tree = $self->_getAnnotations($chr, $strand);

    # No tree for this chromosome/strand: nothing to look up.
    return \@gff_lines unless defined $interval_tree;

    # The tree works on [a,b) intervals, hence the start shifted by one.
    my $offsets = $interval_tree->fetch($pos_start - 1, $pos_end);

    # Each value stored in the tree is a file offset: jump there and read
    # exactly one line.
    my $fh = $self->_gffFilehandle;
    for my $offset (@$offsets) {
        seek($fh, $offset, SEEK_SET);
        push @gff_lines, scalar readline($fh);
    }

    return \@gff_lines;
}
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=head2 fetchByLocation |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Arg [1] : String $seq_region_name |
191
|
|
|
|
|
|
|
The name of the sequence region that the slice will be |
192
|
|
|
|
|
|
|
created on. |
193
|
|
|
|
|
|
|
Arg [2] : int $position |
194
|
|
|
|
|
|
|
Location to look for |
195
|
|
|
|
|
|
|
Arg [3] : int $strand |
196
|
|
|
|
|
|
|
The orientation of the slice on the sequence region |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
Example : my $annotations = $gffQuery->fetchByLocation('1',298345,1); |
199
|
|
|
|
|
|
|
Description : Retrieves GFF lines that overlap the given location. |
200
|
|
|
|
|
|
|
ReturnType : Reference to an Array of strings |
201
|
|
|
|
|
|
|
Exceptions : none |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=cut |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# Single-position lookup: a location is queried as a region whose start and
# end are both $position.
sub fetchByLocation {
    my $self = shift;
    my ($chr, $position, $strand) = @_;

    return $self->fetchByRegion($chr, $position, $position, $strand);
}
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head1 GETTERS AND SETTERS |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 gffFile |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
Description : Getter method for the attribute gff_file |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# Read-only accessor for the GFF file path stored under GFF_FILE.
sub gffFile {
    my ($self) = @_;
    my $path = $self->{GFF_FILE};
    return $path;
}
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head1 PRIVATE METHODS |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=cut |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# Private accessor for the read handle stored under gff_fh.
sub _gffFilehandle {
    my ($self) = @_;
    my $handle = $self->{gff_fh};
    return $handle;
}
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# Index the GFF file and keep an open read handle for later queries.
#
# For every non-comment line that has the chromosome, start, end and strand
# columns, the byte offset of the line is inserted into a Set::IntervalTree
# under the interval [start-1, end). There is one tree per key returned by
# _getAnnotationHashKey($chr, convertStrand($strand)).
#
# Reads  : $self->{GFF_FILE}
# Writes : $self->{gff_fh}       (read handle on the GFF file)
#          $self->{ANNOTATIONS}  (hash ref: key => Set::IntervalTree)
# Dies   : confess() when no GFF file was given, die() when it cannot be opened.
sub _init {
    my $self = shift;
    my %annotations;

    if (!defined $self->{GFF_FILE}) {
        confess "Missing GFF file argument";
    }

    # Three-argument open on a lexical handle: the file name can never be
    # interpreted as an open mode, and no global bareword handle is clobbered.
    # The same handle is kept afterwards for the seek()-based lookups.
    open(my $gff_fh, '<', $self->{GFF_FILE})
        or die("Cannot open file ".$self->{GFF_FILE}.": $!");

    my $next_offset = tell($gff_fh);
    while (defined(my $line = <$gff_fh>)) {
        # Offset of the line just read. It is refreshed on every iteration,
        # including skipped header lines, so it never points at a stale line.
        my $pos = $next_offset;
        $next_offset = tell($gff_fh);

        # skip headers
        next if $line =~ /^#/;

        # Columns 1, 4, 5 and 7: chromosome, start, end, strand.
        my ($chr, $start, $end, $strand) = (split("\t", $line, 8))[0, 3, 4, 6];
        next unless defined $start && defined $end && defined $chr && defined $strand;

        $strand = convertStrand($strand);
        my $key = $self->_getAnnotationHashKey($chr, $strand);
        if (!defined $annotations{$key}) {
            $annotations{$key} = Set::IntervalTree->new;
        }

        # Minus one because gff is 1-based
        $annotations{$key}->insert($pos, $start - 1, $end);
    }

    $self->{gff_fh} = $gff_fh;
    $self->{ANNOTATIONS} = \%annotations;
}
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# Build the annotation index key for a chromosome/strand pair: the two values
# joined by '@'.
sub _getAnnotationHashKey {
    my $self = shift;
    my ($chr, $strand) = @_;

    return join('@', $chr, $strand);
}
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# Split an annotation index key on '@' and return its first two fields as
# the list (chromosome, strand).
sub _extractAnnotationHashKey {
    my ($self, $key) = @_;

    my ($seq_name, $orientation) = split('@', $key);

    return ($seq_name, $orientation);
}
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# Look up the value stored in ANNOTATIONS for a chromosome/strand pair
# (undef when that pair has no entry).
sub _getAnnotations {
    my $self = shift;
    my ($chr, $strand) = @_;

    my $key = $self->_getAnnotationHashKey($chr, $strand);

    return $self->{ANNOTATIONS}{$key};
}
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=head1 STATIC METHODS |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=head2 convertStrand |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Arg [1] : Character - strand using '+' and '-' signs |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Description : Return the strand using the (1,-1) convention |
295
|
|
|
|
|
|
|
instead of the ('+','-') convention of GFF files. |
296
|
|
|
|
|
|
|
=cut |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# Translate a strand between the two notations: '+' <-> 1 and '-' <-> -1.
# Any other value has no entry in the table, so the lookup yields undef.
sub convertStrand($) {
    my ($strand) = @_;

    my %other_notation = (
        '+'  => 1,
        '-'  => -1,
        '1'  => '+',
        '-1' => '-',
    );

    return $other_notation{$strand};
}
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
1; |