File Coverage

Bio/ClusterIO/unigene.pm

Criterion	Covered	Total	%
statement	68	70	97.1
branch	65	72	90.2
condition			n/a
subroutine	6	6	100.0
pod	1	1	100.0
total	140	149	93.9

line	stmt	bran	sub	pod	time	code
1						# BioPerl module for Bio::ClusterIO::unigene
2						#
3						# Please direct questions and support issues to
4						#
5						# Cared for by Andrew Macgregor
6						#
7						# Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
8						# Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
9						# http://meg.otago.ac.nz
10						#
11						# You may distribute this module under the same terms as perl itself
12						#
13						# _history
14						# April 17, 2002 - Initial implementation by Andrew Macgregor
15
16						# POD documentation - main docs before the code
17
18						=head1 NAME
19
20						Bio::ClusterIO::unigene - UniGene input stream
21
22						=head1 SYNOPSIS
23
24						Do not use this module directly. Use it via the Bio::ClusterIO class.
25
26						=head1 DESCRIPTION
27
28						This object reads from Unigene *.data files downloaded from
29						ftp://ftp.ncbi.nih.gov/repository/UniGene/. It does not download and
30						decompress the file, you have to do that yourself.
31
32						=head1 FEEDBACK
33
34						=head2 Mailing Lists
35
36						User feedback is an integral part of the evolution of this and other
37						Bioperl modules. Send your comments and suggestions preferably to one
38						of the Bioperl mailing lists. Your participation is much appreciated.
39
40						bioperl-l@bioperl.org - General discussion
41						http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42
43						=head2 Support
44
45						Please direct usage questions or support issues to the mailing list:
46
47						I
48
49						rather than to the module maintainer directly. Many experienced and
50						reponsive experts will be able look at the problem and quickly
51						address it. Please include a thorough description of the problem
52						with code and data examples if at all possible.
53
54						=head2 Reporting Bugs
55
56						Report bugs to the Bioperl bug tracking system to help us keep track
57						the bugs and their resolution. Bug reports can be submitted via the
58						web:
59
60						https://github.com/bioperl/bioperl-live/issues
61
62						=head1 AUTHORS - Andrew Macgregor
63
64						Email: andrew at cbbc.murdoch.edu.au
65
66
67						=head1 APPENDIX
68
69						The rest of the documentation details each of the object
70						methods. Internal methods are usually preceded with a _
71
72						=cut
73
74						#'
75						# Let the code begin...
76
77						package Bio::ClusterIO::unigene;
78	1		1		5	use strict;
	1				2
	1				27
79
80	1		1		350	use Bio::Cluster::UniGene;
	1				2
	1				31
81	1		1		327	use Bio::Cluster::ClusterFactory;
	1				2
	1				27
82
83	1		1		5	use base qw(Bio::ClusterIO);
	1				2
	1				778
84
85						my %line_is = (
86						ID => q/ID\s+(\w{2,3}\.\d+)/,
87						TITLE => q/TITLE\s+(\S.*)/,
88						GENE => q/GENE\s+(\S.*)/,
89						CYTOBAND => q/CYTOBAND\s+(\S.*)/,
90						MGI => q/MGI\s+(\S.*)/,
91						LOCUSLINK => q/LOCUSLINK\s+(\S.*)/,
92						HOMOL => q/HOMOL\s+(\S.*)/,
93						EXPRESS => q/EXPRESS\s+(\S.*)/,
94						RESTR_EXPR => q/RESTR_EXPR\s+(\S.*)/,
95						GNM_TERMINUS => q/GNM_TERMINUS\s+(\S.*)/,
96						CHROMOSOME => q/CHROMOSOME\s+(\S.*)/,
97						STS => q/STS\s+(\S.*)/,
98						TXMAP => q/TXMAP\s+(\S.*)/,
99						PROTSIM => q/PROTSIM\s+(\S.*)/,
100						SCOUNT => q/SCOUNT\s+(\S.*)/,
101						SEQUENCE => q/SEQUENCE\s+(\S.*)/,
102						ACC => q/ACC=(\w+)(\.\d+)?/,
103						NID => q/NID=\s(\S.)/,
104						PID => q/PID=\s(\S.)/,
105						CLONE => q/CLONE=\s(\S.)/,
106						END => q/END=\s(\S.)/,
107						LID => q/LID=\s(\S.)/,
108						MGC => q/MGC=\s(\S.)/,
109						SEQTYPE => q/SEQTYPE=\s(\S.)/,
110						TRACE => q/TRACE=\s(\S.)/,
111						PERIPHERAL => q/PERIPHERAL=\s(\S.)/,
112						DELIMITER => q{^//},
113						);
114
115						# we set the right factory here
116						sub _initialize {
117	1		1		2	my($self, @args) = @_;
118
119	1				5	$self->SUPER::_initialize(@args);
120	1	50			4	if(! $self->cluster_factory()) {
121	1				5	$self->cluster_factory(Bio::Cluster::ClusterFactory->new(
122						-type => 'Bio::Cluster::UniGene'));
123						}
124						}
125
126						=head2 next_cluster
127
128						Title : next_cluster
129						Usage : $unigene = $stream->next_cluster()
130						Function: returns the next unigene in the stream
131						Returns : Bio::Cluster::UniGene object
132						Args : NONE
133
134						=cut
135
136						sub next_cluster {
137	3		3	1	1801	my( $self) = @_;
138	3				11	local $/ = "\n//";
139	3	50			17	return unless my $entry = $self->_readline;
140
141						# set up the variables we'll need
142	3				7	my (%unigene,@express,@locuslink,@chromosome,
143						@sts,@txmap,@protsim,@sequence);
144	3				0	my $UGobj;
145
146						# set up the regexes
147
148						# add whitespace parsing and precompile regexes
149						#foreach (values %line_is) {
150						# $_ =~ s/\s+/\\s+/g;
151						# print STDERR "Regex is $_\n";
152						# #$_ = qr/$_/x;
153						#}
154
155						#$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/;
156
157						# run each line in an entry against the regexes
158	3				34	foreach my $line (split /\n/, $entry) {
159						#print STDERR "Wanting to match $line\n";
160	112	100			1201	if ($line =~ /$line_is{ID}/gcx) {
		100
		100
		100
		50
		100
		100
		100
		100
		100
		100
		50
		100
		100
		100
		100
		100
161	3				11	$unigene{ID} = $1;
162						}
163						elsif ($line =~ /$line_is{TITLE}/gcx ) {
164						#print STDERR "MATCHED with [$1]\n";
165	3				7	$unigene{TITLE} = $1;
166						}
167						elsif ($line =~ /$line_is{GENE}/gcx) {
168	1				3	$unigene{GENE} = $1;
169						}
170						elsif ($line =~ /$line_is{CYTOBAND}/gcx) {
171	1				2	$unigene{CYTOBAND} = $1;
172						}
173						elsif ($line =~ /$line_is{MGI}/gcx) {
174	0				0	$unigene{MGI} = $1;
175						}
176						elsif ($line =~ /$line_is{LOCUSLINK}/gcx) {
177	1				4	@locuslink = split /;/, $1;
178						}
179						elsif ($line =~ /$line_is{HOMOL}/gcx) {
180	2				7	$unigene{HOMOL} = $1;
181						}
182						elsif ($line =~ /$line_is{EXPRESS}/gcx) {
183	3				7	my $express = $1;
184						# remove initial semicolon if present
185	3				5	$express =~ s/^;//;
186	3				19	@express = split /\s*;/, $express;
187						}
188						elsif ($line =~ /$line_is{RESTR_EXPR}/gcx) {
189	1				3	$unigene{RESTR_EXPR} = $1;
190						}
191						elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) {
192	1				2	$unigene{GNM_TERMINUS} = $1;
193						}
194						elsif ($line =~ /$line_is{CHROMOSOME}/gcx) {
195	2				6	push @chromosome, $1;
196						}
197						elsif ($line =~ /$line_is{TXMAP}/gcx) {
198	0				0	push @txmap, $1;
199						}
200						elsif ($line =~ /$line_is{STS}/gcx) {
201	10				19	push @sts, $1;
202						}
203						elsif ($line =~ /$line_is{PROTSIM}/gcx) {
204	10				19	push @protsim, $1;
205						}
206						elsif ($line =~ /$line_is{SCOUNT}/gcx) {
207	3				7	$unigene{SCOUNT} = $1;
208						}
209						elsif ($line =~ /$line_is{SEQUENCE}/gcx) {
210						# parse into each sequence line
211	66				114	my $seq = {};
212						# add unigene id to each seq
213						#$seq->{unigene_id} = $unigene{ID};
214	66				175	my @items = split(/;/, $1);
215	66				87	foreach (@items) {
216	403	100			1753	if (/$line_is{ACC}/gcx) {
		100
		100
		100
		100
		100
		100
		100
		100
		50
217	66				120	$seq->{acc} = $1;
218						# remove leading dot if version pattern matched
219	66	50			161	$seq->{version} = substr($2,1) if defined $2;
220						}
221						elsif (/$line_is{NID}/gcx) {
222	66				124	$seq->{nid} = $1;
223						}
224						elsif (/$line_is{PID}/gcx) {
225	6				15	$seq->{pid} = $1;
226						}
227						elsif (/$line_is{CLONE}/gcx) {
228	58				100	$seq->{clone} = $1;
229						}
230						elsif (/$line_is{END}/gcx) {
231	53				92	$seq->{end} = $1;
232						}
233						elsif (/$line_is{LID}/gcx) {
234	60				113	$seq->{lid} = $1;
235						}
236						elsif (/$line_is{MGC}/gcx) {
237	5				12	$seq->{mgc} = $1;
238						}
239						elsif (/$line_is{SEQTYPE}/gcx) {
240	66				118	$seq->{seqtype} = $1;
241						}
242						elsif (/$line_is{TRACE}/gcx) {
243	21				56	$seq->{trace} = $1;
244						}
245						elsif (/$line_is{PERIPHERAL}/gcx) {
246	2				7	$seq->{peripheral} = $1;
247						}
248						}
249	66				156	push @sequence, $seq;
250						}
251						elsif ($line =~ /$line_is{DELIMITER}/gcx) {
252						# at the end of the record, add data to the object
253						$UGobj = $self->cluster_factory->create_object(
254						-display_id => $unigene{ID},
255						-description => $unigene{TITLE},
256						-size => $unigene{SCOUNT},
257	3				13	-members => \@sequence);
258	3	100			12	$UGobj->gene($unigene{GENE}) if defined ($unigene{GENE});
259	3	100			7	$UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND});
260	3	50			6	$UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI});
261	3				9	$UGobj->locuslink(\@locuslink);
262	3	100			11	$UGobj->homol($unigene{HOMOL}) if defined ($unigene{HOMOL});
263	3				8	$UGobj->express(\@express);
264	3	100			10	$UGobj->restr_expr($unigene{RESTR_EXPR}) if defined ($unigene{RESTR_EXPR});
265	3	100			8	$UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS});
266	3				7	$UGobj->chromosome(\@chromosome);
267	3				14	$UGobj->sts(\@sts);
268	3				9	$UGobj->txmap(\@txmap);
269	3				6	$UGobj->protsim(\@protsim);
270						}
271						}
272	3				45	return $UGobj;
273						}
274
275						1;
276