File Coverage

Bio/Restriction/Enzyme.pm

Criterion	Covered	Total	%
statement	297	363	81.8
branch	152	224	67.8
condition	36	62	58.0
subroutine	47	53	88.6
pod	36	42	85.7
total	568	744	76.3

line	stmt	bran	cond	sub	pod	time	code
1							#------------------------------------------------------------------
2							#
3							# BioPerl module Bio::Restriction::Enzyme
4							#
5							# Please direct questions and support issues to
6							#
7							# Cared for by Rob Edwards
8							#
9							# You may distribute this module under the same terms as perl itself
10							#------------------------------------------------------------------
11
12							## POD Documentation:
13
14							=head1 NAME
15
16							Bio::Restriction::Enzyme - A single restriction endonuclease
17							(cuts DNA at specific locations)
18
19							=head1 SYNOPSIS
20
21							# set up a single restriction enzyme. This contains lots of
22							# information about the enzyme that is generally parsed from a
23							# rebase file and can then be read back
24
25							use Bio::Restriction::Enzyme;
26
27							# define a new enzyme with the cut sequence
28							my $re=Bio::Restriction::Enzyme->new
29							(-enzyme=>'EcoRI', -seq=>'G^AATTC');
30
31							# once the sequence has been defined a bunch of stuff is calculated
32							# for you:
33
34							#### PRECALCULATED
35
36							# find where the enzyme cuts after ...
37							my $ca=$re->cut;
38
39							# ... and where it cuts on the opposite strand
40							my $oca = $re->complementary_cut;
41
42							# get the cut sequence string back.
43							# Note that site will return the sequence with a caret
44							my $with_caret=$re->site; #returns 'G^AATTC';
45
46							# but it is also a Bio::PrimarySeq object ....
47							my $without_caret=$re->seq; # returns 'GAATTC';
48							# ... and so does string
49							$without_caret=$re->string; #returns 'GAATTC';
50
51							# what is the reverse complement of the cut site
52							my $rc=$re->revcom; # returns 'GAATTC';
53
54							# now the recognition length. There are two types:
55							# recognition_length() is the length of the sequence
56							# cutter() estimate of cut frequency
57
58							my $recog_length = $re->recognition_length; # returns 6
59							# also returns 6 in this case but would return
60							# 4 for GANNTC and 5 for RGATCY (BstX2I)!
61							$recog_length=$re->cutter;
62
63							# is the sequence a palindrome - the same forwards and backwards
64							my $pal= $re->palindromic; # this is a boolean
65
66							# is the sequence blunt (i.e. no overhang - the forward and reverse
67							# cuts are the same)
68							print "blunt\n" if $re->overhang eq 'blunt';
69
70							# Overhang can have three values: "5'", "3'", "blunt", and undef
71							# Direction is very important if you use Klenow!
72							my $oh=$re->overhang;
73
74							# what is the overhang sequence
75							my $ohseq=$re->overhang_seq; # will return 'AATT';
76
77							# is the sequence ambiguous - does it contain non-GATC bases?
78							my $ambig=$re->is_ambiguous; # this is boolean
79
80							print "Stuff about the enzyme\nCuts after: $ca\n",
81							"Complementary cut: $oca\nSite:\n\t$with_caret or\n",
82							"\t$without_caret\n";
83							print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
84							"Is it palindromic? $pal\n";
85							print "The overhang is $oh with sequence $ohseq\n",
86							"And is it ambiguous? $ambig\n\n";
87
88
89							### THINGS YOU CAN SET, and get from rich REBASE file
90
91							# get or set the isoschizomers (enzymes that recognize the same
92							# site)
93							$re->isoschizomers('PvuII', 'SmaI'); # not really true :)
94							print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
95
96							# get or set the methylation sites
97							$re->methylation_sites(2); # not really true :)
98							print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
99
100							#Get or set the source microbe
101							$re->microbe('E. coli');
102							print "It came from ", $re->microbe, "\n";
103
104							# get or set the person who isolated it
105							$re->source("Rob"); # not really true :)
106							print $re->source, " sent it to us\n";
107
108							# get or set whether it is commercially available and the company
109							# that it can be bought at
110							$re->vendors('NEB'); # my favorite
111							print "Is it commercially available :";
112							print $re->vendors ? "Yes" : "No";
113							print " and it can be got from ", join " ",
114							$re->vendors, "\n";
115
116							# get or set a reference for this
117							$re->reference('Edwards et al. J. Bacteriology');
118							print "It was not published in ", $re->reference, "\n";
119
120							# get or set the enzyme name
121							$re->name('BamHI');
122							print "The name of EcoRI is not really ", $re->name, "\n";
123
124
125							=head1 DESCRIPTION
126
127							This module defines a single restriction endonuclease. You can use it
128							to make custom restriction enzymes, and it is used by
129							Bio::Restriction::IO to define enzymes in the New England Biolabs
130							REBASE collection.
131
132							Use Bio::Restriction::Analysis to figure out which enzymes are available
133							and where they cut your sequence.
134
135
136							=head1 RESTRICTION MODIFICATION SYSTEMS
137
138							At least three geneticaly and biochamically distinct restriction
139							modification systems exist. The cutting components of them are known
140							as restriction endonuleases. The three systems are known by roman
141							numerals: Type I, II, and III restriction enzymes.
142
143							REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
144							categories there do not always match the the following short
145							descriptions of the enzymes types. See
146							http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
147
148
149							=head2 TypeI
150
151							Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
152
153							---TGANnTGCT--- = methylation sites
154							---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
155
156							The cleavage site is roughly 1000 (400-7000) base pairs from the
157							recognition site.
158
159							=head2 TypeII
160
161							The simplest and most common (at least commercially).
162
163							Site recognition is via short palindromic base sequences that are 4-6
164							base pairs long. Cleavage is at the recognition site (but may
165							occasionally be just adjacent to the palindromic sequence, usually
166							within) and may produce blunt end termini or staggered, "sticky
167							end" termini.
168
169							=head2 TypeIII
170
171							The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
172							ATP dependent 24-26 base pairs downstream from the recognition site
173							and usually yields staggered cuts 2-4 bases apart.
174
175
176							=head1 COMMENTS
177
178							I am trying to make this backwards compatible with
179							Bio::Tools::RestrictionEnzyme. Undoubtedly some things will break,
180							but we can fix things as we progress.....!
181
182							I have added another comments section at the end of this POD that
183							discusses a couple of areas I know are broken (at the moment)
184
185
186							=head1 TO DO
187
188							=over 2
189
190							=item *
191
192							Convert vendors touse full names of companies instead of code
193
194							=item *
195
196							Add regular expression based matching to vendors
197
198							=item *
199
200							Move away from the archaic ^ notation for cut sites. Ideally
201							I'd totally like to remove this altogether, or add a method
202							that adds it in if someone really wants it. We should be
203							fixed on a sequence, number notation.
204
205							=back
206
207							=head1 FEEDBACK
208
209							=head2 Mailing Lists
210
211							User feedback is an integral part of the evolution of this and other
212							Bioperl modules. Send your comments and suggestions preferably to one
213							of the Bioperl mailing lists. Your participation is much appreciated.
214
215							bioperl-l@bioperl.org - General discussion
216							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
217
218							=head2 Support
219
220							Please direct usage questions or support issues to the mailing list:
221
222							I
223
224							rather than to the module maintainer directly. Many experienced and
225							reponsive experts will be able look at the problem and quickly
226							address it. Please include a thorough description of the problem
227							with code and data examples if at all possible.
228
229							=head2 Reporting Bugs
230
231							Report bugs to the Bioperl bug tracking system to help us keep track
232							the bugs and their resolution. Bug reports can be submitted via the
233							web:
234
235							https://github.com/bioperl/bioperl-live/issues
236
237							=head1 AUTHOR
238
239							Rob Edwards, redwards@utmem.edu
240
241							=head1 CONTRIBUTORS
242
243							Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
244							Peter Blaiklock, pblaiklo@restrictionmapper.org
245							Mark A. Jensen, maj-at-fortinbras-dot-us
246
247							=head1 COPYRIGHT
248
249							Copyright (c) 2003 Rob Edwards.
250
251							Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
252							Rights Reserved. This module is free software; you can redistribute
253							it and/or modify it under the same terms as Perl itself.
254
255							=head1 SEE ALSO
256
257							L,
258							L, L
259
260							=head1 APPENDIX
261
262							Methods beginning with a leading underscore are considered private and
263							are intended for internal use by this module. They are not considered
264							part of the public interface and are described here for documentation
265							purposes only.
266
267							=cut
268
269							package Bio::Restriction::Enzyme;
270	4			4		568	use strict;
	4					3
	4					94
271
272	4			4		624	use Bio::PrimarySeq;
	4					5
	4					86
273
274	4			4		16	use Data::Dumper;
	4					10
	4					183
275	4			4		1691	use Tie::RefHash;
	4					9232
	4					93
276	4			4		19	use vars qw (%TYPE);
	4					4
	4					122
277	4			4		14	use base qw(Bio::Root::Root Bio::Restriction::EnzymeI);
	4					5
	4					1787
278
279							BEGIN {
280	4			4		6345	my %TYPE = (I => 1, II => 1, III => 1);
281							}
282
283							=head2 new
284
285							Title : new
286							Function
287							Function : Initializes the Enzyme object
288							Returns : The Restriction::Enzyme object
289							Argument : A standard definition can have several formats. For example:
290							$re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
291							Or, you can define the cut site in the sequence, for example
292							$re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
293							Or, a sequence can cut outside the recognition site, for example
294							$re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
295
296							Other arguments:
297							-isoschizomers=>\@list a reference to an array of
298							known isoschizomers
299							-references=>$ref a reference to the enzyme
300							-source=>$source the source (person) of the enzyme
301							-commercial_availability=>@companies a list of companies
302							that supply the enzyme
303							-methylation_site=>\%sites a reference to hash that has
304							the position as the key and the type of methylation
305							as the value
306							-xln_sub => sub { ($self,$cut) = @_; ...; return $xln_cut },
307							a coderef to a routine that translates the input cut value
308							into Bio::Restriction::Enzyme coordinates
309							( e.g., for withrefm format, this might be
310							-xln_sub => sub { length( shift()->string ) + shift } )
311
312							A Restriction::Enzyme object manages its recognition sequence as a
313							Bio::PrimarySeq object.
314
315							The minimum requirement is for a name and a sequence.
316
317							This will create the restriction enzyme object, and define several
318							things about the sequence, such as palindromic, size, etc.
319
320							=cut
321
322							# do all cut/comp cut setting within the constructor
323							# new args
324
325							sub new {
326	10163			10163	1	25021	my($class, @args) = @_;
327	10163					20528	my $self = $class->SUPER::new(@args);
328
329	10163					39650	my ($name,$enzyme,$site,$seq,$precut, $postcut,$cut,$complementary_cut, $is_prototype, $prototype,
330							$isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo, $recog, $xln_sub) =
331							$self->_rearrange([qw(
332							NAME
333							ENZYME
334							SITE
335							SEQ
336							PRECUT
337							POSTCUT
338							CUT
339							COMPLEMENTARY_CUT
340							IS_PROTOTYPE
341							PROTOTYPE
342							ISOSCHIZOMERS
343							METHYLATION_SITES
344							MICROBE
345							SOURCE
346							VENDORS
347							REFERENCES
348							IS_NEOSCHIZOMER
349							RECOG
350							XLN_SUB
351							)], @args);
352
353	10163	0	66			53968	$self->throw('At the minimum, you must define a name and '.
			33
			33
354							'recognition site for the restriction enzyme')
355							unless (($name \|\| $enzyme) && ($site \|\| $recog \|\| $seq));
356
357	10163					11706	$self->{_isoschizomers} = [];
358	10163					9691	$self->{_methylation_sites} = {};
359	10163					9147	$self->{_vendors} = [];
360	10163					9941	$self->{_references} = [];
361
362							# squelch warnings
363	10163		100			23299	$postcut \|\|='';
364
365							# enzyme name
366	10163	100				12199	$enzyme && $self->name($enzyme);
367	10163	100				19387	$name && $self->name($name);
368
369							# site
370							#
371							# note that the site() setter will automatically set
372							# cut(), complementary_cut(), if the cut site is indicated
373							# in $site with '^' /maj
374
375							# create the cut site if appropriate/this is a kludge due to
376							# the base.pm format in the new B:R order...
377	10163	100	100			21747	if ( $cut and $cut <= length $site) {
378	2531					4448	$site = substr($site, 0, $cut).'^'.substr($site, $cut);
379							}
380
381	10163	50				12212	if ($site) {
382	10163					12633	$self->site($site);
383							}
384							else {
385	0	0				0	$seq && $self->site($seq);
386							}
387
388	10162	100				11324	if ($recog) {
389	7495					9651	$self->recog($recog);
390							}
391							else {
392	2667	50				3302	$seq && $self->recog($seq);
393	2667	50				4642	$site && $self->recog($site);
394							}
395							# call revcom_site to initialize it and revcom_recog:
396	10162					11843	$self->revcom_site();
397
398	10162					17245	$recog = $self->string; # for length calculations below
399
400	10162	100				15481	if ($xln_sub) {
401	7494	50				13441	$self->warn("Translation subroutine is not a coderef; ignoring") unless
402							ref($xln_sub) eq 'CODE';
403							}
404
405							# cut coordinates
406	10162					11272	my ($pc_cut, $pc_comp_cut) = ( $postcut =~ /(-?\d+)\/(-?\d+)/ );
407
408							# cut definitions in constructor override any autoset in
409							# site()
410							# definitions in site conform to withrefm coords, translation
411							# happens here
412
413	10162	100				16176	if (defined $cut) {
		100
414	2664	50				4511	$self->cut( $xln_sub ? $xln_sub->($self, $cut) : $cut );
415							}
416							elsif ( defined $pc_cut ) {
417	457	100				1088	$self->cut( $xln_sub ? $xln_sub->($self, $pc_cut) : $pc_cut );
418							}
419
420	10162	100				16503	if (defined $complementary_cut) {
		100
421	4	50				10	$self->complementary_cut($xln_sub ? $xln_sub->($self,$complementary_cut) : $complementary_cut);
422							}
423							elsif (defined $pc_comp_cut) {
424	457	100				884	$self->complementary_cut($xln_sub ? $xln_sub->($self,$pc_comp_cut) : $pc_comp_cut);
425							}
426
427	10162	100				18619	$is_prototype && $self->is_prototype($is_prototype);
428	10162	100				14366	$prototype && $self->prototype($prototype);
429	10162	100				17423	$isoschizomers && $self->isoschizomers($isoschizomers);
430	10162	50				12455	$meth && $self->methylation_sites($meth);
431	10162	50				10908	$microbe && $self->microbe($microbe);
432	10162	100				16515	$source && $self->source($source);
433	10162	100				14740	$vendors && $self->vendors($vendors);
434	10162	100				15532	$references && $self->references($references);
435	10162	50				13116	$neo && $self->is_neoschizomer($neo);
436
437							# create multicut enzymes here if $precut defined
438	10162	100				12731	if (defined $precut) {
439	49					114	bless $self, 'Bio::Restriction::Enzyme::MultiCut';
440	49					179	my ($pc_cut, $pc_comp_cut) = $precut =~ /(-?\d+)\/(-?\d+)/;
441	49					157	my $re2 = $self->clone;
442	49	50				194	$re2->cut($xln_sub ? $xln_sub->($self, -$pc_cut) : -$pc_cut);
443	49	50				162	$re2->complementary_cut($xln_sub ? $xln_sub->($self, -$pc_comp_cut) : -$pc_comp_cut);
444	49					146	$self->others($re2);
445							}
446
447	10162					26934	return $self;
448							}
449
450							=head1 Essential methods
451
452							=cut
453
454							=head2 name
455
456							Title : name
457							Usage : $re->name($newval)
458							Function : Gets/Sets the restriction enzyme name
459							Example : $re->name('EcoRI')
460							Returns : value of name
461							Args : newvalue (optional)
462
463							This will also clean up the name. I have added this because some
464							people get confused about restriction enzyme names. The name should
465							be One upper case letter, and two lower case letters (because it is
466							derived from the organism name, eg. EcoRI is from E. coli). After
467							that it is all confused, but the numbers should be roman numbers not
468							numbers, therefore we'll correct those. At least this will provide
469							some standard, I hope.
470
471							=cut
472
473							sub name{
474	70047			70047	1	50639	my ($self, $name)=@_;
475
476	70047	100				76093	if ($name) { # correct and set the name
477	10164					7542	my $old_name = $name;
478
479							# remove spaces. Some people write HindIII as Hind III
480	10164					14149	$name =~ s/\s+//g;
481							# change TAILING ones to I's
482	10164	50				18036	if ($name =~ m/(1+)$/) {
483	0					0	my $i = 'I' x length($1);
484	0					0	$name =~ s/1+$/$i/;
485							}
486
487							# make the first letter upper case
488	10164					25271	$name =~ s/^(\w)/uc($1)/e;
	10164					20997
489
490	10164	50				15642	unless ($name eq $old_name) {
491							# we have changed the name, so send a warning
492	0					0	$self->warn("The enzyme name $old_name was changed to $name");
493							}
494	10164					14069	$self->{'_name'} = $name;
495							}
496	70047					140318	return $self->{'_name'};
497							}
498
499
500							=head2 site
501
502							Title : site
503							Usage : $re->site();
504							Function : Gets/sets the recognition sequence for the enzyme.
505							Example : $seq_string = $re->site();
506							Returns : String containing recognition sequence indicating
507							: cleavage site as in 'G^AATTC'.
508							Argument : n/a
509							Throws : n/a
510
511
512							Side effect: the sequence is always converted to upper case.
513
514							The cut site can also be set by using methods L and
515							L.
516
517							This will pad out missing sequence with N's. For example the enzyme
518							Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
519
520							Note that the common notation ACCTGC(4/8) means that the forward
521							strand cut is four nucleotides after the END of the recognition
522							site. The forward cut() in the coordinates used here in Acc36I
523							ACCTGC(4/8) is at 6+4 i.e. 10.
524
525							** This is the main setable method for the recognition site.
526
527							=cut
528
529							sub site {
530	10169			10169	1	9446	my ( $self, $site ) = @_;
531
532	10169	100				13133	if ($site) {
533
534	10163	100				17119	$self->throw("Unrecognized characters in site: [$site]")
535							if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
536
537							# we may have to redefine this if there is a ^ in the sequence
538
539							# first, check and see if we have a cut site in the sequence
540							# if so, find the position, and set the target sequence and cut site
541
542	10162					10630	$self->{'_site'} = $site;
543
544	10162					18936	my ( $first, $second ) = $site =~ /(.)\^(.)/;
545	10162	100				18436	$site = "$1$2" if defined $first;
546	10162					8224	$self->{'_site'} = $site;
547
548							# now set the recognition site as a new Bio::PrimarySeq object
549							# we need it before calling cut() and complementary_cut()
550	10162					13289	$self->{_seq} = Bio::PrimarySeq->new(
551							-id => $self->name,
552							-seq => $site,
553							-verbose => $self->verbose,
554							-alphabet => 'dna'
555							);
556
557	10162	100				18755	if ( defined $first ) {
558	4945					7204	$self->cut( length $first );
559	4945					6335	$self->complementary_cut( length $second );
560	4945					6221	$self->revcom_site();
561							}
562							}
563	10168					12039	return $self->{'_site'};
564							}
565
566							=head2 revcom_site
567
568							Title : revcom_site
569							Usage : $re->revcom_site();
570							Function : Gets/sets the complementary recognition sequence for the enzyme.
571							Example : $seq_string = $re->revcom_site();
572							Returns : String containing recognition sequence indicating
573							: cleavage site as in 'G^AATTC'.
574							Argument : none (sets on first call)
575							Throws : n/a
576
577							This is the same as site, except it returns the revcom site. For
578							palindromic enzymes these two are identical. For non-palindromic
579							enzymes they are not!
580
581							On set, this also handles setting the revcom_recog attribute.
582
583							See also L above.
584
585							=cut
586
587							sub revcom_site {
588	15112			15112	1	10991	my $self = shift;
589							# getter
590	15112	100				20823	return $self->{'_revcom_site'} unless !$self->{'_revcom_site'};
591
592							# setter
593	10162					8153	my $site = $self->{'_site'};
594	10162	100				12697	if ($self->is_palindromic) {
595	9266					9632	$self->{'_revcom_site'}=$self->{'_site'};
596	9266					11406	$self->revcom_recog( $self->string );
597	9266					10580	return $self->{'_revcom_site'};
598							}
599
600	896	50				1732	$self->throw("Unrecognized characters in revcom site: [$site]")
601							if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
602
603	896	100				1335	if ($site =~ /\^/) {
604							# first, check and see if we have a cut site indicated in the sequence
605							# if so, find the position, and set the target sequence and cut site
606	16					31	$site = $self->revcom;
607	16					36	$self->revcom_recog( $site );
608	16					41	my $c = length($site)-$self->cut;
609	16					53	$site = substr($site, 0, $c).'^'.substr($site,$c);
610	16					28	$self->{'_revcom_site'} = $site;
611							}
612							else {
613	880					1046	my $revcom=$self->revcom;
614	880					1641	$self->revcom_recog( $revcom );
615							# my $cc=$self->complementary_cut;
616							# my $hat=length($revcom)-$cc+1; # we need it on the other strand!
617							# if ($cc > length($revcom)) {
618							# my $pad= "N" x ($cc-length($revcom));
619							# $revcom = $pad. $revcom;
620							# $hat=length($revcom)-$cc+1;
621							# }
622							# elsif ($cc < 0) {
623							# my $pad = "N" x -$cc;
624							# $revcom .= $pad;
625							# $hat=length($revcom);
626							# }
627							# $revcom =~ s/(.{$hat})/$1\^/;
628	880					967	$self->{'_revcom_site'}=$revcom;
629							}
630	896					833	return $self->{'_revcom_site'};
631							}
632
633							=head2 cut
634
635							Title : cut
636							Usage : $num = $re->cut(1);
637							Function : Sets/gets an integer indicating the position of cleavage
638							relative to the 5' end of the recognition sequence in the
639							forward strand.
640
641							For type II enzymes, sets the symmetrically positioned
642							reverse strand cut site by calling complementary_cut().
643
644							Returns : Integer, 0 if not set
645							Argument : an integer for the forward strand cut site (optional)
646
647							Note that the common notation ACCTGC(4/8) means that the forward
648							strand cut is four nucleotides after the END of the recognition
649							site. The forwad cut in the coordinates used here in Acc36I
650							ACCTGC(4/8) is at 6+4 i.e. 10.
651
652							Note that REBASE uses notation where cuts within symmetic sites are
653							marked by '^' within the forward sequence but if the site is
654							asymmetric the parenthesis syntax is used where numbering ALWAYS
655							starts from last nucleotide in the forward strand. That's why AciI has
656							a site usually written as CCGC(-3/-1) actualy cuts in
657
658							C^C G C
659							G G C^G
660
661							In our notation, these locations are 1 and 3.
662
663
664							The cuts locations in the notation used are relative to the first
665							(non-N) nucleotide of the reported forward strand of the recognition
666							sequence. The following diagram numbers the phosphodiester bonds
667							(marked by + ) which can be cut by the restriction enzymes:
668
669							1 2 3 4 5 6 7 8 ...
670							N + N + N + N + N + G + A + C + T + G + G + N + N + N
671							... -5 -4 -3 -2 -1
672
673
674							=cut
675
676							sub cut {
677	24294			24294	1	18639	my ($self, $value) = @_;
678	24294	100				29480	if (defined $value) {
679	8117	50				18315	$self->throw("The cut position needs to be an integer [$value]")
680							unless $value =~ /[-+]?\d+/;
681	8117					8570	$self->{'_cut'} = $value;
682
683							# add the caret to the site attribute only if internal /maj
684	8117	100	100			23965	if ( ($self->{_site} !~ /\^/) && ($value <= length ($self->{_site}))) {
685							$self->{_site} =
686	5174					10030	substr($self->{_site}, 0, $value). '^'. substr($self->{_site}, $value);
687							}
688
689							# auto-set comp cut only if cut site is inside the recog site./maj
690							$self->complementary_cut(length ($self->seq->seq) - $value )
691	8117	100	100			19689	if (($self->{_site} =~ /\^/) && ($self->type eq 'II'));
692
693							}
694							# return undef if not defined yet, not 0 /maj
695	24294					29077	return $self->{'_cut'};
696							}
697
698							=head2 cuts_after
699
700							Title : cuts_after
701							Usage : Alias for cut()
702
703							=cut
704
705							sub cuts_after {
706	0			0	1	0	shift->cut(@_);
707							}
708
709
710							=head2 complementary_cut
711
712							Title : complementary_cut
713							Usage : $num = $re->complementary_cut('1');
714							Function : Sets/Gets an integer indicating the position of cleavage
715							: on the reverse strand of the restriction site.
716							Returns : Integer
717							Argument : An integer (optional)
718							Throws : Exception if argument is non-numeric.
719
720							This method determines the cut on the reverse strand of the sequence.
721							For most enzymes this will be within the sequence, and will be set
722							automatically based on the forward strand cut, but it need not be.
723
724							B that the returned location indicates the location AFTER the
725							first non-N site nucleotide in the FORWARD strand.
726
727							=cut
728
729							sub complementary_cut {
730	16431			16431	1	11815	my ($self, $num)=@_;
731
732	16431	100				19969	if (defined $num) {
733	13152	50				27538	$self->throw("The cut position needs to be an integer [$num]")
734							unless $num =~ /[-+]?\d+/;
735	13152					12929	$self->{'_rc_cut'} = $num;
736							}
737							# return undef, not 0, if not yet defined /maj
738	16431					14801	return $self->{'_rc_cut'};
739							}
740
741
742							=head1 Read only (usually) recognition site descriptive methods
743
744							=cut
745
746							=head2 type
747
748							Title : type
749							Usage : $re->type();
750							Function : Get/set the restriction system type
751							Returns :
752							Argument : optional type: ('I'\|II\|III)
753
754							Restriction enzymes have been catezorized into three types. Some
755							REBASE formats give the type, but the following rules can be used to
756							classify the known enzymes:
757
758							=over 4
759
760							=item 1
761
762							Bipartite site (with 6-8 Ns in the middle and the cut site
763							is E 50 nt away) =E type I
764
765							=item 2
766
767							Site length E 3 =E type I
768
769							=item 3
770
771							5-6 asymmetric site and cuts E20 nt away =E type III
772
773							=item 4
774
775							All other =E type II
776
777							=back
778
779							There are some enzymes in REBASE which have bipartite recognition site
780							and cat far from the site but are still classified as type I. I've no
781							idea if this is really so.
782
783							=cut
784
785							sub type {
786	7707			7707	1	6246	my ($self, $value) = @_;
787
788	7707	50				9397	if ($value) {
789							$self->throw("Not a valid value [$value], needs to one of : ".
790							join (', ', sort keys %TYPE) )
791	0	0				0	unless $TYPE{$value};
792	0					0	return $self->{'_type'} = $value;
793							}
794
795							# pre set
796							#return $self->{'_type'} if $self->{'_type'};
797							# bipartite
798							return $self->{'_type'} = 'I'
799	7707	50	66			11122	if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
800							# 3 nt site
801							return $self->{'_type'} = 'I'
802	7707	100				12355	if $self->{'_seq'}->length == 3;
803							# asymmetric and cuts > 20 nt
804	7697	50	100			8906	return $self->{'_type'} = 'III'
			100
			66
805							if (length $self->string == 5 or length $self->string == 6 ) and
806							not $self->palindromic and abs($self->cut) > 20;
807	7697					22371	return $self->{'_type'} = 'II';
808							}
809
810							=head2 seq
811
812							Title : seq
813							Usage : $re->seq();
814							Function : Get the Bio::PrimarySeq.pm object representing
815							: the recognition sequence
816							Returns : A Bio::PrimarySeq object representing the
817							enzyme recognition site
818							Argument : n/a
819							Throws : n/a
820
821
822							=cut
823
824							sub seq {
825	7854			7854	1	12136	shift->{'_seq'};
826							}
827
828							=head2 string
829
830							Title : string
831							Usage : $re->string();
832							Function : Get a string representing the recognition sequence.
833							Returns : String. Does NOT contain a '^' representing the cut location
834							as returned by the site() method.
835							Argument : n/a
836							Throws : n/a
837
838							=cut
839
840							sub string {
841	52617			52617	1	73954	shift->{'_seq'}->seq;
842							}
843
844							=head2 recog
845
846							Title : recog
847							Usage : $enz->recog($recognition_sequence)
848							Function: Gets/sets the pure recognition site. Sets as
849							regexp if appropriate.
850							As for string(), the cut indicating carets (^)
851							are expunged.
852							Example :
853							Returns : value of recog (a scalar)
854							Args : on set, new value (a scalar or undef, optional)
855
856							=cut
857
858							sub recog{
859	25643			25643	1	18561	my $self = shift;
860	25643					16527	my $recog = shift;
861	25643	100				47970	return $self->{'recog'} unless $recog;
862	10162					14206	$recog =~ s/\^//g;
863	10162	100				20744	$recog = _expand($recog) if $recog =~ /[^ATGC]/;
864	10162					12480	return $self->{'recog'} = $recog;
865							}
866
867							=head2 revcom_recog
868
869							Title : revcom_recog
870							Usage : $enz->revcom_recog($recognition_sequence)
871							Function: Gets/sets the pure reverse-complemented recognition site.
872							Sets as regexp if appropriate.
873							As for string(), the cut indicating carets (^) are expunged.
874							Example :
875							Returns : value of recog (a scalar)
876							Args : on set, new value (a scalar or undef, optional)
877
878							=cut
879
880							sub revcom_recog{
881	12969			12969	1	10919	my $self = shift;
882	12969					9229	my $recog = shift;
883	12969	100				17131	unless ($recog) {
884	2807	50				3643	$self->throw( "revcom recognition site not set; call \$enz->revcom_site to initialize" ) unless $self->{'revcom_recog'};
885	2807					3539	return $self->{'revcom_recog'};
886							}
887	10162					10705	$recog =~ s/\^//g;
888	10162	100				22334	$recog = _expand($recog) if $recog =~ /[^ATGC]/;
889	10162					12511	return $self->{'revcom_recog'} = $recog;
890							}
891
892							=head2 revcom
893
894							Title : revcom
895							Usage : $re->revcom();
896							Function : Get a string representing the reverse complement of
897							: the recognition sequence.
898							Returns : String
899							Argument : n/a
900							Throws : n/a
901
902							=cut
903
904							sub revcom {
905	11059			11059	1	20741	shift->{'_seq'}->revcom->seq();
906							}
907
908							=head2 recognition_length
909
910							Title : recognition_length
911							Usage : $re->recognition_length();
912							Function : Get the length of the RECOGNITION sequence.
913							This is the total recognition sequence,
914							inluding the ambiguous codes.
915							Returns : An integer
916							Argument : Nothing
917
918							See also: L
919
920							=cut
921
922							sub recognition_length {
923	1			1	1	2	my $self = shift;
924	1					2	return length($self->string);
925							}
926
927							=head2 cutter
928
929							Title : cutter
930							Usage : $re->cutter
931							Function : Returns the "cutter" value of the recognition site.
932
933							This is a value relative to site length and lack of
934							ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
935							and 'CCTNAGG' a six cutter
936
937							This measure correlates to the frequency of the enzyme
938							cuts much better than plain recognition site length.
939
940							Example : $re->cutter
941							Returns : integer or float number
942							Args : none
943
944							Why is this better than just stripping the ambiguos codes? Think about
945							it like this: You have a random sequence; all nucleotides are equally
946							probable. You have a four nucleotide re site. The probability of that
947							site finding a match is one out of 4^4 or 256, meaning that on average
948							a four cutter finds a match every 256 nucleotides. For a six cutter,
949							the average fragment length is 4^6 or 4096. In the case of ambiguity
950							codes the chances are finding the match are better: an R (A\|T) has 1/2
951							chance of finding a match in a random sequence. Therefore, for RGCGCY
952							the probability is one out of (24444*2) which exactly the same as
953							for a five cutter! Cutter, although it can have non-integer values
954							turns out to be a useful and simple measure.
955
956							From bug 2178: VHDB are ambiguity symbols that match three different
957							nucleotides, so they contribute less to the effective recognition sequence
958							length than e.g. Y which matches only two nucleotides. A symbol which matches n
959							of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
960
961							=cut
962
963							sub cutter {
964	3532			3532	1	2200	my ($self)=@_;
965	3532					2794	$_ = uc $self->string;
966
967	3532					2864	my $cutter = tr/[ATGC]//d;
968	3532					2273	my $count = tr/[MRWSYK]//d;
969	3532					2513	$cutter += $count/2;
970	3532					2306	$count = tr/[VHDB]//d;
971	3532					2434	$cutter += $count * (1 - log(3) / log(4));
972	3532					6773	return $cutter;
973							}
974
975
976							=head2 is_palindromic
977
978							Title : is_palindromic
979							Alias : palindromic
980							Usage : $re->is_palindromic();
981							Function : Determines if the recognition sequence is palindromic
982							: for the current restriction enzyme.
983							Returns : Boolean
984							Argument : n/a
985							Throws : n/a
986
987							A palindromic site (EcoRI):
988
989							5-GAATTC-3
990							3-CTTAAG-5
991
992							=cut
993
994							sub is_palindromic {
995	36823			36823	1	24803	my $self = shift;
996	36823	100				79256	return $self->{_palindromic} if defined $self->{_palindromic};
997	10162	100				11391	if ($self->string eq $self->revcom) {
998	9266					23356	return $self->{_palindromic}=1;
999							}
1000	896					2119	return $self->{_palindromic} = 0;
1001							}
1002
1003	5824			5824	0	8061	sub palindromic { shift->is_palindromic(@_) }
1004
1005							=head2 is_symmetric
1006
1007							Title : is_symmetric
1008							Alias : symmetric
1009							Usage : $re->is_symmetric();
1010							Function : Determines if the enzyme is a symmetric cutter
1011							Returns : Boolean
1012							Argument : none
1013
1014							A symmetric but non-palindromic site (HindI):
1015							v
1016							5-C A C-3
1017							3-G T G-5
1018							^
1019
1020							=cut
1021
1022							sub is_symmetric {
1023	4			4		20	no warnings qw( uninitialized );
	4					4
	4					5466
1024	9641			9641	1	6356	my $self = shift;
1025
1026	9641	100				18115	return $self->{_symmetric} if defined $self->{_symmetric};
1027	5355	100				4876	if ($self->is_palindromic) {
1028	4892					11997	return $self->{_symmetric} = 1;
1029							}
1030	463	100				493	if ($self->cut == length($self->string) - $self->complementary_cut) {
1031	27					82	return $self->{_symmetric}=1;
1032							}
1033	436					953	return $self->{_symmetric} = 0;
1034							}
1035
1036
1037	0			0	0	0	sub symmetric { shift->is_symmetric(@_) }
1038
1039							=head2 overhang
1040
1041							Title : overhang
1042							Usage : $re->overhang();
1043							Function : Determines the overhang of the restriction enzyme
1044							Returns : "5'", "3'", "blunt" of undef
1045							Argument : n/a
1046							Throws : n/a
1047
1048							A blunt site in SmaI returns C
1049
1050							5' C C C^G G G 3'
1051							3' G G G^C C C 5'
1052
1053							A 5' overhang in EcoRI returns C<5'>
1054
1055							5' G^A A T T C 3'
1056							3' C T T A A^G 5'
1057
1058							A 3' overhang in KpnI returns C<3'>
1059
1060							5' G G T A C^C 3'
1061							3' C^C A T G G 5'
1062
1063							=cut
1064
1065							sub overhang {
1066	545			545	1	331	my $self = shift;
1067	545	100	66			1241	unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
1068	32					41	return "unknown";
1069							}
1070	513	100				660	if ($self->{_cut} < $self->{_rc_cut}) {
		100
		50
1071	287					434	$self->{_overhang}="5'";
1072							} elsif ($self->{_cut} == $self->{_rc_cut}) {
1073	116					188	$self->{_overhang}="blunt";
1074							} elsif ($self->{_cut} > $self->{_rc_cut}) {
1075	110					165	$self->{_overhang}="3'";
1076							} else {
1077	0					0	$self->{_overhang}="unknown";
1078							}
1079							return $self->{_overhang}
1080	513					752	}
1081
1082							=head2 overhang_seq
1083
1084							Title : overhang_seq
1085							Usage : $re->overhang_seq();
1086							Function : Determines the overhang sequence of the restriction enzyme
1087							Returns : a Bio::LocatableSeq
1088							Argument : n/a
1089							Throws : n/a
1090
1091							I do not think it is necessary to create a seq object of these. (Heikki)
1092
1093							Note: returns empty string for blunt sequences and undef for ones that
1094							we don't know. Compare these:
1095
1096							A blunt site in SmaI returns empty string
1097
1098							5' C C C^G G G 3'
1099							3' G G G^C C C 5'
1100
1101							A 5' overhang in EcoRI returns C
1102
1103							5' G^A A T T C 3'
1104							3' C T T A A^G 5'
1105
1106							A 3' overhang in KpnI returns C
1107
1108							5' G G T A C^C 3'
1109							3' C^C A T G G 5'
1110
1111							Note that you need to use method L to decide
1112							whether it is a 5' or 3' overhang!!!
1113
1114							Note: The overhang stuff does not work if the site is asymmetric! Rethink!
1115
1116							=cut
1117
1118							sub overhang_seq {
1119	5			5	1	6	my $self = shift;
1120
1121							# my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
1122							# -verbose=>$self->verbose,
1123							# -alphabet=>'dna');
1124
1125	5	50				6	return '' if $self->overhang eq 'blunt' ;
1126
1127	5	50	33			16	unless ($self->{_cut} && $self->{_rc_cut}) {
1128							# lets just check that we really can't figure it out
1129	0					0	$self->cut;
1130	0					0	$self->complementary_cut;
1131	0	0	0			0	unless ($self->{_cut} && $self->{_rc_cut}) {
1132	0					0	return;
1133							}
1134							}
1135
1136							# this is throwing an error for sequences outside the restriction
1137							# site (eg ^NNNNGATCNNNN^)
1138							# So if this is the case we need to fake these guys
1139	5	50	33			20	if (($self->{_cut}<0) \|\|
			33
			33
1140							($self->{_rc_cut}<0) \|\|
1141							($self->{_cut}>$self->seq->length) \|\|
1142							($self->{_rc_cut}>$self->seq->length)) {
1143	0					0	my $tempseq=$self->site;
1144	0					0	my ($five, $three)=split /\^/, $tempseq;
1145	0	0				0	if ($self->{_cut} > $self->{_rc_cut}) {
		0
1146							return substr($five, $self->{_rc_cut})
1147	0					0	} elsif ($self->{_cut} < $self->{_rc_cut}) {
1148							return substr($three, 0, $self->{_rc_cut})
1149	0					0	} else {
1150	0					0	return '';
1151							}
1152							}
1153
1154	5	50				12	if ($self->{_cut} > $self->{_rc_cut}) {
		50
1155	0					0	return $self->seq->subseq($self->{_rc_cut}+1,$self->{_cut});
1156							} elsif ($self->{_cut} < $self->{_rc_cut}) {
1157	5					6	return $self->seq->subseq($self->{_cut}+1, $self->{_rc_cut});
1158							} else {
1159	0					0	return '';
1160							}
1161							}
1162
1163
1164
1165							=head2 compatible_ends
1166
1167							Title : compatible_ends
1168							Usage : $re->compatible_ends($re2);
1169							Function : Determines if the two restriction enzyme cut sites
1170							have compatible ends.
1171							Returns : 0 if not, 1 if only one pair ends match, 2 if both ends.
1172							Argument : a Bio::Restriction::Enzyme
1173							Throws : unless the argument is a Bio::Resriction::Enzyme and
1174							if there are Ns in the ovarhangs
1175
1176							In case of type II enzymes which which cut symmetrically, this
1177							function can be considered to return a boolean value.
1178
1179
1180							=cut
1181
1182							sub compatible_ends {
1183	1			1	1	2	my ($self, $re) = @_;
1184
1185	1	50				5	$self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1186							unless $re->isa('Bio::Restriction::Enzyme');
1187
1188							# $self->throw("Only type II enzymes work now")
1189							# unless $self->type eq 'II';
1190
1191	1	50	33			3	$self->debug("N(s) in overhangs. Can not compare")
1192							if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1193
1194	1	50	33			2	return 2 if $self->overhang_seq eq $re->overhang_seq and
1195							$self->overhang eq $re->overhang;
1196
1197	0					0	return 0;
1198							}
1199
1200							=head2 is_ambiguous
1201
1202							Title : is_ambiguous
1203							Usage : $re->is_ambiguous();
1204							Function : Determines if the restriction enzyme contains ambiguous sequences
1205							Returns : Boolean
1206							Argument : n/a
1207							Throws : n/a
1208
1209							=cut
1210
1211							sub is_ambiguous {
1212	1			1	1	3	my $self = shift;
1213	1	50				2	return $self->string =~ m/[^AGCT]/ ? 1 : 0 ;
1214							}
1215
1216							=head2 Additional methods from Rebase
1217
1218							=cut
1219
1220							=head2 is_prototype
1221
1222							Title : is_prototype
1223							Usage : $re->is_prototype
1224							Function : Get/Set method for finding out if this enzyme is a prototype
1225							Example : $re->is_prototype(1)
1226							Returns : Boolean
1227							Args : none
1228
1229							Prototype enzymes are the most commonly available and usually first
1230							enzymes discoverd that have the same recognition site. Using only
1231							prototype enzymes in restriction analysis avoids redundancy and
1232							speeds things up.
1233
1234							=cut
1235
1236							sub is_prototype {
1237	7213			7213	1	5317	my ($self, $value) = @_;
1238	7213	100				9293	if (defined $value) {
1239	7210					10816	return $self->{'_is_prototype'} = $value ;
1240							}
1241	3	100				7	if (defined $self->{'_is_prototype'}) {
1242	2					7	return $self->{'_is_prototype'}
1243							} else {
1244	1					7	$self->warn("Can't unequivocally assign prototype based on input format alone");
1245							return
1246	0					0	}
1247							}
1248
1249							=head2 is_neoschizomer
1250
1251							Title : is_neoschizomer
1252							Usage : $re->is_neoschizomer
1253							Function : Get/Set method for finding out if this enzyme is a neoschizomer
1254							Example : $re->is_neoschizomer(1)
1255							Returns : Boolean
1256							Args : none
1257
1258							Neoschizomers are distinguishable from the prototype enzyme by having a
1259							different cleavage pattern. Note that not all formats report this
1260
1261							=cut
1262
1263							sub is_neoschizomer {
1264	0			0	1	0	my ($self, $value) = @_;
1265	0	0				0	if (defined $value) {
1266	0					0	return $self->{'_is_neoschizomer'} = $value ;
1267							}
1268	0	0				0	if (defined $self->{'_is_neoschizomer'}) {
1269	0					0	return $self->{'_is_neoschizomer'}
1270							} else {
1271	0					0	$self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1272							return
1273	0					0	}
1274							}
1275
1276							=head2 prototype_name
1277
1278							Title : prototype_name
1279							Alias : prototype
1280							Usage : $re->prototype_name
1281							Function : Get/Set method for the name of prototype for
1282							this enzyme's recognition site
1283							Example : $re->prototype_name(1)
1284							Returns : prototype enzyme name string or an empty string
1285							Args : optional prototype enzyme name string
1286
1287							If the enzyme itself is the prototype, its own name is returned. Not to
1288							confuse the negative result with an unset value, use method
1289							L.
1290
1291							This method is called I rather than I,
1292							because it returns a string rather than on object.
1293
1294							=cut
1295
1296							sub prototype_name {
1297	12			12	1	12	my $self = shift;
1298
1299	12	100				28	$self->{'_prototype'} = shift if @_;
1300	12	100				29	return $self->name if $self->{'_is_prototype'};
1301	2		50			7	return $self->{'_prototype'} \|\| '';
1302							}
1303
1304	9			9	0	16	sub prototype { shift->prototype_name(@_) }
1305
1306							=head2 isoschizomers
1307
1308							Title : isoschizomers
1309							Alias : isos
1310							Usage : $re->isoschizomers(@list);
1311							Function : Gets/Sets a list of known isoschizomers (enzymes that
1312							recognize the same site, but don't necessarily cut at
1313							the same position).
1314							Arguments : A reference to an array that contains the isoschizomers
1315							Returns : A reference to an array of the known isoschizomers or 0
1316							if not defined.
1317
1318							This has to be the hardest name to spell, so now you can use the alias
1319							'isos'. Added for compatibility to REBASE
1320
1321							=cut
1322
1323							sub isoschizomers {
1324	7456			7456	1	5987	my ($self) = shift;
1325	7456	100				10390	push @{$self->{_isoschizomers}}, @_ if @_;
	7454					11366
1326							# make sure that you don't dereference if null
1327							# chad believes quite strongly that you should return
1328							# a reference to an array anyway. don't bother dereferencing.
1329							# i'll post that to the list.
1330	7456	50				11081	if ($self->{'_isoschizomers'}) {
1331	7456					4649	return @{$self->{_isoschizomers}};
	7456					6976
1332							}
1333
1334							}
1335
1336	0			0	0	0	sub isos { shift->isoschizomers(@_) }
1337
1338							=head2 purge_isoschizomers
1339
1340							Title : purge_isoschizomers
1341							Alias : purge_isos
1342							Usage : $re->purge_isoschizomers();
1343							Function : Purges the set of isoschizomers for this enzyme
1344							Arguments :
1345							Returns : 1
1346
1347							=cut
1348
1349							sub purge_isoschizomers {
1350	1			1	1	3	my ($self) = shift;
1351	1					4	$self->{_isoschizomers} = [];
1352
1353							}
1354
1355	0			0	0	0	sub purge_isos { shift->purge_isoschizomers(@_) }
1356
1357							=head2 methylation_sites
1358
1359							Title : methylation_sites
1360							Usage : $re->methylation_sites(\%sites);
1361							Function : Gets/Sets known methylation sites (positions on the sequence
1362							that get modified to promote or prevent cleavage).
1363							Arguments : A reference to a hash that contains the methylation sites
1364							Returns : A reference to a hash of the methylation sites or
1365							an empty string if not defined.
1366
1367							There are three types of methylation sites:
1368
1369							=over 3
1370
1371							=item * (6) = N6-methyladenosine
1372
1373							=item * (5) = 5-methylcytosine
1374
1375							=item * (4) = N4-methylcytosine
1376
1377							=back
1378
1379							These are stored as 6, 5, and 4 respectively. The hash has the
1380							sequence position as the key and the type of methylation as the value.
1381							A negative number in the sequence position indicates that the DNA is
1382							methylated on the complementary strand.
1383
1384							Note that in REBASE, the methylation positions are given
1385							Added for compatibility to REBASE.
1386
1387							=cut
1388
1389							sub methylation_sites {
1390	782			782	1	623	my $self = shift;
1391
1392	782					1175	while (@_) {
1393	829					584	my $key = shift;
1394	829					2059	$self->{'_methylation_sites'}->{$key} = shift;
1395							}
1396	782					543	return %{$self->{_methylation_sites}};
	782					1091
1397							}
1398
1399
1400							=head2 purge_methylation_sites
1401
1402							Title : purge_methylation_sites
1403							Usage : $re->purge_methylation_sites();
1404							Function : Purges the set of methylation_sites for this enzyme
1405							Arguments :
1406							Returns :
1407
1408							=cut
1409
1410							sub purge_methylation_sites {
1411	23			23	1	27	my ($self) = shift;
1412	23					38	$self->{_methylation_sites} = {};
1413							}
1414
1415							=head2 microbe
1416
1417							Title : microbe
1418							Usage : $re->microbe($microbe);
1419							Function : Gets/Sets microorganism where the restriction enzyme was found
1420							Arguments : A scalar containing the microbes name
1421							Returns : A scalar containing the microbes name or 0 if not defined
1422
1423							Added for compatibility to REBASE
1424
1425							=cut
1426
1427							sub microbe {
1428	2			2	1	3	my ($self, $microbe) = @_;
1429	2	100				5	if ($microbe) {
1430	1					1	$self->{_microbe}=$microbe;
1431							}
1432	2		50			9	return $self->{_microbe} \|\| '';
1433
1434							}
1435
1436
1437							=head2 source
1438
1439							Title : source
1440							Usage : $re->source('Rob Edwards');
1441							Function : Gets/Sets the person who provided the enzyme
1442							Arguments : A scalar containing the persons name
1443							Returns : A scalar containing the persons name or 0 if not defined
1444
1445							Added for compatibility to REBASE
1446
1447							=cut
1448
1449							sub source {
1450	7455			7455	1	5879	my ($self, $source) = @_;
1451	7455	100				8554	if ($source) {
1452	7454					8662	$self->{_source}=$source;
1453							}
1454	7455		50			10966	return $self->{_source} \|\| '';
1455							}
1456
1457
1458							=head2 vendors
1459
1460							Title : vendors
1461							Usage : $re->vendor(@list_of_companies);
1462							Function : Gets/Sets the a list of companies that you can get the enzyme from.
1463							Also sets the commercially_available boolean
1464							Arguments : A reference to an array containing the names of companies
1465							that you can get the enzyme from
1466							Returns : A reference to an array containing the names of companies
1467							that you can get the enzyme from
1468
1469							Added for compatibility to REBASE
1470
1471							=cut
1472
1473							sub vendors {
1474	7472			7472	1	6627	my $self = shift;
1475	7472	100				8888	push @{$self->{_vendors}}, @_ if @_;
	7470					8439
1476	7472	50				10745	if ($self->{'_vendors'}) {
1477	7472					4676	return @{$self->{'_vendors'}};
	7472					8196
1478							}
1479							}
1480
1481
1482							=head2 purge_vendors
1483
1484							Title : purge_vendors
1485							Usage : $re->purge_references();
1486							Function : Purges the set of references for this enzyme
1487							Arguments :
1488							Returns :
1489
1490							=cut
1491
1492							sub purge_vendors {
1493	1			1	1	3	my ($self) = shift;
1494	1					2	$self->{_vendors} = [];
1495
1496							}
1497
1498							=head2 vendor
1499
1500							Title : vendor
1501							Usage : $re->vendor(@list_of_companies);
1502							Function : Gets/Sets the a list of companies that you can get the enzyme from.
1503							Also sets the commercially_available boolean
1504							Arguments : A reference to an array containing the names of companies
1505							that you can get the enzyme from
1506							Returns : A reference to an array containing the names of companies
1507							that you can get the enzyme from
1508
1509							Added for compatibility to REBASE
1510
1511							=cut
1512
1513
1514							sub vendor {
1515	1			1	1	2	my $self = shift;
1516	1					2	return push @{$self->{_vendors}}, @_;
	1					5
1517	0					0	return $self->{_vendors};
1518							}
1519
1520
1521							=head2 references
1522
1523							Title : references
1524							Usage : $re->references(string);
1525							Function : Gets/Sets the references for this enzyme
1526							Arguments : an array of string reference(s) (optional)
1527							Returns : an array of references
1528
1529							Use L to reset the list of references
1530
1531							This should be a L object, but its not (yet)
1532
1533							=cut
1534
1535							sub references {
1536	7472			7472	1	6046	my ($self) = shift;
1537	7472	100				8731	push @{$self->{_references}}, @_ if @_;
	7470					7602
1538	7472					5105	return @{$self->{_references}};
	7472					6764
1539							}
1540
1541
1542							=head2 purge_references
1543
1544							Title : purge_references
1545							Usage : $re->purge_references();
1546							Function : Purges the set of references for this enzyme
1547							Arguments :
1548							Returns : 1
1549
1550							=cut
1551
1552							sub purge_references {
1553	1			1	1	2	my ($self) = shift;
1554	1					2	$self->{_references} = [];
1555
1556							}
1557
1558							=head2 clone
1559
1560							Title : clone
1561							Usage : $re->clone
1562							Function : Deep copy of the object
1563							Arguments : -
1564							Returns : new Bio::Restriction::EnzymeI object
1565
1566							This works as long as the object is a clean in-memory object using
1567							scalars, arrays and hashes. You have been warned.
1568
1569							If you have module Storable, it is used, otherwise local code is used.
1570							Todo: local code cuts circular references.
1571
1572							=cut
1573
1574							# there's some issue here; deprecating and rolling another below/maj
1575
1576							sub clone_depr {
1577	0			0	0	0	my ($self, $this) = @_;
1578
1579	0					0	eval { require Storable; };
	0					0
1580	0	0				0	return Storable::dclone($self) unless $@;
1581							# modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1582	0	0				0	unless ($this) {
1583	0					0	my $new;
1584	0					0	foreach my $k (keys %$self) {
1585	0	0				0	if (not ref $self->{$k}) {
1586	0					0	$new->{$k} = $self->{$k};
1587							} else {
1588	0					0	$new->{$k} = $self->clone($self->{$k});
1589							}
1590							#print Dumper $new;
1591							}
1592	0					0	bless $new, ref($self);
1593	0					0	return $new;
1594							}
1595	0	0				0	if (not ref $this) {
		0
		0
1596	0					0	$this;
1597							}
1598							elsif (ref $this eq "ARRAY") {
1599	0					0	[map $self->clone($_), @$this];
1600							}
1601							elsif (ref $this eq "HASH") {
1602	0					0	+{map { $_ => $self->clone($this->{$_}) } keys %$this};
	0					0
1603							} else { # objects
1604	0	0				0	return if $this->isa('Bio::Restriction::EnzymeI');
1605	0	0				0	return $this->clone if $this->can('clone');
1606	0					0	my $obj;
1607	0					0	foreach my $k (keys %$this) {
1608	0	0				0	if (not ref $this->{$k}) {
1609	0					0	$obj->{$k} = $this->{$k};
1610							} else {
1611	0					0	$obj->{$k} = $this->clone($this->{$k});
1612							}
1613							}
1614	0					0	bless $obj, ref($this);
1615	0					0	return $obj;
1616							}
1617							}
1618
1619							sub clone {
1620	1501			1501	1	951	my $self = shift;
1621	1501					1161	my ($this, $visited) = @_;
1622	1501	100				1715	unless (defined $this) {
1623	52					53	my %h;
1624	52					291	tie %h, 'Tie::RefHash';
1625	52					407	my $visited = \%h;
1626	52					106	return $self->clone($self, $visited);
1627							}
1628	1449					831	my $thing;
1629	1449					1176	for ($this) {
1630	1449	100				1696	if (ref) {
1631	475	100				1183	return $visited->{$this} if $visited->{$this};
1632							}
1633							# scalar
1634	1447	100				4096	(!ref) && do {
1635	974					628	$thing = $this;
1636	974					716	last;
1637							};
1638							# object
1639	473	100				852	(ref =~ /^Bio::/) && do {
1640	108					120	$thing = {};
1641	108					164	bless($thing, ref);
1642	108					304	$visited->{$this} = $thing;
1643	108					646	foreach my $attr (keys %{$_}) {
	108					344
1644	1196	100				1873	$thing->{$attr} = (defined $_->{$attr} ? $self->clone($_->{$attr},$visited) : undef );
1645							}
1646	108					155	last;
1647							};
1648	365	100				513	(ref eq 'ARRAY') && do {
1649	311					267	$thing = [];
1650	311					640	$visited->{$this} = $thing;
1651	311					1426	foreach my $elt (@{$_}) {
	311					372
1652	309	50				534	push @$thing, (defined $elt ? $self->clone($elt,$visited) : undef);
1653							}
1654	311					289	last;
1655							};
1656	54	50				119	(ref eq 'HASH') && do {
1657	54					71	$thing = {};
1658	54					129	$visited->{$this} = $thing;
1659	4			4		22	no warnings qw( uninitialized ); # avoid 'uninitialized value' warning against $key
	4					9
	4					242
1660	54					295	foreach my $key (%{$_}) {
	54					127
1661	0	0				0	$thing->{$key} = (defined $_->{key} ? $self->clone( $_->{$key},$visited) : undef );
1662							}
1663	4			4		16	use warnings;
	4					4
	4					851
1664	54					71	last;
1665							};
1666	0	0				0	(ref eq 'SCALAR') && do {
1667	0					0	$thing = ${$_};
	0					0
1668	0					0	$visited->{$this} = $thing;
1669	0					0	$thing = \$thing;
1670	0					0	last;
1671							};
1672							}
1673
1674	1447					2253	return $thing;
1675							}
1676
1677
1678
1679							=head2 _expand
1680
1681							Title : _expand
1682							Function : Expand nucleotide ambiguity codes to their representative letters
1683							Returns : The full length string
1684							Arguments : The string to be expanded.
1685
1686							Stolen from the original RestrictionEnzyme.pm
1687
1688							=cut
1689
1690
1691							sub _expand {
1692	7774			7774		5816	my $str = shift;
1693
1694	7774					19514	$str =~ s/N\|X/\./g;
1695	7774					7603	$str =~ s/R/\[AG\]/g;
1696	7774					6376	$str =~ s/Y/\[CT\]/g;
1697	7774					5725	$str =~ s/S/\[GC\]/g;
1698	7774					7020	$str =~ s/W/\[AT\]/g;
1699	7774					5266	$str =~ s/M/\[AC\]/g;
1700	7774					5397	$str =~ s/K/\[TG\]/g;
1701	7774					4834	$str =~ s/B/\[CGT\]/g;
1702	7774					5266	$str =~ s/D/\[AGT\]/g;
1703	7774					5021	$str =~ s/H/\[ACT\]/g;
1704	7774					4738	$str =~ s/V/\[ACG\]/g;
1705
1706	7774					8158	return $str;
1707							}
1708
1709							1;
1710