File Coverage

Bio/Restriction/Enzyme.pm
Criterion Covered Total %
statement 297 363 81.8
branch 152 224 67.8
condition 36 62 58.0
subroutine 47 53 88.6
pod 36 42 85.7
total 568 744 76.3


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------
2             #
3             # BioPerl module Bio::Restriction::Enzyme
4             #
5             # Please direct questions and support issues to
6             #
7             # Cared for by Rob Edwards
8             #
9             # You may distribute this module under the same terms as perl itself
10             #------------------------------------------------------------------
11              
12             ## POD Documentation:
13              
14             =head1 NAME
15              
16             Bio::Restriction::Enzyme - A single restriction endonuclease
17             (cuts DNA at specific locations)
18              
19             =head1 SYNOPSIS
20              
21             # set up a single restriction enzyme. This contains lots of
22             # information about the enzyme that is generally parsed from a
23             # rebase file and can then be read back
24              
25             use Bio::Restriction::Enzyme;
26              
27             # define a new enzyme with the cut sequence
28             my $re=Bio::Restriction::Enzyme->new
29             (-enzyme=>'EcoRI', -seq=>'G^AATTC');
30              
31             # once the sequence has been defined a bunch of stuff is calculated
32             # for you:
33              
34             #### PRECALCULATED
35              
36             # find where the enzyme cuts after ...
37             my $ca=$re->cut;
38              
39             # ... and where it cuts on the opposite strand
40             my $oca = $re->complementary_cut;
41              
42             # get the cut sequence string back.
43             # Note that site will return the sequence with a caret
44             my $with_caret=$re->site; #returns 'G^AATTC';
45              
46             # but it is also a Bio::PrimarySeq object ....
47             my $without_caret=$re->seq; # returns 'GAATTC';
48             # ... and so does string
49             $without_caret=$re->string; #returns 'GAATTC';
50              
51             # what is the reverse complement of the cut site
52             my $rc=$re->revcom; # returns 'GAATTC';
53              
54             # now the recognition length. There are two types:
55             # recognition_length() is the length of the sequence
56             # cutter() estimate of cut frequency
57              
58             my $recog_length = $re->recognition_length; # returns 6
59             # also returns 6 in this case but would return
60             # 4 for GANNTC and 5 for RGATCY (BstX2I)!
61             $recog_length=$re->cutter;
62              
63             # is the sequence a palindrome - the same forwards and backwards
64             my $pal= $re->palindromic; # this is a boolean
65              
66             # is the sequence blunt (i.e. no overhang - the forward and reverse
67             # cuts are the same)
68             print "blunt\n" if $re->overhang eq 'blunt';
69              
70             # Overhang can have three values: "5'", "3'", "blunt", and undef
71             # Direction is very important if you use Klenow!
72             my $oh=$re->overhang;
73              
74             # what is the overhang sequence
75             my $ohseq=$re->overhang_seq; # will return 'AATT';
76              
77             # is the sequence ambiguous - does it contain non-GATC bases?
78             my $ambig=$re->is_ambiguous; # this is boolean
79              
80             print "Stuff about the enzyme\nCuts after: $ca\n",
81             "Complementary cut: $oca\nSite:\n\t$with_caret or\n",
82             "\t$without_caret\n";
83             print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
84             "Is it palindromic? $pal\n";
85             print "The overhang is $oh with sequence $ohseq\n",
86             "And is it ambiguous? $ambig\n\n";
87              
88              
89             ### THINGS YOU CAN SET, and get from rich REBASE file
90              
91             # get or set the isoschizomers (enzymes that recognize the same
92             # site)
93             $re->isoschizomers('PvuII', 'SmaI'); # not really true :)
94             print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
95              
96             # get or set the methylation sites
97             $re->methylation_sites(2); # not really true :)
98             print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
99              
100             #Get or set the source microbe
101             $re->microbe('E. coli');
102             print "It came from ", $re->microbe, "\n";
103              
104             # get or set the person who isolated it
105             $re->source("Rob"); # not really true :)
106             print $re->source, " sent it to us\n";
107              
108             # get or set whether it is commercially available and the company
109             # that it can be bought at
110             $re->vendors('NEB'); # my favorite
111             print "Is it commercially available :";
112             print $re->vendors ? "Yes" : "No";
113             print " and it can be got from ", join " ",
114             $re->vendors, "\n";
115              
116             # get or set a reference for this
117             $re->reference('Edwards et al. J. Bacteriology');
118             print "It was not published in ", $re->reference, "\n";
119              
120             # get or set the enzyme name
121             $re->name('BamHI');
122             print "The name of EcoRI is not really ", $re->name, "\n";
123              
124              
125             =head1 DESCRIPTION
126              
127             This module defines a single restriction endonuclease. You can use it
128             to make custom restriction enzymes, and it is used by
129             Bio::Restriction::IO to define enzymes in the New England Biolabs
130             REBASE collection.
131              
132             Use Bio::Restriction::Analysis to figure out which enzymes are available
133             and where they cut your sequence.
134              
135              
136             =head1 RESTRICTION MODIFICATION SYSTEMS
137              
138             At least three geneticaly and biochamically distinct restriction
139             modification systems exist. The cutting components of them are known
140             as restriction endonuleases. The three systems are known by roman
141             numerals: Type I, II, and III restriction enzymes.
142              
143             REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
144             categories there do not always match the the following short
145             descriptions of the enzymes types. See
146             http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
147              
148              
149             =head2 TypeI
150              
151             Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
152              
153             ---TGA*NnTGCT--- * = methylation sites
154             ---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
155              
156             The cleavage site is roughly 1000 (400-7000) base pairs from the
157             recognition site.
158              
159             =head2 TypeII
160              
161             The simplest and most common (at least commercially).
162              
163             Site recognition is via short palindromic base sequences that are 4-6
164             base pairs long. Cleavage is at the recognition site (but may
165             occasionally be just adjacent to the palindromic sequence, usually
166             within) and may produce blunt end termini or staggered, "sticky
167             end" termini.
168              
169             =head2 TypeIII
170              
171             The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
172             ATP dependent 24-26 base pairs downstream from the recognition site
173             and usually yields staggered cuts 2-4 bases apart.
174              
175              
176             =head1 COMMENTS
177              
178             I am trying to make this backwards compatible with
179             Bio::Tools::RestrictionEnzyme. Undoubtedly some things will break,
180             but we can fix things as we progress.....!
181              
182             I have added another comments section at the end of this POD that
183             discusses a couple of areas I know are broken (at the moment)
184              
185              
186             =head1 TO DO
187              
188             =over 2
189              
190             =item *
191              
192             Convert vendors touse full names of companies instead of code
193              
194             =item *
195              
196             Add regular expression based matching to vendors
197              
198             =item *
199              
200             Move away from the archaic ^ notation for cut sites. Ideally
201             I'd totally like to remove this altogether, or add a method
202             that adds it in if someone really wants it. We should be
203             fixed on a sequence, number notation.
204              
205             =back
206              
207             =head1 FEEDBACK
208              
209             =head2 Mailing Lists
210              
211             User feedback is an integral part of the evolution of this and other
212             Bioperl modules. Send your comments and suggestions preferably to one
213             of the Bioperl mailing lists. Your participation is much appreciated.
214              
215             bioperl-l@bioperl.org - General discussion
216             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
217              
218             =head2 Support
219              
220             Please direct usage questions or support issues to the mailing list:
221              
222             I
223              
224             rather than to the module maintainer directly. Many experienced and
225             reponsive experts will be able look at the problem and quickly
226             address it. Please include a thorough description of the problem
227             with code and data examples if at all possible.
228              
229             =head2 Reporting Bugs
230              
231             Report bugs to the Bioperl bug tracking system to help us keep track
232             the bugs and their resolution. Bug reports can be submitted via the
233             web:
234              
235             https://github.com/bioperl/bioperl-live/issues
236              
237             =head1 AUTHOR
238              
239             Rob Edwards, redwards@utmem.edu
240              
241             =head1 CONTRIBUTORS
242              
243             Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
244             Peter Blaiklock, pblaiklo@restrictionmapper.org
245             Mark A. Jensen, maj-at-fortinbras-dot-us
246              
247             =head1 COPYRIGHT
248              
249             Copyright (c) 2003 Rob Edwards.
250              
251             Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
252             Rights Reserved. This module is free software; you can redistribute
253             it and/or modify it under the same terms as Perl itself.
254              
255             =head1 SEE ALSO
256              
257             L,
258             L, L
259              
260             =head1 APPENDIX
261              
262             Methods beginning with a leading underscore are considered private and
263             are intended for internal use by this module. They are not considered
264             part of the public interface and are described here for documentation
265             purposes only.
266              
267             =cut
268              
269             package Bio::Restriction::Enzyme;
270 4     4   497 use strict;
  4         4  
  4         85  
271              
272 4     4   600 use Bio::PrimarySeq;
  4         5  
  4         85  
273              
274 4     4   15 use Data::Dumper;
  4         5  
  4         178  
275 4     4   1622 use Tie::RefHash;
  4         8926  
  4         83  
276 4     4   18 use vars qw (%TYPE);
  4         4  
  4         119  
277 4     4   12 use base qw(Bio::Root::Root Bio::Restriction::EnzymeI);
  4         4  
  4         1398  
278              
279             BEGIN {
280 4     4   6180 my %TYPE = (I => 1, II => 1, III => 1);
281             }
282              
283             =head2 new
284              
285             Title : new
286             Function
287             Function : Initializes the Enzyme object
288             Returns : The Restriction::Enzyme object
289             Argument : A standard definition can have several formats. For example:
290             $re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
291             Or, you can define the cut site in the sequence, for example
292             $re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
293             Or, a sequence can cut outside the recognition site, for example
294             $re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
295              
296             Other arguments:
297             -isoschizomers=>\@list a reference to an array of
298             known isoschizomers
299             -references=>$ref a reference to the enzyme
300             -source=>$source the source (person) of the enzyme
301             -commercial_availability=>@companies a list of companies
302             that supply the enzyme
303             -methylation_site=>\%sites a reference to hash that has
304             the position as the key and the type of methylation
305             as the value
306             -xln_sub => sub { ($self,$cut) = @_; ...; return $xln_cut },
307             a coderef to a routine that translates the input cut value
308             into Bio::Restriction::Enzyme coordinates
309             ( e.g., for withrefm format, this might be
310             -xln_sub => sub { length( shift()->string ) + shift } )
311              
312             A Restriction::Enzyme object manages its recognition sequence as a
313             Bio::PrimarySeq object.
314              
315             The minimum requirement is for a name and a sequence.
316              
317             This will create the restriction enzyme object, and define several
318             things about the sequence, such as palindromic, size, etc.
319              
320             =cut
321              
322             # do all cut/comp cut setting within the constructor
323             # new args
324              
325             sub new {
326 10163     10163 1 26759 my($class, @args) = @_;
327 10163         22496 my $self = $class->SUPER::new(@args);
328              
329 10163         40325 my ($name,$enzyme,$site,$seq,$precut, $postcut,$cut,$complementary_cut, $is_prototype, $prototype,
330             $isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo, $recog, $xln_sub) =
331             $self->_rearrange([qw(
332             NAME
333             ENZYME
334             SITE
335             SEQ
336             PRECUT
337             POSTCUT
338             CUT
339             COMPLEMENTARY_CUT
340             IS_PROTOTYPE
341             PROTOTYPE
342             ISOSCHIZOMERS
343             METHYLATION_SITES
344             MICROBE
345             SOURCE
346             VENDORS
347             REFERENCES
348             IS_NEOSCHIZOMER
349             RECOG
350             XLN_SUB
351             )], @args);
352              
353 10163 0 66     54615 $self->throw('At the minimum, you must define a name and '.
      33        
      33        
354             'recognition site for the restriction enzyme')
355             unless (($name || $enzyme) && ($site || $recog || $seq));
356              
357 10163         13059 $self->{_isoschizomers} = [];
358 10163         10659 $self->{_methylation_sites} = {};
359 10163         10228 $self->{_vendors} = [];
360 10163         8919 $self->{_references} = [];
361              
362             # squelch warnings
363 10163   100     23988 $postcut ||='';
364              
365             # enzyme name
366 10163 100       12763 $enzyme && $self->name($enzyme);
367 10163 100       18849 $name && $self->name($name);
368              
369             # site
370             #
371             # note that the site() setter will automatically set
372             # cut(), complementary_cut(), if the cut site is indicated
373             # in $site with '^' /maj
374              
375             # create the cut site if appropriate/this is a kludge due to
376             # the base.pm format in the new B:R order...
377 10163 100 100     21476 if ( $cut and $cut <= length $site) {
378 2531         4417 $site = substr($site, 0, $cut).'^'.substr($site, $cut);
379             }
380            
381 10163 50       11382 if ($site) {
382 10163         13103 $self->site($site);
383             }
384             else {
385 0 0       0 $seq && $self->site($seq);
386             }
387              
388 10162 100       12002 if ($recog) {
389 7495         10706 $self->recog($recog);
390             }
391             else {
392 2667 50       3616 $seq && $self->recog($seq);
393 2667 50       4811 $site && $self->recog($site);
394             }
395             # call revcom_site to initialize it and revcom_recog:
396 10162         12279 $self->revcom_site();
397              
398 10162         16565 $recog = $self->string; # for length calculations below
399            
400 10162 100       15697 if ($xln_sub) {
401 7494 50       12302 $self->warn("Translation subroutine is not a coderef; ignoring") unless
402             ref($xln_sub) eq 'CODE';
403             }
404              
405             # cut coordinates
406 10162         10973 my ($pc_cut, $pc_comp_cut) = ( $postcut =~ /(-?\d+)\/(-?\d+)/ );
407              
408             # cut definitions in constructor override any autoset in
409             # site()
410             # definitions in site conform to withrefm coords, translation
411             # happens here
412              
413 10162 100       16974 if (defined $cut) {
    100          
414 2664 50       4386 $self->cut( $xln_sub ? $xln_sub->($self, $cut) : $cut );
415             }
416             elsif ( defined $pc_cut ) {
417 457 100       1113 $self->cut( $xln_sub ? $xln_sub->($self, $pc_cut) : $pc_cut );
418             }
419              
420 10162 100       16523 if (defined $complementary_cut) {
    100          
421 4 50       12 $self->complementary_cut($xln_sub ? $xln_sub->($self,$complementary_cut) : $complementary_cut);
422             }
423             elsif (defined $pc_comp_cut) {
424 457 100       981 $self->complementary_cut($xln_sub ? $xln_sub->($self,$pc_comp_cut) : $pc_comp_cut);
425             }
426              
427 10162 100       18133 $is_prototype && $self->is_prototype($is_prototype);
428 10162 100       12379 $prototype && $self->prototype($prototype);
429 10162 100       16803 $isoschizomers && $self->isoschizomers($isoschizomers);
430 10162 50       13236 $meth && $self->methylation_sites($meth);
431 10162 50       11447 $microbe && $self->microbe($microbe);
432 10162 100       16255 $source && $self->source($source);
433 10162 100       14989 $vendors && $self->vendors($vendors);
434 10162 100       16711 $references && $self->references($references);
435 10162 50       12073 $neo && $self->is_neoschizomer($neo);
436              
437             # create multicut enzymes here if $precut defined
438 10162 100       12031 if (defined $precut) {
439 49         164 bless $self, 'Bio::Restriction::Enzyme::MultiCut';
440 49         222 my ($pc_cut, $pc_comp_cut) = $precut =~ /(-?\d+)\/(-?\d+)/;
441 49         173 my $re2 = $self->clone;
442 49 50       200 $re2->cut($xln_sub ? $xln_sub->($self, -$pc_cut) : -$pc_cut);
443 49 50       165 $re2->complementary_cut($xln_sub ? $xln_sub->($self, -$pc_comp_cut) : -$pc_comp_cut);
444 49         165 $self->others($re2);
445             }
446              
447 10162         25977 return $self;
448             }
449              
450             =head1 Essential methods
451              
452             =cut
453              
454             =head2 name
455              
456             Title : name
457             Usage : $re->name($newval)
458             Function : Gets/Sets the restriction enzyme name
459             Example : $re->name('EcoRI')
460             Returns : value of name
461             Args : newvalue (optional)
462              
463             This will also clean up the name. I have added this because some
464             people get confused about restriction enzyme names. The name should
465             be One upper case letter, and two lower case letters (because it is
466             derived from the organism name, eg. EcoRI is from E. coli). After
467             that it is all confused, but the numbers should be roman numbers not
468             numbers, therefore we'll correct those. At least this will provide
469             some standard, I hope.
470              
471             =cut
472              
473             sub name{
474 70047     70047 1 52711 my ($self, $name)=@_;
475              
476 70047 100       82053 if ($name) { # correct and set the name
477 10164         7896 my $old_name = $name;
478              
479             # remove spaces. Some people write HindIII as Hind III
480 10164         16994 $name =~ s/\s+//g;
481             # change TAILING ones to I's
482 10164 50       18688 if ($name =~ m/(1+)$/) {
483 0         0 my $i = 'I' x length($1);
484 0         0 $name =~ s/1+$/$i/;
485             }
486              
487             # make the first letter upper case
488 10164         26613 $name =~ s/^(\w)/uc($1)/e;
  10164         22538  
489              
490 10164 50       18321 unless ($name eq $old_name) {
491             # we have changed the name, so send a warning
492 0         0 $self->warn("The enzyme name $old_name was changed to $name");
493             }
494 10164         12827 $self->{'_name'} = $name;
495             }
496 70047         154207 return $self->{'_name'};
497             }
498              
499              
500             =head2 site
501              
502             Title : site
503             Usage : $re->site();
504             Function : Gets/sets the recognition sequence for the enzyme.
505             Example : $seq_string = $re->site();
506             Returns : String containing recognition sequence indicating
507             : cleavage site as in 'G^AATTC'.
508             Argument : n/a
509             Throws : n/a
510              
511              
512             Side effect: the sequence is always converted to upper case.
513              
514             The cut site can also be set by using methods L and
515             L.
516              
517             This will pad out missing sequence with N's. For example the enzyme
518             Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
519              
520             Note that the common notation ACCTGC(4/8) means that the forward
521             strand cut is four nucleotides after the END of the recognition
522             site. The forward cut() in the coordinates used here in Acc36I
523             ACCTGC(4/8) is at 6+4 i.e. 10.
524              
525             ** This is the main setable method for the recognition site.
526              
527             =cut
528              
529             sub site {
530 10169     10169 1 9653 my ( $self, $site ) = @_;
531            
532 10169 100       12547 if ($site) {
533              
534 10163 100       18720 $self->throw("Unrecognized characters in site: [$site]")
535             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
536              
537             # we may have to redefine this if there is a ^ in the sequence
538              
539             # first, check and see if we have a cut site in the sequence
540             # if so, find the position, and set the target sequence and cut site
541              
542 10162         10586 $self->{'_site'} = $site;
543              
544 10162         19702 my ( $first, $second ) = $site =~ /(.*)\^(.*)/;
545 10162 100       18628 $site = "$1$2" if defined $first;
546 10162         8578 $self->{'_site'} = $site;
547              
548             # now set the recognition site as a new Bio::PrimarySeq object
549             # we need it before calling cut() and complementary_cut()
550 10162         14225 $self->{_seq} = Bio::PrimarySeq->new(
551             -id => $self->name,
552             -seq => $site,
553             -verbose => $self->verbose,
554             -alphabet => 'dna'
555             );
556              
557 10162 100       18424 if ( defined $first ) {
558 4945         8283 $self->cut( length $first );
559 4945         6701 $self->complementary_cut( length $second );
560 4945         6483 $self->revcom_site();
561             }
562             }
563 10168         12787 return $self->{'_site'};
564             }
565              
566             =head2 revcom_site
567              
568             Title : revcom_site
569             Usage : $re->revcom_site();
570             Function : Gets/sets the complementary recognition sequence for the enzyme.
571             Example : $seq_string = $re->revcom_site();
572             Returns : String containing recognition sequence indicating
573             : cleavage site as in 'G^AATTC'.
574             Argument : none (sets on first call)
575             Throws : n/a
576              
577             This is the same as site, except it returns the revcom site. For
578             palindromic enzymes these two are identical. For non-palindromic
579             enzymes they are not!
580              
581             On set, this also handles setting the revcom_recog attribute.
582              
583             See also L above.
584              
585             =cut
586              
587             sub revcom_site {
588 15112     15112 1 11241 my $self = shift;
589             # getter
590 15112 100       22457 return $self->{'_revcom_site'} unless !$self->{'_revcom_site'};
591              
592             # setter
593 10162         8290 my $site = $self->{'_site'};
594 10162 100       12970 if ($self->is_palindromic) {
595 9266         9255 $self->{'_revcom_site'}=$self->{'_site'};
596 9266         12068 $self->revcom_recog( $self->string );
597 9266         10543 return $self->{'_revcom_site'};
598             }
599              
600 896 50       2021 $self->throw("Unrecognized characters in revcom site: [$site]")
601             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
602            
603 896 100       1650 if ($site =~ /\^/) {
604             # first, check and see if we have a cut site indicated in the sequence
605             # if so, find the position, and set the target sequence and cut site
606 16         35 $site = $self->revcom;
607 16         33 $self->revcom_recog( $site );
608 16         43 my $c = length($site)-$self->cut;
609 16         54 $site = substr($site, 0, $c).'^'.substr($site,$c);
610 16         25 $self->{'_revcom_site'} = $site;
611             }
612             else {
613 880         1226 my $revcom=$self->revcom;
614 880         1718 $self->revcom_recog( $revcom );
615             # my $cc=$self->complementary_cut;
616             # my $hat=length($revcom)-$cc+1; # we need it on the other strand!
617             # if ($cc > length($revcom)) {
618             # my $pad= "N" x ($cc-length($revcom));
619             # $revcom = $pad. $revcom;
620             # $hat=length($revcom)-$cc+1;
621             # }
622             # elsif ($cc < 0) {
623             # my $pad = "N" x -$cc;
624             # $revcom .= $pad;
625             # $hat=length($revcom);
626             # }
627             # $revcom =~ s/(.{$hat})/$1\^/;
628 880         1002 $self->{'_revcom_site'}=$revcom;
629             }
630 896         918 return $self->{'_revcom_site'};
631             }
632              
633             =head2 cut
634              
635             Title : cut
636             Usage : $num = $re->cut(1);
637             Function : Sets/gets an integer indicating the position of cleavage
638             relative to the 5' end of the recognition sequence in the
639             forward strand.
640              
641             For type II enzymes, sets the symmetrically positioned
642             reverse strand cut site by calling complementary_cut().
643              
644             Returns : Integer, 0 if not set
645             Argument : an integer for the forward strand cut site (optional)
646              
647             Note that the common notation ACCTGC(4/8) means that the forward
648             strand cut is four nucleotides after the END of the recognition
649             site. The forwad cut in the coordinates used here in Acc36I
650             ACCTGC(4/8) is at 6+4 i.e. 10.
651              
652             Note that REBASE uses notation where cuts within symmetic sites are
653             marked by '^' within the forward sequence but if the site is
654             asymmetric the parenthesis syntax is used where numbering ALWAYS
655             starts from last nucleotide in the forward strand. That's why AciI has
656             a site usually written as CCGC(-3/-1) actualy cuts in
657              
658             C^C G C
659             G G C^G
660              
661             In our notation, these locations are 1 and 3.
662              
663              
664             The cuts locations in the notation used are relative to the first
665             (non-N) nucleotide of the reported forward strand of the recognition
666             sequence. The following diagram numbers the phosphodiester bonds
667             (marked by + ) which can be cut by the restriction enzymes:
668              
669             1 2 3 4 5 6 7 8 ...
670             N + N + N + N + N + G + A + C + T + G + G + N + N + N
671             ... -5 -4 -3 -2 -1
672              
673              
674             =cut
675              
676             sub cut {
677 24294     24294 1 20211 my ($self, $value) = @_;
678 24294 100       31408 if (defined $value) {
679 8117 50       19154 $self->throw("The cut position needs to be an integer [$value]")
680             unless $value =~ /[-+]?\d+/;
681 8117         8967 $self->{'_cut'} = $value;
682              
683             # add the caret to the site attribute only if internal /maj
684 8117 100 100     24473 if ( ($self->{_site} !~ /\^/) && ($value <= length ($self->{_site}))) {
685             $self->{_site} =
686 5174         10445 substr($self->{_site}, 0, $value). '^'. substr($self->{_site}, $value);
687             }
688              
689             # auto-set comp cut only if cut site is inside the recog site./maj
690             $self->complementary_cut(length ($self->seq->seq) - $value )
691 8117 100 100     19588 if (($self->{_site} =~ /\^/) && ($self->type eq 'II'));
692              
693             }
694             # return undef if not defined yet, not 0 /maj
695 24294         32383 return $self->{'_cut'};
696             }
697              
698             =head2 cuts_after
699              
700             Title : cuts_after
701             Usage : Alias for cut()
702              
703             =cut
704              
705             sub cuts_after {
706 0     0 1 0 shift->cut(@_);
707             }
708            
709              
710             =head2 complementary_cut
711              
712             Title : complementary_cut
713             Usage : $num = $re->complementary_cut('1');
714             Function : Sets/Gets an integer indicating the position of cleavage
715             : on the reverse strand of the restriction site.
716             Returns : Integer
717             Argument : An integer (optional)
718             Throws : Exception if argument is non-numeric.
719              
720             This method determines the cut on the reverse strand of the sequence.
721             For most enzymes this will be within the sequence, and will be set
722             automatically based on the forward strand cut, but it need not be.
723              
724             B that the returned location indicates the location AFTER the
725             first non-N site nucleotide in the FORWARD strand.
726              
727             =cut
728              
729             sub complementary_cut {
730 16431     16431 1 13080 my ($self, $num)=@_;
731              
732 16431 100       22103 if (defined $num) {
733 13152 50       28337 $self->throw("The cut position needs to be an integer [$num]")
734             unless $num =~ /[-+]?\d+/;
735 13152         13263 $self->{'_rc_cut'} = $num;
736             }
737             # return undef, not 0, if not yet defined /maj
738 16431         16238 return $self->{'_rc_cut'};
739             }
740              
741              
742             =head1 Read only (usually) recognition site descriptive methods
743              
744             =cut
745              
746             =head2 type
747              
748             Title : type
749             Usage : $re->type();
750             Function : Get/set the restriction system type
751             Returns :
752             Argument : optional type: ('I'|II|III)
753              
754             Restriction enzymes have been catezorized into three types. Some
755             REBASE formats give the type, but the following rules can be used to
756             classify the known enzymes:
757              
758             =over 4
759              
760             =item 1
761              
762             Bipartite site (with 6-8 Ns in the middle and the cut site
763             is E 50 nt away) =E type I
764              
765             =item 2
766              
767             Site length E 3 =E type I
768              
769             =item 3
770              
771             5-6 asymmetric site and cuts E20 nt away =E type III
772              
773             =item 4
774              
775             All other =E type II
776              
777             =back
778              
779             There are some enzymes in REBASE which have bipartite recognition site
780             and cat far from the site but are still classified as type I. I've no
781             idea if this is really so.
782              
783             =cut
784              
785             sub type {
786 7707     7707 1 6568 my ($self, $value) = @_;
787              
788 7707 50       9949 if ($value) {
789             $self->throw("Not a valid value [$value], needs to one of : ".
790             join (', ', sort keys %TYPE) )
791 0 0       0 unless $TYPE{$value};
792 0         0 return $self->{'_type'} = $value;
793             }
794              
795             # pre set
796             #return $self->{'_type'} if $self->{'_type'};
797             # bipartite
798             return $self->{'_type'} = 'I'
799 7707 50 66     11204 if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
800             # 3 nt site
801             return $self->{'_type'} = 'I'
802 7707 100       13462 if $self->{'_seq'}->length == 3;
803             # asymmetric and cuts > 20 nt
804 7697 50 100     9518 return $self->{'_type'} = 'III'
      100        
      66        
805             if (length $self->string == 5 or length $self->string == 6 ) and
806             not $self->palindromic and abs($self->cut) > 20;
807 7697         23096 return $self->{'_type'} = 'II';
808             }
809              
810             =head2 seq
811              
812             Title : seq
813             Usage : $re->seq();
814             Function : Get the Bio::PrimarySeq.pm object representing
815             : the recognition sequence
816             Returns : A Bio::PrimarySeq object representing the
817             enzyme recognition site
818             Argument : n/a
819             Throws : n/a
820              
821              
822             =cut
823              
824             sub seq {
825 7854     7854 1 12910 shift->{'_seq'};
826             }
827              
828             =head2 string
829              
830             Title : string
831             Usage : $re->string();
832             Function : Get a string representing the recognition sequence.
833             Returns : String. Does NOT contain a '^' representing the cut location
834             as returned by the site() method.
835             Argument : n/a
836             Throws : n/a
837              
838             =cut
839              
840             sub string {
841 52617     52617 1 78025 shift->{'_seq'}->seq;
842             }
843              
844             =head2 recog
845              
846             Title : recog
847             Usage : $enz->recog($recognition_sequence)
848             Function: Gets/sets the pure recognition site. Sets as
849             regexp if appropriate.
850             As for string(), the cut indicating carets (^)
851             are expunged.
852             Example :
853             Returns : value of recog (a scalar)
854             Args : on set, new value (a scalar or undef, optional)
855              
856             =cut
857              
858             sub recog{
859 25643     25643 1 19329 my $self = shift;
860 25643         16888 my $recog = shift;
861 25643 100       51500 return $self->{'recog'} unless $recog;
862 10162         15480 $recog =~ s/\^//g;
863 10162 100       21957 $recog = _expand($recog) if $recog =~ /[^ATGC]/;
864 10162         12403 return $self->{'recog'} = $recog;
865             }
866              
867             =head2 revcom_recog
868              
869             Title : revcom_recog
870             Usage : $enz->revcom_recog($recognition_sequence)
871             Function: Gets/sets the pure reverse-complemented recognition site.
872             Sets as regexp if appropriate.
873             As for string(), the cut indicating carets (^) are expunged.
874             Example :
875             Returns : value of recog (a scalar)
876             Args : on set, new value (a scalar or undef, optional)
877              
878             =cut
879              
880             sub revcom_recog{
881 12969     12969 1 10471 my $self = shift;
882 12969         8907 my $recog = shift;
883 12969 100       16584 unless ($recog) {
884 2807 50       4192 $self->throw( "revcom recognition site not set; call \$enz->revcom_site to initialize" ) unless $self->{'revcom_recog'};
885 2807         4567 return $self->{'revcom_recog'};
886             }
887 10162         10270 $recog =~ s/\^//g;
888 10162 100       23433 $recog = _expand($recog) if $recog =~ /[^ATGC]/;
889 10162         12339 return $self->{'revcom_recog'} = $recog;
890             }
891              
892             =head2 revcom
893              
894             Title : revcom
895             Usage : $re->revcom();
896             Function : Get a string representing the reverse complement of
897             : the recognition sequence.
898             Returns : String
899             Argument : n/a
900             Throws : n/a
901              
902             =cut
903              
904             sub revcom {
905 11059     11059 1 23220 shift->{'_seq'}->revcom->seq();
906             }
907              
908             =head2 recognition_length
909              
910             Title : recognition_length
911             Usage : $re->recognition_length();
912             Function : Get the length of the RECOGNITION sequence.
913             This is the total recognition sequence,
914             inluding the ambiguous codes.
915             Returns : An integer
916             Argument : Nothing
917              
918             See also: L
919              
920             =cut
921              
922             sub recognition_length {
923 1     1 1 4 my $self = shift;
924 1         3 return length($self->string);
925             }
926              
927             =head2 cutter
928              
929             Title : cutter
930             Usage : $re->cutter
931             Function : Returns the "cutter" value of the recognition site.
932              
933             This is a value relative to site length and lack of
934             ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
935             and 'CCTNAGG' a six cutter
936              
937             This measure correlates to the frequency of the enzyme
938             cuts much better than plain recognition site length.
939              
940             Example : $re->cutter
941             Returns : integer or float number
942             Args : none
943              
944             Why is this better than just stripping the ambiguos codes? Think about
945             it like this: You have a random sequence; all nucleotides are equally
946             probable. You have a four nucleotide re site. The probability of that
947             site finding a match is one out of 4^4 or 256, meaning that on average
948             a four cutter finds a match every 256 nucleotides. For a six cutter,
949             the average fragment length is 4^6 or 4096. In the case of ambiguity
950             codes the chances are finding the match are better: an R (A|T) has 1/2
951             chance of finding a match in a random sequence. Therefore, for RGCGCY
952             the probability is one out of (2*4*4*4*4*2) which exactly the same as
953             for a five cutter! Cutter, although it can have non-integer values
954             turns out to be a useful and simple measure.
955              
956             From bug 2178: VHDB are ambiguity symbols that match three different
957             nucleotides, so they contribute less to the effective recognition sequence
958             length than e.g. Y which matches only two nucleotides. A symbol which matches n
959             of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
960              
961             =cut
962              
963             sub cutter {
964 3532     3532 1 2328 my ($self)=@_;
965 3532         3326 $_ = uc $self->string;
966              
967 3532         2782 my $cutter = tr/[ATGC]//d;
968 3532         2277 my $count = tr/[MRWSYK]//d;
969 3532         2498 $cutter += $count/2;
970 3532         2366 $count = tr/[VHDB]//d;
971 3532         2260 $cutter += $count * (1 - log(3) / log(4));
972 3532         7031 return $cutter;
973             }
974              
975              
976             =head2 is_palindromic
977              
978             Title : is_palindromic
979             Alias : palindromic
980             Usage : $re->is_palindromic();
981             Function : Determines if the recognition sequence is palindromic
982             : for the current restriction enzyme.
983             Returns : Boolean
984             Argument : n/a
985             Throws : n/a
986              
987             A palindromic site (EcoRI):
988              
989             5-GAATTC-3
990             3-CTTAAG-5
991              
992             =cut
993              
994             sub is_palindromic {
995 36823     36823 1 27963 my $self = shift;
996 36823 100       84890 return $self->{_palindromic} if defined $self->{_palindromic};
997 10162 100       11801 if ($self->string eq $self->revcom) {
998 9266         23228 return $self->{_palindromic}=1;
999             }
1000 896         2041 return $self->{_palindromic} = 0;
1001             }
1002              
1003 5824     5824 0 8119 sub palindromic { shift->is_palindromic(@_) }
1004              
1005             =head2 is_symmetric
1006              
1007             Title : is_symmetric
1008             Alias : symmetric
1009             Usage : $re->is_symmetric();
1010             Function : Determines if the enzyme is a symmetric cutter
1011             Returns : Boolean
1012             Argument : none
1013              
1014             A symmetric but non-palindromic site (HindI):
1015             v
1016             5-C A C-3
1017             3-G T G-5
1018             ^
1019              
1020             =cut
1021              
1022             sub is_symmetric {
1023 4     4   20 no warnings qw( uninitialized );
  4         5  
  4         5241  
1024 9641     9641 1 6551 my $self = shift;
1025              
1026 9641 100       20053 return $self->{_symmetric} if defined $self->{_symmetric};
1027 5355 100       4859 if ($self->is_palindromic) {
1028 4892         13023 return $self->{_symmetric} = 1;
1029             }
1030 463 100       503 if ($self->cut == length($self->string) - $self->complementary_cut) {
1031 27         87 return $self->{_symmetric}=1;
1032             }
1033 436         1036 return $self->{_symmetric} = 0;
1034             }
1035              
1036              
1037 0     0 0 0 sub symmetric { shift->is_symmetric(@_) }
1038              
1039             =head2 overhang
1040              
1041             Title : overhang
1042             Usage : $re->overhang();
1043             Function : Determines the overhang of the restriction enzyme
1044             Returns : "5'", "3'", "blunt" of undef
1045             Argument : n/a
1046             Throws : n/a
1047              
1048             A blunt site in SmaI returns C
1049              
1050             5' C C C^G G G 3'
1051             3' G G G^C C C 5'
1052              
1053             A 5' overhang in EcoRI returns C<5'>
1054              
1055             5' G^A A T T C 3'
1056             3' C T T A A^G 5'
1057              
1058             A 3' overhang in KpnI returns C<3'>
1059              
1060             5' G G T A C^C 3'
1061             3' C^C A T G G 5'
1062              
1063             =cut
1064              
1065             sub overhang {
1066 545     545 1 335 my $self = shift;
1067 545 100 66     1168 unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
1068 32         50 return "unknown";
1069             }
1070 513 100       655 if ($self->{_cut} < $self->{_rc_cut}) {
    100          
    50          
1071 287         425 $self->{_overhang}="5'";
1072             } elsif ($self->{_cut} == $self->{_rc_cut}) {
1073 116         168 $self->{_overhang}="blunt";
1074             } elsif ($self->{_cut} > $self->{_rc_cut}) {
1075 110         172 $self->{_overhang}="3'";
1076             } else {
1077 0         0 $self->{_overhang}="unknown";
1078             }
1079             return $self->{_overhang}
1080 513         752 }
1081              
1082             =head2 overhang_seq
1083              
1084             Title : overhang_seq
1085             Usage : $re->overhang_seq();
1086             Function : Determines the overhang sequence of the restriction enzyme
1087             Returns : a Bio::LocatableSeq
1088             Argument : n/a
1089             Throws : n/a
1090              
1091             I do not think it is necessary to create a seq object of these. (Heikki)
1092              
1093             Note: returns empty string for blunt sequences and undef for ones that
1094             we don't know. Compare these:
1095              
1096             A blunt site in SmaI returns empty string
1097              
1098             5' C C C^G G G 3'
1099             3' G G G^C C C 5'
1100              
1101             A 5' overhang in EcoRI returns C
1102              
1103             5' G^A A T T C 3'
1104             3' C T T A A^G 5'
1105              
1106             A 3' overhang in KpnI returns C
1107              
1108             5' G G T A C^C 3'
1109             3' C^C A T G G 5'
1110              
1111             Note that you need to use method L to decide
1112             whether it is a 5' or 3' overhang!!!
1113              
1114             Note: The overhang stuff does not work if the site is asymmetric! Rethink!
1115              
1116             =cut
1117              
1118             sub overhang_seq {
1119 5     5 1 7 my $self = shift;
1120              
1121             # my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
1122             # -verbose=>$self->verbose,
1123             # -alphabet=>'dna');
1124              
1125 5 50       11 return '' if $self->overhang eq 'blunt' ;
1126              
1127 5 50 33     21 unless ($self->{_cut} && $self->{_rc_cut}) {
1128             # lets just check that we really can't figure it out
1129 0         0 $self->cut;
1130 0         0 $self->complementary_cut;
1131 0 0 0     0 unless ($self->{_cut} && $self->{_rc_cut}) {
1132 0         0 return;
1133             }
1134             }
1135              
1136             # this is throwing an error for sequences outside the restriction
1137             # site (eg ^NNNNGATCNNNN^)
1138             # So if this is the case we need to fake these guys
1139 5 50 33     37 if (($self->{_cut}<0) ||
      33        
      33        
1140             ($self->{_rc_cut}<0) ||
1141             ($self->{_cut}>$self->seq->length) ||
1142             ($self->{_rc_cut}>$self->seq->length)) {
1143 0         0 my $tempseq=$self->site;
1144 0         0 my ($five, $three)=split /\^/, $tempseq;
1145 0 0       0 if ($self->{_cut} > $self->{_rc_cut}) {
    0          
1146             return substr($five, $self->{_rc_cut})
1147 0         0 } elsif ($self->{_cut} < $self->{_rc_cut}) {
1148             return substr($three, 0, $self->{_rc_cut})
1149 0         0 } else {
1150 0         0 return '';
1151             }
1152             }
1153              
1154 5 50       20 if ($self->{_cut} > $self->{_rc_cut}) {
    50          
1155 0         0 return $self->seq->subseq($self->{_rc_cut}+1,$self->{_cut});
1156             } elsif ($self->{_cut} < $self->{_rc_cut}) {
1157 5         12 return $self->seq->subseq($self->{_cut}+1, $self->{_rc_cut});
1158             } else {
1159 0         0 return '';
1160             }
1161             }
1162              
1163              
1164              
1165             =head2 compatible_ends
1166              
1167             Title : compatible_ends
1168             Usage : $re->compatible_ends($re2);
1169             Function : Determines if the two restriction enzyme cut sites
1170             have compatible ends.
1171             Returns : 0 if not, 1 if only one pair ends match, 2 if both ends.
1172             Argument : a Bio::Restriction::Enzyme
1173             Throws : unless the argument is a Bio::Resriction::Enzyme and
1174             if there are Ns in the ovarhangs
1175              
1176             In case of type II enzymes which which cut symmetrically, this
1177             function can be considered to return a boolean value.
1178              
1179              
1180             =cut
1181              
1182             sub compatible_ends {
1183 1     1 1 2 my ($self, $re) = @_;
1184              
1185 1 50       10 $self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1186             unless $re->isa('Bio::Restriction::Enzyme');
1187              
1188             # $self->throw("Only type II enzymes work now")
1189             # unless $self->type eq 'II';
1190              
1191 1 50 33     3 $self->debug("N(s) in overhangs. Can not compare")
1192             if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1193              
1194 1 50 33     5 return 2 if $self->overhang_seq eq $re->overhang_seq and
1195             $self->overhang eq $re->overhang;
1196              
1197 0         0 return 0;
1198             }
1199              
1200             =head2 is_ambiguous
1201              
1202             Title : is_ambiguous
1203             Usage : $re->is_ambiguous();
1204             Function : Determines if the restriction enzyme contains ambiguous sequences
1205             Returns : Boolean
1206             Argument : n/a
1207             Throws : n/a
1208              
1209             =cut
1210              
1211             sub is_ambiguous {
1212 1     1 1 2 my $self = shift;
1213 1 50       5 return $self->string =~ m/[^AGCT]/ ? 1 : 0 ;
1214             }
1215              
1216             =head2 Additional methods from Rebase
1217              
1218             =cut
1219              
1220             =head2 is_prototype
1221              
1222             Title : is_prototype
1223             Usage : $re->is_prototype
1224             Function : Get/Set method for finding out if this enzyme is a prototype
1225             Example : $re->is_prototype(1)
1226             Returns : Boolean
1227             Args : none
1228              
1229             Prototype enzymes are the most commonly available and usually first
1230             enzymes discoverd that have the same recognition site. Using only
1231             prototype enzymes in restriction analysis avoids redundancy and
1232             speeds things up.
1233              
1234             =cut
1235              
1236             sub is_prototype {
1237 7213     7213 1 5418 my ($self, $value) = @_;
1238 7213 100       9877 if (defined $value) {
1239 7210         12159 return $self->{'_is_prototype'} = $value ;
1240             }
1241 3 100       8 if (defined $self->{'_is_prototype'}) {
1242 2         6 return $self->{'_is_prototype'}
1243             } else {
1244 1         10 $self->warn("Can't unequivocally assign prototype based on input format alone");
1245             return
1246 0         0 }
1247             }
1248              
1249             =head2 is_neoschizomer
1250              
1251             Title : is_neoschizomer
1252             Usage : $re->is_neoschizomer
1253             Function : Get/Set method for finding out if this enzyme is a neoschizomer
1254             Example : $re->is_neoschizomer(1)
1255             Returns : Boolean
1256             Args : none
1257              
1258             Neoschizomers are distinguishable from the prototype enzyme by having a
1259             different cleavage pattern. Note that not all formats report this
1260              
1261             =cut
1262              
1263             sub is_neoschizomer {
1264 0     0 1 0 my ($self, $value) = @_;
1265 0 0       0 if (defined $value) {
1266 0         0 return $self->{'_is_neoschizomer'} = $value ;
1267             }
1268 0 0       0 if (defined $self->{'_is_neoschizomer'}) {
1269 0         0 return $self->{'_is_neoschizomer'}
1270             } else {
1271 0         0 $self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1272             return
1273 0         0 }
1274             }
1275              
1276             =head2 prototype_name
1277              
1278             Title : prototype_name
1279             Alias : prototype
1280             Usage : $re->prototype_name
1281             Function : Get/Set method for the name of prototype for
1282             this enzyme's recognition site
1283             Example : $re->prototype_name(1)
1284             Returns : prototype enzyme name string or an empty string
1285             Args : optional prototype enzyme name string
1286              
1287             If the enzyme itself is the prototype, its own name is returned. Not to
1288             confuse the negative result with an unset value, use method
1289             L.
1290              
1291             This method is called I rather than I,
1292             because it returns a string rather than on object.
1293              
1294             =cut
1295              
1296             sub prototype_name {
1297 12     12 1 12 my $self = shift;
1298              
1299 12 100       28 $self->{'_prototype'} = shift if @_;
1300 12 100       27 return $self->name if $self->{'_is_prototype'};
1301 2   50     10 return $self->{'_prototype'} || '';
1302             }
1303              
1304 9     9 0 16 sub prototype { shift->prototype_name(@_) }
1305              
1306             =head2 isoschizomers
1307              
1308             Title : isoschizomers
1309             Alias : isos
1310             Usage : $re->isoschizomers(@list);
1311             Function : Gets/Sets a list of known isoschizomers (enzymes that
1312             recognize the same site, but don't necessarily cut at
1313             the same position).
1314             Arguments : A reference to an array that contains the isoschizomers
1315             Returns : A reference to an array of the known isoschizomers or 0
1316             if not defined.
1317              
1318             This has to be the hardest name to spell, so now you can use the alias
1319             'isos'. Added for compatibility to REBASE
1320              
1321             =cut
1322              
1323             sub isoschizomers {
1324 7456     7456 1 6203 my ($self) = shift;
1325 7456 100       10298 push @{$self->{_isoschizomers}}, @_ if @_;
  7454         12049  
1326             # make sure that you don't dereference if null
1327             # chad believes quite strongly that you should return
1328             # a reference to an array anyway. don't bother dereferencing.
1329             # i'll post that to the list.
1330 7456 50       10736 if ($self->{'_isoschizomers'}) {
1331 7456         6174 return @{$self->{_isoschizomers}};
  7456         7997  
1332             }
1333            
1334             }
1335              
1336 0     0 0 0 sub isos { shift->isoschizomers(@_) }
1337              
1338             =head2 purge_isoschizomers
1339              
1340             Title : purge_isoschizomers
1341             Alias : purge_isos
1342             Usage : $re->purge_isoschizomers();
1343             Function : Purges the set of isoschizomers for this enzyme
1344             Arguments :
1345             Returns : 1
1346              
1347             =cut
1348              
1349             sub purge_isoschizomers {
1350 1     1 1 4 my ($self) = shift;
1351 1         6 $self->{_isoschizomers} = [];
1352              
1353             }
1354              
1355 0     0 0 0 sub purge_isos { shift->purge_isoschizomers(@_) }
1356              
1357             =head2 methylation_sites
1358              
1359             Title : methylation_sites
1360             Usage : $re->methylation_sites(\%sites);
1361             Function : Gets/Sets known methylation sites (positions on the sequence
1362             that get modified to promote or prevent cleavage).
1363             Arguments : A reference to a hash that contains the methylation sites
1364             Returns : A reference to a hash of the methylation sites or
1365             an empty string if not defined.
1366              
1367             There are three types of methylation sites:
1368              
1369             =over 3
1370              
1371             =item * (6) = N6-methyladenosine
1372              
1373             =item * (5) = 5-methylcytosine
1374              
1375             =item * (4) = N4-methylcytosine
1376              
1377             =back
1378              
1379             These are stored as 6, 5, and 4 respectively. The hash has the
1380             sequence position as the key and the type of methylation as the value.
1381             A negative number in the sequence position indicates that the DNA is
1382             methylated on the complementary strand.
1383              
1384             Note that in REBASE, the methylation positions are given
1385             Added for compatibility to REBASE.
1386              
1387             =cut
1388              
1389             sub methylation_sites {
1390 782     782 1 598 my $self = shift;
1391              
1392 782         1175 while (@_) {
1393 829         601 my $key = shift;
1394 829         2281 $self->{'_methylation_sites'}->{$key} = shift;
1395             }
1396 782         552 return %{$self->{_methylation_sites}};
  782         1201  
1397             }
1398              
1399              
1400             =head2 purge_methylation_sites
1401              
1402             Title : purge_methylation_sites
1403             Usage : $re->purge_methylation_sites();
1404             Function : Purges the set of methylation_sites for this enzyme
1405             Arguments :
1406             Returns :
1407              
1408             =cut
1409              
1410             sub purge_methylation_sites {
1411 23     23 1 25 my ($self) = shift;
1412 23         36 $self->{_methylation_sites} = {};
1413             }
1414              
1415             =head2 microbe
1416              
1417             Title : microbe
1418             Usage : $re->microbe($microbe);
1419             Function : Gets/Sets microorganism where the restriction enzyme was found
1420             Arguments : A scalar containing the microbes name
1421             Returns : A scalar containing the microbes name or 0 if not defined
1422              
1423             Added for compatibility to REBASE
1424              
1425             =cut
1426              
1427             sub microbe {
1428 2     2 1 3 my ($self, $microbe) = @_;
1429 2 100       6 if ($microbe) {
1430 1         2 $self->{_microbe}=$microbe;
1431             }
1432 2   50     9 return $self->{_microbe} || '';
1433              
1434             }
1435              
1436              
1437             =head2 source
1438              
1439             Title : source
1440             Usage : $re->source('Rob Edwards');
1441             Function : Gets/Sets the person who provided the enzyme
1442             Arguments : A scalar containing the persons name
1443             Returns : A scalar containing the persons name or 0 if not defined
1444              
1445             Added for compatibility to REBASE
1446              
1447             =cut
1448              
1449             sub source {
1450 7455     7455 1 6042 my ($self, $source) = @_;
1451 7455 100       9526 if ($source) {
1452 7454         8182 $self->{_source}=$source;
1453             }
1454 7455   50     10779 return $self->{_source} || '';
1455             }
1456              
1457              
1458             =head2 vendors
1459              
1460             Title : vendors
1461             Usage : $re->vendor(@list_of_companies);
1462             Function : Gets/Sets the a list of companies that you can get the enzyme from.
1463             Also sets the commercially_available boolean
1464             Arguments : A reference to an array containing the names of companies
1465             that you can get the enzyme from
1466             Returns : A reference to an array containing the names of companies
1467             that you can get the enzyme from
1468              
1469             Added for compatibility to REBASE
1470              
1471             =cut
1472              
1473             sub vendors {
1474 7472     7472 1 5304 my $self = shift;
1475 7472 100       8827 push @{$self->{_vendors}}, @_ if @_;
  7470         8212  
1476 7472 50       10169 if ($self->{'_vendors'}) {
1477 7472         4629 return @{$self->{'_vendors'}};
  7472         7087  
1478             }
1479             }
1480              
1481              
1482             =head2 purge_vendors
1483              
1484             Title : purge_vendors
1485             Usage : $re->purge_references();
1486             Function : Purges the set of references for this enzyme
1487             Arguments :
1488             Returns :
1489              
1490             =cut
1491              
1492             sub purge_vendors {
1493 1     1 1 2 my ($self) = shift;
1494 1         2 $self->{_vendors} = [];
1495              
1496             }
1497              
1498             =head2 vendor
1499              
1500             Title : vendor
1501             Usage : $re->vendor(@list_of_companies);
1502             Function : Gets/Sets the a list of companies that you can get the enzyme from.
1503             Also sets the commercially_available boolean
1504             Arguments : A reference to an array containing the names of companies
1505             that you can get the enzyme from
1506             Returns : A reference to an array containing the names of companies
1507             that you can get the enzyme from
1508              
1509             Added for compatibility to REBASE
1510              
1511             =cut
1512              
1513              
1514             sub vendor {
1515 1     1 1 1 my $self = shift;
1516 1         2 return push @{$self->{_vendors}}, @_;
  1         5  
1517 0         0 return $self->{_vendors};
1518             }
1519              
1520              
1521             =head2 references
1522              
1523             Title : references
1524             Usage : $re->references(string);
1525             Function : Gets/Sets the references for this enzyme
1526             Arguments : an array of string reference(s) (optional)
1527             Returns : an array of references
1528              
1529             Use L to reset the list of references
1530              
1531             This should be a L object, but its not (yet)
1532              
1533             =cut
1534              
1535             sub references {
1536 7472     7472 1 5942 my ($self) = shift;
1537 7472 100       8624 push @{$self->{_references}}, @_ if @_;
  7470         7685  
1538 7472         5740 return @{$self->{_references}};
  7472         7705  
1539             }
1540              
1541              
1542             =head2 purge_references
1543              
1544             Title : purge_references
1545             Usage : $re->purge_references();
1546             Function : Purges the set of references for this enzyme
1547             Arguments :
1548             Returns : 1
1549              
1550             =cut
1551              
1552             sub purge_references {
1553 1     1 1 1 my ($self) = shift;
1554 1         3 $self->{_references} = [];
1555              
1556             }
1557              
1558             =head2 clone
1559              
1560             Title : clone
1561             Usage : $re->clone
1562             Function : Deep copy of the object
1563             Arguments : -
1564             Returns : new Bio::Restriction::EnzymeI object
1565              
1566             This works as long as the object is a clean in-memory object using
1567             scalars, arrays and hashes. You have been warned.
1568              
1569             If you have module Storable, it is used, otherwise local code is used.
1570             Todo: local code cuts circular references.
1571              
1572             =cut
1573              
1574             # there's some issue here; deprecating and rolling another below/maj
1575              
1576             sub clone_depr {
1577 0     0 0 0 my ($self, $this) = @_;
1578              
1579 0         0 eval { require Storable; };
  0         0  
1580 0 0       0 return Storable::dclone($self) unless $@;
1581             # modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1582 0 0       0 unless ($this) {
1583 0         0 my $new;
1584 0         0 foreach my $k (keys %$self) {
1585 0 0       0 if (not ref $self->{$k}) {
1586 0         0 $new->{$k} = $self->{$k};
1587             } else {
1588 0         0 $new->{$k} = $self->clone($self->{$k});
1589             }
1590             #print Dumper $new;
1591             }
1592 0         0 bless $new, ref($self);
1593 0         0 return $new;
1594             }
1595 0 0       0 if (not ref $this) {
    0          
    0          
1596 0         0 $this;
1597             }
1598             elsif (ref $this eq "ARRAY") {
1599 0         0 [map $self->clone($_), @$this];
1600             }
1601             elsif (ref $this eq "HASH") {
1602 0         0 +{map { $_ => $self->clone($this->{$_}) } keys %$this};
  0         0  
1603             } else { # objects
1604 0 0       0 return if $this->isa('Bio::Restriction::EnzymeI');
1605 0 0       0 return $this->clone if $this->can('clone');
1606 0         0 my $obj;
1607 0         0 foreach my $k (keys %$this) {
1608 0 0       0 if (not ref $this->{$k}) {
1609 0         0 $obj->{$k} = $this->{$k};
1610             } else {
1611 0         0 $obj->{$k} = $this->clone($this->{$k});
1612             }
1613             }
1614 0         0 bless $obj, ref($this);
1615 0         0 return $obj;
1616             }
1617             }
1618              
1619             sub clone {
1620 1501     1501 1 919 my $self = shift;
1621 1501         1127 my ($this, $visited) = @_;
1622 1501 100       1781 unless (defined $this) {
1623 52         57 my %h;
1624 52         357 tie %h, 'Tie::RefHash';
1625 52         481 my $visited = \%h;
1626 52         121 return $self->clone($self, $visited);
1627             }
1628 1449         769 my $thing;
1629 1449         1191 for ($this) {
1630 1449 100       1754 if (ref) {
1631 475 100       1175 return $visited->{$this} if $visited->{$this};
1632             }
1633             # scalar
1634 1447 100       4213 (!ref) && do {
1635 974         733 $thing = $this;
1636 974         723 last;
1637             };
1638             # object
1639 473 100       901 (ref =~ /^Bio::/) && do {
1640 108         125 $thing = {};
1641 108         141 bless($thing, ref);
1642 108         337 $visited->{$this} = $thing;
1643 108         700 foreach my $attr (keys %{$_}) {
  108         359  
1644 1196 100       1825 $thing->{$attr} = (defined $_->{$attr} ? $self->clone($_->{$attr},$visited) : undef );
1645             }
1646 108         141 last;
1647             };
1648 365 100       583 (ref eq 'ARRAY') && do {
1649 311         278 $thing = [];
1650 311         616 $visited->{$this} = $thing;
1651 311         1434 foreach my $elt (@{$_}) {
  311         391  
1652 309 50       557 push @$thing, (defined $elt ? $self->clone($elt,$visited) : undef);
1653             }
1654 311         263 last;
1655             };
1656 54 50       146 (ref eq 'HASH') && do {
1657 54         87 $thing = {};
1658 54         124 $visited->{$this} = $thing;
1659 4     4   21 no warnings qw( uninitialized ); # avoid 'uninitialized value' warning against $key
  4         11  
  4         211  
1660 54         284 foreach my $key (%{$_}) {
  54         149  
1661 0 0       0 $thing->{$key} = (defined $_->{key} ? $self->clone( $_->{$key},$visited) : undef );
1662             }
1663 4     4   14 use warnings;
  4         7  
  4         837  
1664 54         68 last;
1665             };
1666 0 0       0 (ref eq 'SCALAR') && do {
1667 0         0 $thing = ${$_};
  0         0  
1668 0         0 $visited->{$this} = $thing;
1669 0         0 $thing = \$thing;
1670 0         0 last;
1671             };
1672             }
1673              
1674 1447         2269 return $thing;
1675             }
1676              
1677              
1678              
1679             =head2 _expand
1680              
1681             Title : _expand
1682             Function : Expand nucleotide ambiguity codes to their representative letters
1683             Returns : The full length string
1684             Arguments : The string to be expanded.
1685              
1686             Stolen from the original RestrictionEnzyme.pm
1687              
1688             =cut
1689              
1690              
1691             sub _expand {
1692 7774     7774   6236 my $str = shift;
1693              
1694 7774         20023 $str =~ s/N|X/\./g;
1695 7774         8372 $str =~ s/R/\[AG\]/g;
1696 7774         7220 $str =~ s/Y/\[CT\]/g;
1697 7774         5969 $str =~ s/S/\[GC\]/g;
1698 7774         6943 $str =~ s/W/\[AT\]/g;
1699 7774         6156 $str =~ s/M/\[AC\]/g;
1700 7774         5374 $str =~ s/K/\[TG\]/g;
1701 7774         5364 $str =~ s/B/\[CGT\]/g;
1702 7774         5134 $str =~ s/D/\[AGT\]/g;
1703 7774         5117 $str =~ s/H/\[ACT\]/g;
1704 7774         5090 $str =~ s/V/\[ACG\]/g;
1705              
1706 7774         8494 return $str;
1707             }
1708              
1709             1;
1710