File Coverage

Bio/Restriction/Analysis.pm
Criterion Covered Total %
statement 262 356 73.6
branch 102 170 60.0
condition 4 9 44.4
subroutine 23 30 76.6
pod 13 14 92.8
total 404 579 69.7


line stmt bran cond sub pod time code
1             #
2             # BioPerl module Bio::Restriction::Analysis
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Rob Edwards
7             #
8             # You may distribute this module under the same terms as perl itself
9              
10             ## POD Documentation:
11              
12             =head1 NAME
13              
14             Bio::Restriction::Analysis - cutting sequences with restriction
15             enzymes
16              
17             =head1 SYNOPSIS
18              
19             # analyze a DNA sequence for restriction enzymes
20             use Bio::Restriction::Analysis;
21             use Bio::PrimarySeq;
22             use Data::Dumper;
23              
24             # get a DNA sequence from somewhere
25             my $seq = Bio::PrimarySeq->new
26             (-seq =>'AGCTTAATTCATTAGCTCTGACTGCAACGGGCAATATGTCTC',
27             -primary_id => 'synopsis',
28             -molecule => 'dna');
29              
30             # now start an analysis.
31             # this is using the default set of enzymes
32             my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
33              
34             # find unique cutters. This returns a
35             # Bio::Restriction::EnzymeCollection object
36             my $enzymes = $ra->unique_cutters;
37             print "Unique cutters: ", join (', ',
38             map {$_->name} $enzymes->unique_cutters), "\n";
39              
40             # AluI is one them. Where does it cut?
41             # This is will return an array of the sequence strings
42              
43             my $enz = 'AluI';
44             my @frags = $ra->fragments($enz);
45             # how big are the fragments?
46             print "AluI fragment lengths: ", join(' & ', map {length $_} @frags), "\n";
47              
48             # You can also bypass fragments and call sizes directly:
49             # to see all the fragment sizes
50             print "All sizes: ", join " ", $ra->sizes($enz), "\n";
51             # to see all the fragment sizes sorted by size like on a gel
52             print "All sizes, sorted ", join (" ", $ra->sizes($enz, 0, 1)), "\n";
53              
54             # how many times does each enzyme cut
55             my $cuts = $ra->cuts_by_enzyme('BamHI');
56             print "BamHI cuts $cuts times\n";
57              
58             # How many enzymes do not cut at all?
59             print "There are ", scalar $ra->zero_cutters->each_enzyme,
60             " enzymes that do not cut\n";
61              
62             # what about enzymes that cut twice?
63             my $two_cutters = $ra->cutters(2);
64             print join (" ", map {$_->name} $two_cutters->each_enzyme),
65             " cut the sequence twice\n";
66              
67             # what are all the enzymes that cut, and how often do they cut
68             printf "\n%-10s%s\n", 'Enzyme', 'Number of Cuts';
69             my $all_cutters = $ra->cutters;
70             map {
71             printf "%-10s%s\n", $_->name, $ra->cuts_by_enzyme($_->name)
72             } $all_cutters->each_enzyme;
73              
74             # Finally, we can interact the restriction enzyme object by
75             # retrieving it from the collection object see the docs for
76             # Bio::Restriction::Enzyme.pm
77             my $enzobj = $enzymes->get_enzyme($enz);
78              
79              
80             =head1 DESCRIPTION
81              
82             Bio::Restriction::Analysis describes the results of cutting a DNA
83             sequence with restriction enzymes.
84              
85             To use this module you can pass a sequence object and optionally a
86             Bio::Restriction::EnzymeCollection that contains the enzyme(s) to cut the
87             sequences with. There is a default set of enzymes that will be loaded
88             if you do not pass in a Bio::Restriction::EnzymeCollection.
89              
90             To cut a sequence, set up a Restriction::Analysis object with a sequence
91             like this:
92              
93             use Bio::Restriction::Analysis;
94             my $ra = Bio::Restriction::Analysis->new(-seq=>$seqobj);
95              
96             or
97              
98             my $ra = Bio::Restriction::Analysis->new
99             (-seq=>$seqobj, -enzymes=>$enzs);
100              
101             Then, to get the fragments for a particular enzyme use this:
102              
103             @fragments = $ra->fragments('EcoRI');
104              
105             Note that the naming of restriction enzymes is that the last numbers
106             are usually Roman numbers (I, II, III, etc). You may want to use
107             something like this:
108              
109             # get a reference to an array of unique (single) cutters
110             $singles = $re->unique_cutters;
111             foreach my $enz ($singles->each_enzyme) {
112             @fragments = $re->fragments($enz);
113             ... do something here ...
114             }
115              
116             Note that if your sequence is circular, the first and last fragment
117             will be joined so that they are the appropriate length and sequence
118             for further analysis. This fragment will also be checked for cuts
119             by the enzyme(s). However, this will change the start of the
120             sequence!
121              
122             There are two separate algorithms used depending on whether your
123             enzyme has ambiguity. The non-ambiguous algorithm is a lot faster,
124             and if you are using very large sequences you should try and use
125             this algorithm. If you have a large sequence (e.g. genome) and
126             want to use ambgiuous enzymes you may want to make separate
127             Bio::Restriction::Enzyme objects for each of the possible
128             alternatives and make sure that you do not set is_ambiguous!
129              
130             This version should correctly deal with overlapping cut sites
131             in both ambiguous and non-ambiguous enzymes.
132              
133             I have tried to write this module with speed and memory in mind
134             so that it can be effectively used for large (e.g. genome sized)
135             sequence. This module only stores the cut positions internally,
136             and calculates everything else on an as-needed basis. Therefore
137             when you call fragment_maps (for example), there may be another
138             delay while these are generated.
139              
140             =head1 FEEDBACK
141              
142             =head2 Mailing Lists
143              
144             User feedback is an integral part of the evolution of this and other
145             Bioperl modules. Send your comments and suggestions preferably to one
146             of the Bioperl mailing lists. Your participation is much appreciated.
147              
148             bioperl-l@bioperl.org - General discussion
149             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
150              
151             =head2 Support
152              
153             Please direct usage questions or support issues to the mailing list:
154              
155             I
156              
157             rather than to the module maintainer directly. Many experienced and
158             reponsive experts will be able look at the problem and quickly
159             address it. Please include a thorough description of the problem
160             with code and data examples if at all possible.
161              
162             =head2 Reporting Bugs
163              
164             Report bugs to the Bioperl bug tracking system to help us keep track
165             the bugs and their resolution. Bug reports can be submitted via the
166             web:
167              
168             https://github.com/bioperl/bioperl-live/issues
169              
170             =head1 AUTHOR
171              
172             Rob Edwards, redwards@utmem.edu,
173             Steve Chervitz, sac@bioperl.org
174              
175             =head1 CONTRIBUTORS
176              
177             Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
178             Mark A. Jensen, maj-at-fortinbras-dot-us
179              
180             =head1 COPYRIGHT
181              
182             Copyright (c) 2003 Rob Edwards. Some of this work is Copyright (c)
183             1997-2002 Steve A. Chervitz. All Rights Reserved.
184              
185             This module is free software; you can redistribute it and/or modify it
186             under the same terms as Perl itself.
187              
188             =head1 SEE ALSO
189              
190             L,
191             L
192              
193             =head1 APPENDIX
194              
195             Methods beginning with a leading underscore are considered private and
196             are intended for internal use by this module. They are not considered
197             part of the public interface and are described here for documentation
198             purposes only.
199              
200             =cut
201              
202             package Bio::Restriction::Analysis;
203 3     3   3348 use Bio::Restriction::EnzymeCollection;
  3         8  
  3         89  
204 3     3   14 use strict;
  3         3  
  3         62  
205 3     3   12 use Data::Dumper;
  3         5  
  3         190  
206              
207 3     3   13 use base qw(Bio::Root::Root);
  3         5  
  3         228  
208 3     3   14 use Scalar::Util qw(blessed);
  3         5  
  3         8269  
209              
210             =head1 new
211              
212             Title : new
213             Function : Initializes the restriction enzyme object
214             Returns : The Restriction::Analysis object
215             Arguments :
216              
217             $re_anal->new(-seq=$seqobj,
218             -enzymes=>Restriction::EnzymeCollection object)
219             -seq requires a Bio::PrimarySeq object
220             -enzymes is optional.
221             If omitted it will use the default set of enzymes
222              
223             This is the place to start. Pass in a sequence, and you will be able
224             to get the fragments back out. Several other things are available
225             like the number of zero cutters or single cutters.
226              
227             =cut
228              
229             sub new {
230 4     4 1 802 my($class, @args) = @_;
231 4         21 my $self = $class->SUPER::new(@args);
232 4         20 my ($seq,$enzymes) =
233             $self->_rearrange([qw(
234             SEQ
235             ENZYMES
236             )], @args);
237              
238 4 50       18 $seq && $self->seq($seq);
239              
240             $enzymes ? $self->enzymes($enzymes)
241 4 100       24 : ($self->{'_enzymes'} = Bio::Restriction::EnzymeCollection->new );
242              
243             # keep track of status
244 4         9 $self->{'_cut'} = 0;
245            
246             # left these here because we want to reforce a _cut if someone
247             # just calls new
248 4         10 $self->{maximum_cuts} = 0;
249              
250 4         9 $self->{'_number_of_cuts_by_enzyme'} = {};
251 4         8 $self->{'_number_of_cuts_by_cuts'} = {};
252 4         11 $self->{'_fragments'} = {};
253 4         9 $self->{'_cut_positions'} = {}; # cut position is the real position
254 4         7 $self->{'_frag_map_list'} = {};
255              
256 4         21 return $self;
257              
258             }
259              
260             =head1 Methods to set parameters
261              
262             =cut
263              
264             =head2 seq
265              
266             Title : seq
267             Usage : $ranalysis->seq($newval);
268             Function : get/set method for the sequence to be cut
269             Example : $re->seq($seq);
270             Returns : value of seq
271             Args : A Bio::PrimarySeqI dna object (optional)
272              
273             =cut
274              
275             sub seq {
276 16     16 1 26 my $self = shift;
277 16 100       41 if (@_) {
278 6         8 my $seq = shift;
279 6 50       27 $self->throw('Need a sequence object ['. ref $seq. ']')
280             unless $seq->isa('Bio::PrimarySeqI');
281 6 50       16 $self->throw('Need a DNA sequence object ['. $seq->alphabet. ']')
282             unless $seq->alphabet eq 'dna';
283              
284 6         38 $self->{'_seq'} = $seq;
285 6         10 $self->{'_cut'} = 0;
286             }
287 16         38 return $self->{'_seq'};
288             }
289              
290             =head2 enzymes
291              
292             Title : enzymes
293             Usage : $re->enzymes($newval)
294             Function : gets/Set the restriction enzyme enzymes
295             Example : $re->enzymes('EcoRI')
296             Returns : reference to the collection
297             Args : an array of Bio::Restriction::EnzymeCollection and/or
298             Bio::Restriction::Enzyme objects
299              
300              
301             The default object for this method is
302             Bio::Restriction::EnzymeCollection. However, you can also pass it a
303             list of Bio::Restriction::Enzyme objects - even mixed with Collection
304             objects. They will all be stored into one collection.
305              
306             =cut
307              
308             sub enzymes {
309 2     2 1 5 my $self = shift;
310 2 50       5 if (@_) {
311             $self->{'_enzymes'} = Bio::Restriction::EnzymeCollection->new (-empty => 1)
312 2 50       19 unless $self->{'_enzymes'};
313 2         7 $self->{'_enzymes'}->enzymes(@_);
314 2         3 $self->{'_cut'} = 0;
315             }
316 2         4 return $self->{'_enzymes'};
317             }
318              
319              
320             =head1 Perform the analysis
321              
322             =cut
323              
324             =head2 cut
325              
326             Title : cut
327             Usage : $re->cut()
328             Function : Cut the sequence with the enzymes
329             Example : $re->cut(); $re->cut('single'); or $re->cut('multiple', $enzymecollection);
330             Returns : $self
331             Args : 'single' (optional), 'multiple' with enzyme collection.
332              
333             An explicit cut method is needed to pass arguments to it.
334              
335             There are two varieties of cut. Single is the default, and need
336             not be explicitly called. This cuts the sequence with each
337             enzyme separately.
338              
339             Multiple cuts a sequence with more than one enzyme. You must pass
340             it a Bio::Restriction::EnzymeCollection object of the set
341             of enzymes that you want to use in the double digest. The results
342             will be stored as an enzyme named "multiple_digest", so you can
343             use all the retrieval methods to get the data.
344              
345             If you want to use the default setting there is no need to call cut
346             directly. Every method in the class that needs output checks the
347             object's internal status and recalculates the cuts if needed.
348              
349             Note: cut doesn't now re-initialize everything before figuring
350             out cuts. This is so that you can do multiple digests, or add more
351             data or whatever. You'll have to use new to reset everything.
352              
353             See also the comments in above about ambiguous and non-ambiguous
354             sequences.
355              
356             =cut
357              
358             sub cut {
359 10     10 1 19 my ($self, $opt, $ec) = @_;
360              
361             # for the moment I have left this as a separate routine so
362             # the user calls cut rather than _cuts. This also initializes
363             # some stuff we need to use.
364            
365 10 50       26 $self->throw("A sequence must be supplied")
366             unless $self->seq;
367              
368 10 100 66     41 if ($opt && uc($opt) eq "MULTIPLE") {
369 3 50       10 $self->throw("You must supply a separate enzyme collection for multiple digests") unless $ec;
370 3         12 $self->_multiple_cuts($ec); # multiple digests
371             } else {
372             # reset some of the things that we save
373 7         13 $self->{maximum_cuts} = 0;
374 7         14 $self->{'_number_of_cuts_by_enzyme'} = {};
375 7         596 $self->{'_number_of_cuts_by_cuts'} = {};
376 7         160 $self->{'_fragments'} = {};
377 7         14 $self->{'_cut_positions'} = {}; # cut position is the real position
378 7         571 $self->{'_frag_map_list'} = {};
379 7         29 $self->_cuts;
380             }
381            
382 10         819 $self->{'_cut'} = 1;
383 10         32 return $self;
384             }
385              
386             =head2 mulitple_digest
387              
388             Title : multiple_digest
389             Function : perform a multiple digest on a sequence
390             Returns : $self so you can go and get any of the other methods
391             Arguments : An enzyme collection
392              
393             Multiple digests can use 1 or more enzymes, and the data is stored
394             in as if it were an enzyme called multiple_digest. You can then
395             retrieve information about multiple digests from any of the other
396             methods.
397              
398             You can use this method in place of $re->cut('multiple', $enz_coll);
399              
400             =cut
401              
402             sub multiple_digest {
403 0     0 0 0 my ($self, $ec)=@_;
404 0         0 return $self->cut('multiple', $ec);
405             }
406              
407             =head1 Query the results of the analysis
408              
409             =cut
410              
411             =head2 positions
412              
413             Title : positions
414             Function : Retrieve the positions that an enzyme cuts at
415             Returns : An array of the positions that an enzyme cuts at
416             : or an empty array if the enzyme doesn't cut
417             Arguments: An enzyme name to retrieve the positions for
418             Comments : The cut occurs after the base specified.
419              
420             =cut
421              
422             sub positions {
423 8     8 1 293 my ($self, $enz) = @_;
424 8 50       24 $self->cut unless $self->{'_cut'};
425 8 50       17 $self->throw('no enzyme selected to get positions for')
426             unless $enz;
427              
428             return defined $self->{'_cut_positions'}->{$enz} ?
429 8 50       21 @{$self->{'_cut_positions'}->{$enz}} :
  8         40  
430             ();
431             }
432              
433             =head2 fragments
434              
435             Title : fragments
436             Function : Retrieve the fragments that we cut
437             Returns : An array of the fragments retrieved.
438             Arguments: An enzyme name to retrieve the fragments for
439              
440             For example this code will retrieve the fragments for all enzymes that
441             cut your sequence
442              
443             my $all_cutters = $analysis->cutters;
444             foreach my $enz ($$all_cutters->each_enzyme}) {
445             @fragments=$analysis->fragments($enz);
446             }
447              
448             =cut
449              
450             sub fragments {
451 18     18 1 4143 my ($self, $enz) = @_;
452 18 100       58 $self->cut unless $self->{'_cut'};
453 18 50       35 $self->throw('no enzyme selected to get fragments for')
454             unless $enz;
455 18         19 my @fragments;
456 18         40 for ($self->fragment_maps($enz)) {push @fragments, $_->{seq}}
  46         53  
457 18         81 return @fragments;
458             }
459              
460             =head2 fragment_maps
461              
462             Title : fragment_maps
463             Function : Retrieves fragment sequences with start and end
464             points. Useful for feature construction.
465              
466             Returns : An array containing a hash reference for each fragment,
467             containing the start point, end point and DNA
468             sequence. The hash keys are 'start', 'end' and
469             'seq'. Returns an empty array if not defined.
470              
471             Arguments : An enzyme name, enzyme object,
472             or enzyme collection to retrieve the fragments for.
473              
474             If passes an enzyme collection it will return the result of a multiple
475             digest. This : will also cause the special enzyme 'multiple_digest' to
476             be created so you can get : other information about this multiple
477             digest. (TMTOWTDI).
478              
479             There is a minor problem with this and $self-Efragments that I
480             haven't got a good answer for (at the moment). If the sequence is not
481             cut, do we return undef, or the whole sequence?
482              
483             For linear fragments it would be good to return the whole
484             sequence. For circular fragments I am not sure.
485              
486             At the moment it returns the whole sequence with start of 1 and end of
487             length of the sequence. For example:
488              
489             use Bio::Restriction::Analysis;
490             use Bio::Restriction::EnzymeCollection;
491             use Bio::PrimarySeq;
492              
493             my $seq = Bio::PrimarySeq->new
494             (-seq =>'AGCTTAATTCATTAGCTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATCCAAAAAAGAGTGAGCTTCTGAT',
495             -primary_id => 'synopsis',
496             -molecule => 'dna');
497              
498             my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
499              
500             my @gel;
501             my @bam_maps = $ra->fragment_maps('BamHI');
502             foreach my $i (@bam_maps) {
503             my $start = $i->{start};
504             my $end = $i->{end};
505             my $sequence = $i->{seq};
506             push @gel, "$start--$sequence--$end";
507             @gel = sort {length $b <=> length $a} @gel;
508             }
509             print join("\n", @gel) . "\n";
510              
511             =cut
512              
513             sub fragment_maps {
514 21     21 1 24 my ($self, $enz) = @_;
515 21 100       42 $self->cut unless $self->{'_cut'};
516 21 50       40 $self->throw('no enzyme selected to get fragment maps for')
517             unless $enz;
518              
519             # we are going to generate this on an as-needed basis rather than
520             # for every enzyme this should cut down on the amount of
521             # duplicated data we are trying to save in memory and make this
522             # faster and easier for large sequences, e.g. genome analysis
523            
524 21         19 my @cut_positions;
525 21 100 66     138 if (ref $enz eq '' && exists $self->{'_cut_positions'}->{$enz}) {
    100          
    100          
526 13         8 @cut_positions=@{$self->{'_cut_positions'}->{$enz}};
  13         32  
527             } elsif ($enz->isa("Bio::Restriction::EnzymeI")) {
528 5         4 @cut_positions=@{$self->{'_cut_positions'}->{$enz->name}};
  5         13  
529             } elsif ($enz->isa("Bio::Restriction::EnzymeCollection")) {
530 2         7 $self->cut('multiple', $enz);
531 2         2 @cut_positions=@{$self->{'_cut_positions'}->{'multiple_digest'}};
  2         8  
532             }
533              
534 21 100       43 unless (defined($cut_positions[0])) {
535             # it doesn't cut
536             # return the whole sequence
537             # this should probably have the is_circular command
538             my %map=(
539             'start' => 1,
540             'end' => $self->{'_seq'}->length,
541 4         16 'seq' => $self->{'_seq'}->seq
542             );
543 4         6 push (@{$self->{'_frag_map_list'}->{$enz}}, \%map);
  4         13  
544             return defined $self->{'_frag_map_list'}->{$enz} ?
545 4 50       10 @{$self->{'_frag_map_list'}->{$enz}} : ();
  4         12  
546             }
547              
548 17         38 @cut_positions=sort {$a <=> $b} @cut_positions;
  109         75  
549 17         24 push my @cuts, $cut_positions[0];
550 17         23 foreach my $i (@cut_positions) {
551 72 100       111 push @cuts, $i if $i != $cuts[$#cuts];
552             }
553              
554 17         18 my $start=1; my $stop; my %seq; my %stop;
  17         16  
  0         0  
555 17         22 foreach $stop (@cuts) {
556 67 100       83 next if !$stop; # cuts at beginning of sequence
557 66         124 $seq{$start}=$self->{'_seq'}->subseq($start, $stop);
558 66         65 $stop{$start}=$stop;
559 66         63 $start=$stop+1;
560             }
561 17         34 $stop=$self->{'_seq'}->length;
562 17 50       31 if ($start > $stop) {
563             # borderline case. The enzyme cleaved at the end of the sequence
564             # what do I do now?
565             }
566             else {
567 17         31 $seq{$start}=$self->{'_seq'}->subseq($start, $stop);
568 17         25 $stop{$start}=$stop;
569             }
570              
571 17 100       39 if ($self->{'_seq'}->is_circular) {
572             # join the first and last fragments
573 6         11 $seq{$start}.=$seq{'1'};
574 6         11 delete $seq{'1'};
575 6         7 $stop{$start}=$stop{'1'};
576 6         6 delete $stop{'1'};
577             }
578              
579 17         56 foreach my $start (sort {$a <=> $b} keys %seq) {
  133         117  
580             my %map=(
581             'start' => $start,
582             'end' => $stop{$start},
583 77         162 'seq' => $seq{$start}
584             );
585 77         49 push (@{$self->{'_frag_map_list'}->{$enz}}, \%map);
  77         145  
586             }
587              
588             return defined $self->{'_frag_map_list'}->{$enz} ?
589 17 50       65 @{$self->{'_frag_map_list'}->{$enz}} : ();
  17         86  
590             }
591              
592              
593             =head2 sizes
594              
595             Title : sizes
596             Function : Retrieves an array with the sizes of the fragments
597             Returns : Array that has the sizes of the fragments ordered from
598             largest to smallest like they would appear in a gel.
599             Arguments: An enzyme name to retrieve the sizes for is required and
600             kilobases to the nearest 0.1 kb, else it will be in
601             bp. If the optional third entry is set the results will
602             be sorted.
603              
604             This is designed to make it easy to see what fragments you should get
605             on a gel!
606              
607             You should be able to do these:
608              
609             # to see all the fragment sizes,
610             print join "\n", $re->sizes($enz), "\n";
611             # to see all the fragment sizes sorted
612             print join "\n", $re->sizes($enz, 0, 1), "\n";
613             # to see all the fragment sizes in kb sorted
614             print join "\n", $re->sizes($enz, 1, 1), "\n";
615              
616             =cut
617              
618             sub sizes {
619 3     3 1 9 my ($self, $enz, $kb, $sort) = @_;
620 3 50       6 $self->throw('no enzyme selected to get fragments for')
621             unless $enz;
622            
623 3 100       12 if (blessed($enz)) {
624 1 50       4 $self->throw("Enzyme must be enzyme name or a Bio::Restriction::EnzymeI, not ".ref($enz))
625             if !$enz->isa('Bio::Restriction::EnzymeI');
626 1         4 $enz = $enz->name;
627             }
628 3 50       5 $self->cut unless $self->{'_cut'};
629 3         4 my @frag; my $lastsite=0;
  3         4  
630              
631 3         3 foreach my $site (@{$self->{'_cut_positions'}->{$enz}}) {
  3         7  
632 9 50       14 $kb ? push (@frag, (int($site-($lastsite))/100)/10)
633             : push (@frag, $site-($lastsite));
634 9         9 $lastsite=$site;
635             }
636             $kb ? push (@frag, (int($self->{'_seq'}->length-($lastsite))/100)/10)
637 3 50       11 : push (@frag, $self->{'_seq'}->length-($lastsite));
638 3 100       8 if ($self->{'_seq'}->is_circular) {
639 1         2 my $first=shift @frag;
640 1         2 my $last=pop @frag;
641 1         4 push @frag, ($first+$last);
642             }
643 3 50       5 $sort ? @frag = sort {$b <=> $a} @frag : 1;
  0         0  
644              
645 3         14 return @frag;
646             }
647              
648             =head1 How many times does enzymes X cut?
649              
650             =cut
651              
652             =head2 cuts_by_enzyme
653              
654             Title : cuts_by_enzyme
655             Function : Return the number of cuts for an enzyme
656             Returns : An integer with the number of times each enzyme cuts.
657             Returns 0 if doesn't cut or undef if not defined
658             Arguments : An enzyme name string
659              
660              
661             =cut
662              
663             sub cuts_by_enzyme {
664 4     4 1 8 my ($self, $enz)=@_;
665              
666 4 50       10 $self->throw("Need an enzyme name")
667             unless defined $enz;
668 4 50       7 $self->cut unless $self->{'_cut'};
669 4         14 return $self->{'_number_of_cuts_by_enzyme'}->{$enz};
670             }
671              
672             =head1 Which enzymes cut the sequence N times?
673              
674             =cut
675              
676             =head2 cutters
677              
678             Title : cutters
679             Function : Find enzymes that cut a given number of times
680             Returns : a Bio::Restriction::EnzymeCollection
681             Arguments : 1. exact time or lower limit,
682             non-negative integer, optional
683             2. upper limit, non-negative integer,
684             larger or equalthan first, optional
685              
686              
687             If no arguments are given, the method returns all enzymes that do cut
688             the sequence. The argument zero, '0', is same as method
689             zero_cutters(). The argument one, '1', corresponds to unique_cutters.
690             If either of the limits is larger than number of cuts any enzyme cuts the
691             sequence, the that limit is automagically lowered. The method max_cuts()
692             gives the largest number of cuts.
693              
694             See Also : L,
695             L, L
696              
697             =cut
698              
699             sub cutters {
700 5     5 1 6 my ($self, $a, $z) = @_;
701              
702 5 100       15 $self->cut unless $self->{'_cut'};
703              
704 5         3 my ($start, $end);
705 5 100       11 if (defined $a) {
706 4 50       21 $self->throw("Need a non-zero integer [$a]")
707             unless $a =~ /^[+]?\d+$/;
708 4         4 $start = $a;
709             } else {
710 1         1 $start = 1;
711             }
712 5 50       10 $start = $self->{'maximum_cuts'} if $start > $self->{'maximum_cuts'};
713              
714 5 50       10 if (defined $z) {
    100          
715 0 0 0     0 $self->throw("Need a non-zero integer no smaller than start [0]")
716             unless $z =~ /^[+]?\d+$/ and $z >= $a;
717 0         0 $end = $z;
718             }
719             elsif (defined $a) {
720 4         5 $end = $start;
721             } else {
722 1         1 $end = $self->{'maximum_cuts'};
723             }
724 5 50       10 $end = $self->{'maximum_cuts'} if $end > $self->{'maximum_cuts'};
725 5         21 my $set = Bio::Restriction::EnzymeCollection->new(-empty => 1);
726              
727             #return an empty set if nothing cuts
728 5 50       11 return $set unless $self->{'maximum_cuts'};
729              
730 5         9 for (my $i=$start; $i<=$end; $i++) {
731 8         27 $set->enzymes( @{$self->{_number_of_cuts_by_cuts}->{$i}} )
732 13 100       25 if defined $self->{_number_of_cuts_by_cuts}->{$i};
733             }
734              
735 5         17 return $set;
736             }
737              
738              
739             =head2 unique_cutters
740              
741             Title : unique_cutters
742             Function : A special case if cutters() where enzymes only cut once
743             Returns : a Bio::Restriction::EnzymeCollection
744             Arguments : -
745              
746              
747             See also: L, L
748              
749             =cut
750              
751             sub unique_cutters {
752 2     2 1 8 shift->cutters(1);
753             }
754              
755             =head2 zero_cutters
756              
757             Title : zero_cutters
758             Function : A special case if cutters() where enzymes don't cut the sequence
759             Returns : a Bio::Restriction::EnzymeCollection
760             Arguments : -
761              
762             See also: L, L
763              
764             =cut
765              
766             sub zero_cutters {
767 1     1 1 3 shift->cutters(0);
768             }
769              
770             =head2 max_cuts
771              
772             Title : max_cuts
773             Function : Find the most number of cuts
774             Returns : The number of times the enzyme that cuts most cuts.
775             Arguments : None
776              
777             This is not a very practical method, but if you are curious...
778              
779             =cut
780              
781 1     1 1 5 sub max_cuts { return shift->{maximum_cuts} }
782              
783             =head1 Internal methods
784              
785             =cut
786              
787             =head2 _cuts
788              
789             Title : _cuts
790             Function : Figures out which enzymes we know about and cuts the sequence.
791             Returns : Nothing.
792             Arguments : None.
793             Comments : An internal method. This will figure out where the sequence
794             should be cut, and provide the appropriate results.
795              
796             =cut
797              
798             sub _cuts {
799 7     7   15 my $self = shift;
800              
801 7         26 my $target_seq=uc $self->{'_seq'}->seq; # I have been burned on this before :)
802              
803              
804             # first, find out all the enzymes that we have
805 7         26 foreach my $enz ($self->{'_enzymes'}->each_enzyme) {
806 9575         5955 my @all_cuts;
807 9575 100       19912 my @others = $enz->others if $enz->can("others");
808 9575         7513 foreach my $enzyme ($enz, @others) {
809             # cut the sequence
810             # _make_cuts handles all cases (amibiguous, non-ambiguous) X
811             # (palindromic X non-palindromic)
812             #
813 9641         9245 my $cut_positions = $self->_make_cuts($target_seq, $enzyme);
814              
815 9641         7212 push @all_cuts, @$cut_positions;
816              
817             #### need to refactor circular handling....
818             ####
819              
820             # deal with is_circular sequences
821 9641 100       16413 if ($self->{'_seq'}->is_circular) {
822 4286         4903 $cut_positions=$self->_circular($target_seq, $enzyme);
823 4286         3869 push @all_cuts, @$cut_positions;
824             }
825             # non-symmetric cutters (most external cutters, e.g.) need
826             # special handling
827 9641 100       12834 unless ($enzyme->is_symmetric) {
828             # do all of above with explicit use of the
829             # enzyme's 'complementary_cut'...
830              
831 868         1117 $cut_positions = $self->_make_cuts($target_seq, $enzyme, 'COMP');
832 868         734 push @all_cuts, @$cut_positions;
833             # deal with is_circular sequences
834 868 100       1516 if ($self->{'_seq'}->is_circular) {
835 432         539 $cut_positions=$self->_circular($target_seq, $enzyme, 'COMP');
836 432         666 push @all_cuts, @$cut_positions;
837             }
838             }
839             }
840              
841 9575 100       9971 if (defined $all_cuts[0]) {
842             # now just remove any duplicate cut sites
843 1717         2090 @all_cuts = sort {$a <=> $b} @all_cuts;
  1350         1165  
844 1717         1022 push @{$self->{'_cut_positions'}->{$enz->name}}, $all_cuts[0];
  1717         2357  
845 1717         1926 foreach my $i (@all_cuts) {
846 565         683 push @{$self->{'_cut_positions'}->{$enz->name}}, $i
847 2636 100       1675 if $i != ${$self->{'_cut_positions'}->{$enz->name}}[$#{$self->{'_cut_positions'}->{$enz->name}}];
  2636         3059  
  2636         3125  
848             }
849             } else {
850             # this just fixes an eror when @all_cuts is not defined!
851 7858         5059 @{$self->{'_cut_positions'}->{$enz->name}}=();
  7858         9887  
852             }
853              
854             # note I have removed saving any other information except the
855             # cut_positions this should significantly decrease the amount
856             # of memory that is required for large sequences. It should
857             # also speed things up dramatically, because fragments and
858             # fragment maps are only calculated for those enzymes they are
859             # needed for.
860            
861             # finally, save minimal information about each enzyme
862 9575         6916 my $number_of_cuts=scalar @{$self->{'_cut_positions'}->{$enz->name}};
  9575         10716  
863             # now just store the number of cuts
864 9575         12267 $self->{_number_of_cuts_by_enzyme}->{$enz->name}=$number_of_cuts;
865 9575         6706 push (@{$self->{_number_of_cuts_by_cuts}->{$number_of_cuts}}, $enz);
  9575         11543  
866 9575 100       15339 if ($number_of_cuts > $self->{maximum_cuts}) {
867 24         36 $self->{maximum_cuts}=$number_of_cuts;
868             }
869              
870             }
871             }
872              
873             =head2 _enzyme_sites
874              
875             Title : _enzyme_sites
876             Function : An internal method to figure out the two sides of an enzyme
877             Returns : The sequence before the cut and the sequence after the cut
878             Arguments : A Bio::Restriction::Enzyme object,
879             $comp : boolean, calculate based on $enz->complementary_cut()
880             if true, $enz->cut() if false
881             Status : NOW DEPRECATED - maj
882              
883             =cut
884              
885             sub _enzyme_sites {
886 0     0   0 my ($self, $enz, $comp )=@_;
887             # get the cut site
888             # I have reworked this so that it uses $enz->cut to get the site
889              
890 0 0       0 my $site= ( $comp ? $enz->complementary_cut : $enz->cut );
891             # split it into the two fragments for the sequence before and after.
892 0 0       0 $site=0 unless defined $site;
893              
894             # the default values just stop an error from an undefined
895             # string. But they don't affect the split.
896 0         0 my ($beforeseq, $afterseq)= ('.', '.');
897              
898             # extra-site cutting
899             # the before seq is going to be the entire site
900             # the after seq is empty
901             # BUT, need to communicate how to cut within the sample sequence
902             # relative to the end of the site (do through $enz->cut), and
903             # ALSO, need to check length of sample seq so that if cut falls
904             # outside the input sequence, we have a warning/throw. /maj
905              
906             # pre-site cutting
907             # need to handle negative site numbers
908              
909 0 0       0 if ($site <= 0) { # <= to handle pre-site cutting
    0          
910 0         0 $afterseq=$enz->string;
911             }
912             elsif ($site >= $enz->seq->length) { # >= to handle extrasite cutters/maj
913 0         0 $beforeseq=$enz->string;
914             }
915             else { # $site < $enz->seq->length
916 0         0 $beforeseq=$enz->seq->subseq(1, $site);
917 0         0 $afterseq=$enz->seq->subseq($site+1, $enz->seq->length);
918             }
919             # if the enzyme is ambiguous we need to convert this into a perl string
920 0 0       0 if ($enz->is_ambiguous) {
921 0         0 $beforeseq=$self->_expanded_string($beforeseq);
922 0         0 $afterseq =$self->_expanded_string($afterseq);
923             }
924              
925 0         0 return ($beforeseq, $afterseq);
926             }
927              
928              
929             =head2 _non_pal_enz
930              
931             Title : _non_pal_enz
932             Function : Analyses non_palindromic enzymes for cuts in both ways
933             (in fact, delivers only minus strand cut positions in the
934             plus strand coordinates/maj)
935             Returns : A reference to an array of cut positions
936             Arguments: The sequence to check and the enzyme object
937             NOW DEPRECATED/maj
938              
939             =cut
940              
941             sub _non_pal_enz {
942 0     0   0 my ($self, $target_seq, $enz) =@_;
943             # add support for non-palindromic sequences
944             # the enzyme is not the same forwards and backwards
945              
946 0         0 my $site=$enz->complementary_cut;
947             # complementary_cut is in plus strand coordinates
948              
949             # we are going to rc the sequence, so complementary_cut becomes length-complementary_cut
950              
951             # I think this is wrong; cut sites are a matter of position with respect
952             # to the plus strand: the recognition site is double stranded and
953             # directly identifiable on the plus strand sequence. /maj
954              
955             # what really needs doing is to keep track of plus strand and minus strand
956             # nicks separately./maj
957              
958 0         0 my ($beforeseq, $afterseq)=('.', '.');
959              
960             # now, for extra-site cuts, $site > length...so...?/maj
961 0         0 my $new_left_cut=$enz->seq->length-$site;
962             # there is a problem when this is actually zero
963              
964 0 0       0 if ($new_left_cut == 0) {$afterseq=$enz->seq->revcom->seq}
  0 0       0  
965 0         0 elsif ($new_left_cut == $enz->seq->length) {$beforeseq=$enz->seq->revcom->seq}
966             else {
967             # this can't be right./maj
968 0         0 $beforeseq=$enz->seq->revcom->subseq(1, ($enz->seq->length-$site));
969 0         0 $afterseq=$enz->seq->revcom->subseq(($enz->seq->length-$site), $enz->seq->length);
970             }
971            
972             # do this correctly, in the context of the current code design,
973             # by providing a "complement" argument to _ambig_cuts and _nonambig_cuts,
974             # use these explicitly rather than this wrapper./maj
975              
976 0         0 my $results=[];
977 0 0       0 if ($enz->is_ambiguous) {
978 0         0 $results= $self->_ambig_cuts($beforeseq, $afterseq, $target_seq, $enz);
979             } else {
980 0         0 $results= $self->_nonambig_cuts($beforeseq, $afterseq, $target_seq, $enz);
981             }
982              
983             # deal with is_circular
984 0         0 my $more_results=[];
985             $more_results=$self->_circular($beforeseq, $afterseq, $enz)
986 0 0       0 if ($self->{'_seq'}->is_circular);
987              
988 0         0 return [@$more_results, @$results];
989             }
990              
991             =head2 _ambig_cuts
992              
993             Title : _ambig_cuts
994             Function : An internal method to localize the cuts in the sequence
995             Returns : A reference to an array of cut positions
996             Arguments : The separated enzyme site, the target sequence, and the enzyme object
997             Comments : This is a slow implementation but works for ambiguous sequences.
998             Whenever possible, _nonambig_cuts should be used as it is a lot faster.
999              
1000             =cut
1001              
1002             # we have problems here when the cut is extrasite: $beforeseq/$afterseq do
1003             # not define the cut site then! I am renaming this to _ambig_cuts_depr,
1004             # providing a more compact method that correctly handles extrasite cuts
1005             # below /maj
1006              
1007             sub _ambig_cuts_depr {
1008 0     0   0 my ($self, $beforeseq, $afterseq, $target_seq, $enz) = @_;
1009            
1010             # cut the sequence. This is done with split so we can use
1011             # regexp.
1012 0         0 $target_seq = uc $target_seq;
1013 0         0 my @cuts = split /($beforeseq)($afterseq)/i, $target_seq;
1014             # now the array has extra elements --- the before and after!
1015             # we have:
1016             # element 0 sequence
1017             # element 1 3' end
1018             # element 2 5' end of next sequence
1019             # element 3 sequence
1020             # ....
1021              
1022             # we need to loop through the array and add the ends to the
1023             # appropriate parts of the sequence
1024              
1025 0         0 my $i=0;
1026 0         0 my @re_frags;
1027 0 0       0 if ($#cuts) { # there is >1 element
1028 0         0 while ($i<$#cuts) {
1029 0         0 my $joinedseq;
1030             # the first sequence is a special case
1031 0 0       0 if ($i == 0) {
1032 0         0 $joinedseq=$cuts[$i].$cuts[$i+1];
1033             } else {
1034 0         0 $joinedseq=$cuts[$i-1].$cuts[$i].$cuts[$i+1];
1035             }
1036             # now deal with overlapping sequences
1037             # we can do this through a regular regexp as we only
1038             # have a short fragment to look through
1039              
1040 0         0 while ($joinedseq =~ /$beforeseq$afterseq/) {
1041 0         0 $joinedseq =~ s/^(.*?$beforeseq)($afterseq)/$2/;
1042 0         0 push @re_frags, $1;
1043             }
1044 0         0 push @re_frags, $joinedseq;
1045 0         0 $i+=3;
1046             }
1047              
1048             # I don't think we want the last fragment in. It is messing up the _circular
1049             # part of things. So I deleted this part of the code :)
1050              
1051             } else {
1052             # if we don't cut, leave the array empty
1053 0         0 return [];
1054             } # the sequence was not cut.
1055              
1056             # now @re_frags has the fragments of all the sequences
1057             # but some people want to have this return the lengths
1058             # of the fragments.
1059              
1060             # in theory the actual cut sites should be the length
1061             # of the fragments in @re_frags
1062              
1063             # note, that now this is the only data that we are saving. We
1064             # will have to go back add regenerate re_frags. The reason is
1065             # that we can use this in _circular easier
1066              
1067 0         0 my @cut_positions = map {length($_)} @re_frags;
  0         0  
1068              
1069             # the cut positions are right now the lengths of the sequence, but
1070             # we need to add them all onto each other
1071              
1072 0         0 for (my $i=1; $i<=$#cut_positions; $i++) {
1073 0         0 $cut_positions[$i]+=$cut_positions[$i-1];
1074             }
1075              
1076             # in one of those oddities in life, 2 fragments mean an enzyme cut once
1077             # so $#re_frags is the number of cuts
1078 0         0 return \@cut_positions;
1079             }
1080              
1081             # new version/maj
1082              
1083             sub _ambig_cuts {
1084 0     0   0 my ($self, $before, $after, $target, $enz, $comp) = @_;
1085 0 0       0 my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1086 0         0 local $_ = uc $target;
1087 0         0 my @cuts;
1088 0         0 my $recog = $enz->recog;
1089 0         0 my $site_re = qr/($recog)/;
1090 0         0 push @cuts, pos while (/$site_re/g);
1091 0         0 $_ = $_ - length($enz->recog) + $cut_site for @cuts;
1092 0         0 return [@cuts];
1093             }
1094              
1095             =head2 _nonambig_cuts
1096              
1097             Title : _nonambig_cuts
1098             Function : Figures out which enzymes we know about and cuts the sequence.
1099             Returns : Nothing.
1100             Arguments : The separated enzyme site, the target sequence, and the enzyme object
1101              
1102             An internal method. This will figure out where the sequence should be
1103             cut, and provide the appropriate results. This is a much faster
1104             implementation because it doesn't use a regexp, but it can not deal
1105             with ambiguous sequences
1106              
1107             =cut
1108              
1109             # now, DO want the enzyme object.../maj
1110              
1111             sub _nonambig_cuts {
1112 0     0   0 my ($self, $beforeseq, $afterseq, $target_seq, $enz, $comp) = @_;
1113 0 0       0 my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1114 0 0       0 if ($beforeseq eq ".") {$beforeseq = ''}
  0         0  
1115 0 0       0 if ($afterseq eq ".") {$afterseq = ''}
  0         0  
1116 0         0 $target_seq = uc $target_seq;
1117             # my $index_posn=index($target_seq, $beforeseq.$afterseq);
1118 0         0 my $index_posn=index($target_seq, $enz->recog);
1119 0 0       0 return [] if ($index_posn == -1); # there is no match to the sequence
1120              
1121             # there is at least one cut site
1122 0         0 my @cuts;
1123 0         0 while ($index_posn > -1) {
1124             # extrasite cutting issue here...
1125             # think we want $index_posn+$enz->cut
1126             # push (@cuts, $index_posn+length($beforeseq));
1127 0         0 push (@cuts, $index_posn+$cut_site);
1128             # $index_posn=index($target_seq, $beforeseq.$afterseq, $index_posn+1);
1129 0         0 $index_posn=index($target_seq, $enz->recog, $index_posn+1);
1130             }
1131              
1132 0         0 return \@cuts;
1133             }
1134              
1135             =head2 _make_cuts
1136              
1137             Title : _make_cuts
1138             Usage : $an->_make_cuts( $target_sequence, $enzyme, $complement_q )
1139             Function: Returns an array of cut sites on target seq, using enzyme
1140             on the plus strand ($complement_q = 0) or minus strand
1141             ($complement_q = 1); follows Enzyme objects in
1142             $enzyme->others()
1143             Returns : array of scalar integers
1144             Args : sequence string, B:R:Enzyme object, boolean
1145              
1146             =cut
1147            
1148             sub _make_cuts {
1149 3     3   17 no warnings qw( uninitialized );
  3         3  
  3         2012  
1150              
1151 15249     15249   15003 my ($self, $target, $enz, $comp) = @_;
1152 15249         18374 local $_ = uc $target;
1153              
1154 15249         8740 my @cuts;
1155              
1156 15249 50       26704 my @enzs = map { $_ || () } ($enz, $enz->can('others') ? $enz->others : ());
  15481 100       28145  
1157             ENZ:
1158 15249         12965 foreach $enz (@enzs) {
1159 15481         17442 my $recog = $enz->recog;
1160 15481 100       23699 my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1161 15481         10023 my @these_cuts;
1162              
1163 15481 100       24278 if ( $recog =~ /[^\w]/ ) { # "ambig"
1164 5810         35383 my $site_re = qr/($recog)/;
1165 5810         25144 push @these_cuts, pos while (/$site_re/g);
1166 5810         7399 $_ = $_ - length($enz->string) + $cut_site for @these_cuts;
1167 5810 100       7874 if (!$enz->is_palindromic) {
1168 864         1179 pos = 0;
1169 864         727 my @these_rev_cuts;
1170 864         1064 $recog = $enz->revcom_recog;
1171 864 100       1071 $cut_site = length($enz->string) - ($comp ? $enz->cut : $enz->complementary_cut);
1172 864         2928 $site_re = qr/($recog)/;
1173 864         2871 push @these_rev_cuts, pos while (/$site_re/g);
1174 864         984 $_ = $_ - length($enz->string) + $cut_site for @these_rev_cuts;
1175 864         1063 push @these_cuts, @these_rev_cuts;
1176             }
1177             }
1178             else { # "nonambig"
1179 9671         12183 my $index_posn=index($_, $recog);
1180 9671         12274 while ($index_posn > -1) {
1181 1547         1414 push (@these_cuts, $index_posn+$cut_site);
1182 1547         2478 $index_posn=index($_, $recog, $index_posn+1);
1183             }
1184 9671 100       11365 if (!$enz->is_palindromic) {
1185 1943         2378 $recog = $enz->revcom_recog;
1186 1943 100       2627 $cut_site = length($enz->string) - ($comp ? $enz->cut : $enz->complementary_cut);
1187 1943         2416 $index_posn=index($_, $recog);
1188 1943         2818 while ($index_posn > -1) {
1189 246         222 push @these_cuts, $index_posn+$cut_site;
1190 246         425 $index_posn=index($_, $recog, $index_posn+1);
1191             }
1192             }
1193             }
1194 15481         17372 push @cuts, @these_cuts;
1195             }
1196 15249         18480 return [@cuts];
1197             }
1198              
1199             =head2 _multiple_cuts
1200              
1201             Title : _multiple_cuts
1202             Function : Figures out multiple digests
1203             Returns : An array of the cut sites for multiply digested DNA
1204             Arguments : A Bio::Restriction::EnzymeCollection object
1205             Comments : Double digests is one subset of this, but you can use
1206             as many enzymes as you want.
1207              
1208             =cut
1209              
1210             sub _multiple_cuts {
1211 3     3   5 my ($self, $ec)=@_;
1212 3 50       9 $self->cut unless $self->{'_cut'};
1213              
1214             # now that we are using positions rather than fragments
1215             # this is really easy
1216 3         5 my @cuts;
1217 3         14 foreach my $enz ($ec->each_enzyme) {
1218 15         21 push @cuts, @{$self->{'_cut_positions'}->{$enz->name}}
1219 15 50       26 if defined $self->{'_cut_positions'}->{$enz->name};
1220             }
1221 3         11 @{$self->{'_cut_positions'}->{'multiple_digest'}}=sort {$a <=> $b} @cuts;
  3         13  
  87         56  
1222              
1223 3         4 my $number_of_cuts;
1224              
1225 3         4 $number_of_cuts=scalar @{$self->{'_cut_positions'}->{'multiple_digest'}};
  3         8  
1226 3         5 $self->{_number_of_cuts_by_enzyme}->{'multiple_digest'}=$number_of_cuts;
1227 3         4 push (@{$self->{_number_of_cuts_by_cuts}->{$number_of_cuts}}, 'multiple_digest');
  3         10  
1228 3 50       12 if ($number_of_cuts > $self->{maximum_cuts}) {
1229 0         0 $self->{maximum_cuts}=$number_of_cuts;
1230             }
1231             }
1232              
1233              
1234             =head2 _circular
1235              
1236             Title : _circular
1237             Function : Identifies cuts at the join of the end of the target with
1238             the beginning of the target
1239             Returns : array of scalar integers ( cut sites near join, if any )
1240             Arguments : scalar string (target sequence), Bio::Restriction::Enzyme obj
1241              
1242             =cut
1243              
1244             sub _circular {
1245 4718     4718   4075 my ($self, $target, $enz, $comp) = @_;
1246 4718         5580 $target=uc $target;
1247 4718 50       5315 my $patch_len = ( length $target > 20 ? 10 : int( length($target)/2 ) );
1248            
1249 4718         6203 my ($first, $last) =
1250             (substr($target, 0, $patch_len),substr($target, -$patch_len));
1251 4718         4276 my $patch=$last.$first;
1252            
1253             # now find the cut sites
1254            
1255 4718         5091 my $cut_positions = $self->_make_cuts($patch, $enz, $comp);
1256            
1257             # the enzyme doesn't cut in the new fragment
1258 4718 50       6182 return [] if (!$cut_positions);
1259            
1260             # now we are going to add things to _cut_positions
1261             # in this shema it doesn't matter if the site is there twice -
1262             # we will take care of that later. Because we are using position
1263             # rather than frag or anything else, we can just
1264             # remove duplicates.
1265 4718         3319 my @circ_cuts;
1266 4718         4555 foreach my $cut (@$cut_positions) {
1267 275 100       516 if ($cut == length($last)) {
    100          
1268             # the cut is actually at position 0, but we're going to call this the
1269             # length of the sequence so we don't confuse no cuts with a 0 cut
1270             # push (@circ_cuts, $self->{'_seq'}->length);
1271 2         4 push (@circ_cuts, 0);
1272              
1273             }
1274             elsif ($cut < length($last)) {
1275             # the cut is before the end of the sequence
1276             #check
1277 150         292 push (@circ_cuts, $self->{'_seq'}->length - (length($last) - $cut));
1278             }
1279             else {
1280             # the cut is at the start of the sequence (position >=1)
1281            
1282             # note, we put this at the beginning of the array rather than the end!
1283 123         187 unshift (@circ_cuts, $cut-length($last));
1284             }
1285             }
1286 4718         6275 return \@circ_cuts;
1287             }
1288              
1289              
1290              
1291              
1292              
1293             =head2 _expanded_string
1294              
1295             Title : _expanded_string
1296             Function : Expand nucleotide ambiguity codes to their representative letters
1297             Returns : The full length string
1298             Arguments : The string to be expanded.
1299              
1300             Stolen from the original RestrictionEnzyme.pm
1301              
1302             =cut
1303              
1304              
1305             sub _expanded_string {
1306 0     0     my ($self, $str) = @_;
1307              
1308 0           $str =~ s/N|X/\./g;
1309 0           $str =~ s/R/\[AG\]/g;
1310 0           $str =~ s/Y/\[CT\]/g;
1311 0           $str =~ s/S/\[GC\]/g;
1312 0           $str =~ s/W/\[AT\]/g;
1313 0           $str =~ s/M/\[AC\]/g;
1314 0           $str =~ s/K/\[TG\]/g;
1315 0           $str =~ s/B/\[CGT\]/g;
1316 0           $str =~ s/D/\[AGT\]/g;
1317 0           $str =~ s/H/\[ACT\]/g;
1318 0           $str =~ s/V/\[ACG\]/g;
1319              
1320 0           return $str;
1321             }
1322              
1323              
1324             1;