File Coverage

blib/lib/AI/Genetic/Pro/Macromolecule.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package AI::Genetic::Pro::Macromolecule;
2             our $VERSION = '0.09280.0_001';
3              
4              
5              
6             # ABSTRACT: Genetic Algorithms to evolve DNA, RNA and Protein sequences
7              
8 1     1   2350 use Moose;
  1         804607  
  1         14  
9 1     1   9580 use MooseX::Types::Moose qw(Str Bool Int Num ArrayRef CodeRef);
  1         74800  
  1         14  
10 1     1   7999 use AI::Genetic::Pro::Macromolecule::Types qw(AIGeneticPro Probability);
  1         14870  
  1         10  
11 1     1   6473 use AI::Genetic::Pro;
  0            
  0            
12             use Moose::Util::TypeConstraints;
13             use List::Util 'max';
14             use Modern::Perl;
15             use MooseX::Throwable;
16             use namespace::autoclean;
17              
18             my %_alphabet_for = (
19             protein => [qw(A C D E F G H I K L M N P Q R S T V W Y)],
20             dna => [qw(A C G T)],
21             rna => [qw(A C G U)],
22             );
23              
24              
25             has fitness => (
26             is => 'ro',
27             isa => CodeRef,
28             required => 1,
29             );
30              
31              
32             has terminate => (
33             is => 'ro',
34             isa => CodeRef,
35             predicate => '_has_terminate',
36             );
37              
38             has '_actual_' . $_ => (
39             is => 'ro',
40             isa => CodeRef,
41             lazy_build => 1,
42             ) for qw(fitness terminate);
43              
44             sub _build__actual_fitness {
45             my $self = shift;
46              
47             return sub {
48             my ($ga, $chromosome) = @_;
49             my $seq = $ga->as_string($chromosome);
50             $seq =~ s/_//g;
51              
52             return $self->fitness->($seq);
53             }
54             }
55              
56             sub _build__actual_terminate {
57             my $self = shift;
58              
59             return sub { return $self->terminate };
60             }
61              
62              
63              
64             has variable_length => (
65             is => 'ro',
66             isa => Bool,
67             default => 1,
68             );
69              
70              
71             has length => (
72             is => 'ro',
73             isa => Num,
74             lazy_build => 1,
75             );
76              
77             has _ga => (
78             is => 'ro',
79             isa => AIGeneticPro,
80             init_arg => undef,
81             handles => [qw(evolve generation)],
82             lazy_build => 1,
83             );
84              
85             sub _build__ga {
86             my $self = shift;
87              
88             my $ga = AI::Genetic::Pro->new(
89              
90             -type => 'listvector',
91             -population => $self->population_size,
92             -crossover => $self->crossover,
93             -mutation => $self->mutation,
94             -parents => $self->parents,
95             -selection => $self->selection,
96             -strategy => $self->strategy,
97             -cache => $self->cache,
98             -history => 1,
99             -preserve => $self->preserve,
100             -variable_length => $self->variable_length,
101             -fitness => $self->_actual_fitness,
102             );
103              
104             # Consistency check for variable_length and input lengths
105             if (
106             $self->_has_initial_population and
107             !$self->variable_length and
108             $self->_seq_lengths_are_different
109             ) { die "Initial population lengths cannot be different when variable_length is set to 0.\n"; }
110              
111             if ( $self->_initial_population_size > $self->population_size ) {
112             warn "initial_population has more sequences than population_size allows\n"
113             }
114              
115             if ($self->_has_terminate) { $ga->terminate($self->_actual_terminate) };
116              
117             $ga->init([
118             map { $_alphabet_for{ lc $self->type } }
119             (1 .. $self->length)
120             ]);
121              
122             $ga->inject([ map { [ split '', $_ ] } @{$self->initial_population} ])
123             if $self->_has_initial_population;
124              
125             return $ga;
126             }
127              
128             sub _initial_population_size {
129             my $self = shift;
130              
131             if ($self->_has_initial_population) {
132             return scalar @{$self->initial_population};
133             }
134             else {
135             return 0;
136             }
137             }
138              
139             sub _seq_lengths_are_different {
140             # returns true if lengths of the inserted sequences are equal
141             my $self = shift;
142              
143             my $initial_length = length($self->initial_population->[0]);
144              
145             return grep { length $_ != $initial_length } @{$self->initial_population};
146             }
147              
148              
149             sub fittest {
150             my ($self, $n) = @_;
151             $n //= 1;
152              
153             my @fittest;
154             my @chromosomes = $self->_ga->getFittest($n, 1);
155              
156             foreach my $chrom (@chromosomes) {
157             my $seq = $self->_ga->as_string($chrom);
158             $seq =~ s/_//g;
159              
160             push @fittest, {
161             seq => $seq,
162             score => $self->_ga->as_value ($chrom),
163             };
164             }
165              
166             return ( $n == 1 ) ? $fittest[0] : @fittest;
167             }
168              
169              
170             sub history {
171             my $self = shift;
172              
173             my $history = $self->_ga->getHistory;
174              
175             return {
176             max => $history->[0],
177             mean => $history->[1],
178             min => $history->[2],
179             };
180             }
181              
182              
183             sub current_stats {
184             my $self = shift;
185              
186             my ($max, $mean, $min) = $self->_ga->getAvgFitness;
187              
188             return { max => $max, mean => $mean, min => $min };
189             }
190              
191              
192             sub current_population {
193             my $self = shift;
194              
195             my @population;
196              
197             my $chromosomes = $self->_ga->people;
198             foreach my $chrom (@$chromosomes) {
199              
200             my $seq = $self->_ga->as_string( $chrom );
201             $seq =~ s/_//g;
202              
203             my $score = $self->_ga->as_value($chrom);
204              
205             push @population, { seq => $seq, score => $score };
206              
207             }
208              
209             return @population;
210             }
211              
212              
213             sub _build_length {
214             my $self = shift;
215              
216             unless ( $self->_has_initial_population ) {
217             die "Either length or initial_population should be defined\n";
218             }
219              
220             my $max_length = max( map { length } @{$self->initial_population} );
221              
222             return $max_length;
223             }
224              
225              
226             has type => (
227             is => 'ro',
228             isa => enum([qw(protein Protein dna DNA rna RNA)]),
229             required => 1,
230             );
231              
232              
233             has initial_population => (
234             is => 'ro',
235             isa => ArrayRef[Str],
236             predicate => '_has_initial_population',
237             );
238              
239              
240             has cache => (
241             is => 'ro',
242             isa => Bool,
243             default => 1,
244             );
245              
246              
247             has mutation => (
248             is => 'ro',
249             isa => Probability,
250             default => 0.05,
251             );
252              
253              
254             has crossover => (
255             is => 'ro',
256             isa => Probability,
257             default => 0.95,
258             );
259              
260              
261             has population_size => (
262             is => 'ro',
263             isa => Int,
264             default => 300,
265             );
266              
267              
268             has parents => (
269             is => 'ro',
270             isa => Int,
271             default => 2,
272             );
273              
274              
275             has selection => (
276             is => 'ro',
277             isa => ArrayRef,
278             default => sub { ['Roulette'] },
279             );
280              
281              
282             has strategy => (
283             is => 'ro',
284             isa => ArrayRef,
285             default => sub { [ 'Points', 2 ] },
286             );
287              
288              
289             has preserve => (
290             is => 'ro',
291             isa => Int,
292             default => '5',
293             );
294              
295             __PACKAGE__->meta->make_immutable;
296             1;
297              
298              
299              
300              
301             =pod
302              
303             =head1 NAME
304              
305             AI::Genetic::Pro::Macromolecule - Genetic Algorithms to evolve DNA, RNA and Protein sequences
306              
307             =head1 VERSION
308              
309             version 0.09280.0_001
310              
311             =head1 SYNOPSIS
312              
313             use AI::Genetic::Pro::Macromolecule;
314              
315             my @proteins = ($seq1, $seq2, $seq3, ... );
316              
317             my $m = AI::Genetic::Pro::Macromolecule->new(
318             type => 'protein',
319             fitness => \&hydrophobicity,
320             initial_population => \@proteins,
321             );
322              
323             sub hydrophobicity {
324             my $seq = shift;
325             my $score = f($seq)
326              
327             return $score;
328             }
329              
330             $m->evolve(10) # evolve for 10 generations;
331              
332             my $most_hydrophobic = $m->fittest->{seq}; # get the best sequence
333             my $highest_score = $m->fittest->{score}; # get top score
334              
335             # Want the score stats throughout generations?
336             my $history = $m->history;
337              
338             my $mean_history = $history->{mean}; # [ mean1, mean2, mean3, ... ]
339             my $min_history = $history->{min}; # [ min1, min2, min3, ... ]
340             my $max_history = $history->{max}; # [ max1, max2, max3, ... ]
341              
342             =head1 DESCRIPTION
343              
344             AI::Genetic::Pro::Macromolecule is a wrapper over L<AI::Genetic::Pro>,
345             aimed at easily evolving protein, DNA or RNA sequences using arbitrary
346             fitness functions.
347              
348             Its purpose it to allow optimization of macromolecule sequences using
349             Genetic Algorithms, with as little set up time and burdain as possible.
350              
351             Standing atop L<AI::Genetic::Pro>, it is reasonably fast and memory
352             efficient. It is also highly customizable, although I've chosen what I
353             think are sensible defaults for every parameter, so that you don't have
354             to worry about them if you don't know what they mean.
355              
356              
357              
358             =head1 ATTRIBUTES
359              
360             =head2 fitness
361              
362             Accepts a C<CodeRef> that should assign a numeric score to each string
363             sequence that it's passed to it as an argument. Required.
364              
365             sub fitness {
366             my $seq = shift;
367              
368             # Do something with $seq and return a score
369             my $score = f($seq);
370              
371             return $score;
372             }
373              
374             my $m = AI::Genetic::Pro::Macromolecule->new(
375             fitness => \&fitness,
376             ...
377             );
378              
379              
380              
381             =head2 terminate
382              
383             Accepts a C<CodeRef>. It will be applied once at the end of each
384             generation. If returns true, evolution will stop, disregarding the
385             generation steps passed to the C<evolve> method.
386              
387             The C<CodeRef> should accept an C<AI::Genetic::Pro::Macromolecule> object
388             as argument, and should return either true or false.
389              
390             sub reached_max {
391             my $m = shift; # an AI::G::P::Macromolecule object
392              
393             my $highest_score = $m->fittest->{score};
394              
395             if ( $highest_score > 9000 ) {
396             warn "It's over 9000!";
397             return 1;
398             }
399             }
400              
401             my $m = AI::Genetic::Pro::Macromolecule->new(
402             terminate => \&reached_max,
403             ...
404             );
405              
406             In the above example, evolution will stop the moment the top score in
407             any generation exceeds the value 9000.
408              
409              
410              
411             =head2 variable_length
412              
413             Decide whether the sequences can have different lengths. Accepts a C<Bool>
414             value. Defaults to 1.
415              
416              
417              
418             =head2 length
419              
420             Manually set the allowed maximum length of the sequences, accepts C<Int>.
421              
422             This attribute is required unless an initial population is provided. In
423             that case, C<length> will be set as equal to the length of the longest
424             sequence provided if it's not explicity specified.
425              
426              
427              
428             =head2 type
429              
430             Macromolecule type: protein, dna, or rna. Required.
431              
432              
433              
434             =head2 initial_population
435              
436             Sequences to add to the initial pool before evolving. Accepts an
437             C<ArrayRef[Str]>.
438              
439             my $m = AI::Genetic::Pro::Macromolecule->new(
440             initial_population => ['ACGT', 'CAAC', 'GTTT'],
441             ...
442             );
443              
444              
445              
446             =head2 cache
447              
448             Accepts a C<Bool> value. When true, score results for each sequence will
449             be stored, to avoid costly and unnecesary recomputations. Set to 1 by
450             default.
451              
452              
453              
454             =head2 mutation
455              
456             Mutation rate, a C<Num> between 0 and 1. Default is 0.05.
457              
458              
459              
460             =head2 crossover
461              
462             Crossover rate, a C<Num> between 0 and 1. Default is 0.95.
463              
464              
465              
466             =head2 population_size
467              
468             Number of sequences per generation. Default is 300.
469              
470              
471              
472             =head2 parents
473              
474             Number of parents sequences in recombinations. Default is 2.
475              
476              
477              
478             =head2 selection
479              
480             Defines how sequences are selected to crossover. It expects an C<ArrayRef>:
481              
482             selection => [ $type, @params ]
483              
484             See docs in L<AI::Genetic::Pro> for details on available selection
485             strategies, parameters, and their meanings. Default is Roulette, in
486             which at first the best individuals/chromosomes are selected. From this
487             collection parents are selected with probability poportionaly to its
488             fitness.
489              
490              
491              
492             =head2 strategy
493              
494             Defines strategy of crossover operation. It expects an C<ArrayRef>:
495              
496             strategy => [ $strategy, @params ]
497              
498             See docs in L<AI::Genetic::Pro> for details on available crossover
499             strategies, parameters, and their meanings. Default is [ Points, 2 ], in
500             which parents are crossed at 2 points and the best child is moved to the
501             next generation.
502              
503              
504              
505             =head2 preserve
506              
507             Whether to inject the best sequences for next generation, and if so, how
508             many. Defaults to 5.
509              
510              
511              
512             =head1 METHODS
513              
514             =head2 evolve
515              
516             $m->evolve($n);
517              
518             Evolve the sequence population for the specified number of generations.
519             Accepts an optional single C<Int> argument. If $n is 0 or undef, it will
520             evolve undefinitely or C<terminate> returns true.
521              
522             =head2 generation
523              
524             Returns the current generation number.
525              
526              
527              
528             =head2 fittest
529              
530             Returns an C<Array[HashRef]> with the desired number of top scoring
531             sequences. The hash reference has two keys, 'seq' which points to the
532             sequence string, and 'score' which points to the sequence's score.
533              
534             my @top_2 = $m->fittest(2);
535             # (
536             # { seq => 'VIKP', score => 10 },
537             # { seq => 'VLKP', score => 9 },
538             # )
539              
540             When called with no arguments, it returns a C<HashRef> with the top
541             scoring sequence.
542              
543             my $fittest = $m->fittest;
544             # { seq => 'VIKP', score => 10 }
545              
546              
547              
548             =head2 history
549              
550             Returns a C<HashRef> with the minimum, maximum and mean score for
551             each generation.
552              
553             my $history = $m->history;
554             # {
555             # min => [ 0, 0, 0, 1, 2, ... ],
556             # max => [ 1, 2, 2, 3, 4, ... ],
557             # mean => [ 0.2, 0.3, 0.5, 1.5, 3, ... ],
558             # }
559              
560             To access the mean score for the C<$n>-th generation, for instance:
561              
562             $m->history->{mean}->[$n - 1];
563              
564              
565              
566             =head2 current_stats
567              
568             Returns a C<HashRef> with the minimum, maximum and mean score fore
569             the current generation.
570              
571             $m->current_stats;
572             # { min => 2, max => 10, mean => 3.5 }
573              
574              
575              
576             =head2 current_population
577              
578             Returns an C<Array[HashRef]> with all the sequences of the current
579             generation and their scores, in no particular order.
580              
581             my @seqs = $m->current_population;
582             # (
583             # { seq => 'VIKP', score => 10 },
584             # { seq => 'VLKP', score => 9 },
585             # ...
586             # )
587              
588              
589              
590             =head1 AUTHOR
591              
592             Bruno Vecchi <vecchi.b gmail.com>
593              
594             =head1 COPYRIGHT AND LICENSE
595              
596             This software is copyright (c) 2009 by Bruno Vecchi.
597              
598             This is free software; you can redistribute it and/or modify it under
599             the same terms as the Perl 5 programming language system itself.
600              
601             =cut
602              
603              
604              
605             __END__
606              
607              
608