-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPE_algorithms.py
673 lines (566 loc) · 31.4 KB
/
PE_algorithms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
import pandas as pd
import numpy as np
import random
from collections import defaultdict
import time
import math
from tqdm import tqdm
from dataclasses import dataclass
from typing import List
def brute_force_max_sum(sample_embeddings, num_proteins=10, full=False):
""" Use parallel processing to Calculate the brute force max sum of Euclidean distances between embeddings
:param sample_embeddings: dictionary containing an embedding label and embedding values
:param num_proteins: how many diverse embeddings to select for the final subset
:param full: whether the full dataset or a sample of the dataset is being used, if full save most diverse to csv
:return: The list of embedding labels of the most diverse embeddings
"""
# Convert the dictionary to a list of tuples
sample_tuples = list(sample_embeddings.items())
# list to store the diversity scores for each protein
diversity_scores = []
# Compute the total number of iterations
total_iterations = len(sample_tuples) * (len(sample_tuples) - 1)
# Initialize the progress bar
progress_bar = tqdm(total=total_iterations, desc="Calculating diversity", unit="iteration")
# Iterate over each protein embedding and label
for label, embedding in sample_tuples:
# Initialize the diversity score for the current protein
diversity_score = 0
# Calculate the sum of distances between the current protein and all other proteins
for other_label, other_embedding in sample_tuples:
if label != other_label:
distance = np.linalg.norm(embedding - other_embedding) # euclidean distance
diversity_score += distance
# Update the progress bar
progress_bar.update()
# Append the diversity score and protein label to the list
diversity_scores.append((diversity_score, label))
# Close the progress bar
progress_bar.close()
# Sort the diversity scores in descending order
diversity_scores.sort(reverse=True)
# If 'full' is True, save the results to a CSV file
if full:
df = pd.DataFrame(diversity_scores, columns=['MaxSum Value', 'Protein Label'])
df.to_csv('top_proteins.csv', index=False)
# Extract the top 'num_proteins' most diverse proteins
top_proteins = [protein_label for _, protein_label in diversity_scores[:num_proteins]]
return top_proteins
class MaxSumTabuSearch:
"""Alternative version of a Tabu search algorithm
"""
def __init__(self, embeddings, num_proteins=10, max_iterations=1000, local_iterations=50, tabu_tenure=10):
"""Initialise the class
:param embeddings: dict, the dataset
:param num_proteins: int, number of top proteins to return
:param max_iterations: int, max number of global iterations
:param local_iterations: int, max number of local iterations
:param tabu_tenure: int, number of iterations that a solution remains in tabu list
"""
self.embeddings = embeddings
self.num_proteins = num_proteins
self.max_iterations = max_iterations
self.local_iterations = local_iterations
self.best_subset = None
self.best_fitness = 0.0
self.tabu_tenure = tabu_tenure
self.tabu_list = {}
def run_tabu_search(self):
"""Run an alternative tabu search
Globally selects a small subset (num_proteins) then calls local search.
Always randomly selects from global pool of potential 'solutions'.
Updates best global fitness with best returned from local search.
Keeps track of previously selected best (local and global) in tabu list,
which is updated after every global iteration. Updating either reduces the
tabu tenure, or removes a solution from the list if it's tenure has dropped
to 0 thus allowing the solution to be selected again.
:return: list of most diverse proteins embeddings
"""
progress_bar = tqdm(total=self.max_iterations, desc="Running Tabu Search")
# Get the start time
start_time = time.time()
for _ in range(self.max_iterations):
# Generate an initial solution with a random selection of n proteins
initial_subset = random.sample(list(self.embeddings.keys()), self.num_proteins)
# Local Search: Multi-start random sample subset of i proteins over j iterations
improved_subset, fitness = self.local_search(initial_subset)
# Update the best solution if a better one is found
if fitness > self.best_fitness:
self.best_fitness = fitness
self.best_subset = improved_subset
progress_bar.update(1) # Update the progress bar
self.update_tabu_list()
# Get the end time
end_time = time.time()
# Calculate the total elapsed time
total_time = end_time - start_time
print("Total time:", total_time)
progress_bar.close() # Close the progress bar after finishing
return self.rank_proteins(self.best_subset)
def rank_proteins(self, subset):
""" sort the most diverse proteins by their max sum Euclidean distance in descending order
:param subset: the subset of most diverse proteins
:return: list of protein embeddings in descending order
"""
# TODO remove this line as is not needed, already done in local_search
individual_diversities = self.calculate_distances(subset)
ranked_proteins = sorted(individual_diversities.keys(), key=lambda x: individual_diversities[x], reverse=True)
return ranked_proteins[:self.num_proteins]
def calculate_distances(self, subset):
""" calculate the pairwise Euclidean distance of each protein embedding in the
subset to all other embeddings in the subset
:param subset: the subset of protein embeddings
:return: dictionary of protein labels and their Euclidean distances
"""
individual_diversities = {}
for i, protein in enumerate(subset):
total_distance = 0
emb1 = self.embeddings[protein]
for j, other_protein in enumerate(subset):
if i != j:
emb2 = self.embeddings[other_protein]
distance = np.linalg.norm(emb1 - emb2)
total_distance += distance
individual_diversities[protein] = total_distance
return individual_diversities
def local_search(self, subset):
"""Search local populations for most diverse proteins
Selects a list of 100 random proteins from the global search space and
at each iteration evaluates the solution, keeping track of the best local
solution.
:param subset: current best solution
:return: The best solution and their fitnessess
"""
best_subset = subset
best_fitness = self.max_sum_fitness(subset)
for _ in range(self.local_iterations):
# take a random sample of 100 proteins
random_sampled_prot = random.sample(list(self.embeddings.keys()), 100)
# calculate fitness
fitness = self.max_sum_fitness(random_sampled_prot)
solution_tuple = tuple(sorted(random_sampled_prot))
# Even if a solution is tabu, it can be accepted if it's better than the best known solution.
if (solution_tuple not in self.tabu_list) or (fitness > self.best_fitness):
if fitness > best_fitness:
best_fitness = fitness
best_subset = random_sampled_prot
# Add to tabu list with tabu tenure
self.tabu_list[solution_tuple] = self.tabu_tenure
return best_subset, best_fitness
def max_sum_fitness(self, protein_labels):
"""Sum the distances and return the total distance
:param protein_labels: dictionary where keys are protein labels and values are a list of distances
:return: total distance
"""
total_distance = sum(self.calculate_distances(protein_labels).values())
return total_distance
def update_tabu_list(self):
"""Decrement the tenure of each solution in the tabu list. Remove solutions with zero tenure."""
to_remove = [sol for sol, tenure in self.tabu_list.items() if tenure <= 1]
for sol in to_remove:
del self.tabu_list[sol]
for sol in self.tabu_list:
self.tabu_list[sol] -= 1
class TradMaxSumTabuSearchV1:
"""Traditional tabu search implementation with simple yet conservative population generation """
def __init__(self, embeddings, num_proteins=10, max_iterations=1000, tabu_tenure=10):
"""Initialise the class
:param embeddings: dict, the dataset {protein:[embeddings], protein:[embeddings], ...}
:param num_proteins: int, number of top proteins to return
:param max_iterations: int, max number of global iterations
:param tabu_tenure: int, number of iterations that a solution remains in tabu list
"""
self.embeddings = embeddings
self.num_proteins = num_proteins
self.max_iterations = max_iterations
self.best_subset = None
self.best_fitness = 0.0
self.tabu_list = set()
self.tabu_tenure = tabu_tenure
def run_tabu_search(self):
""" Run an alternative tabu search
Selects an initial subset (num_proteins) which gets passed to local search
where a neighbourhood of solutions are created by swapping out a single protein
label in the solution list with a random sample from the global list.
:return: list of most diverse proteins embeddings
"""
progress_bar = tqdm(total=self.max_iterations, desc="Running Tabu Search")
# Get the start time
start_time = time.time()
for _ in range(self.max_iterations):
# Generate an initial solution with a random selection of 10 proteins
initial_subset = random.sample(list(self.embeddings.keys()), self.num_proteins)
# Multi-start random sample subset of n proteins over j iterations
improved_subset, fitness = self.local_search(initial_subset)
# Update the best solution if a better one is found
if fitness > self.best_fitness:
self.best_fitness = fitness
self.best_subset = improved_subset
progress_bar.update(1) # Update the progress bar
# Get the end time
end_time = time.time()
# Calculate the total elapsed time
total_time = end_time - start_time
print("Total time:", total_time)
progress_bar.close() # Close the progress bar after finishing
return self.rank_proteins(self.best_subset)
def rank_proteins(self, subset):
"""Rank based on highest max sum
:param subset: the subset of protein embeddings
:return: the ranked proteins
"""
ranked_proteins = sorted(subset, reverse=True)
return ranked_proteins
def local_search(self, subset):
""" Create a neighborhood and conduct a local search to find the most diverse proteins
Only uses a local tabu list.
:param subset: global solution space
:return: tuple (best subset, best fitness)
"""
best_subset = subset
best_fitness = self.max_sum_fitness(subset)
tabu_list = {} # dictionary to store tabu solutions and their remaining tabu tenure
for _ in range(50):
neighborhood = self.generate_neighborhood(best_subset)
# Evaluate the fitness of solutions in the neighborhood
neighborhood_fitness = [self.max_sum_fitness(solution) for solution in neighborhood]
# Find the best non-tabu solution in the neighborhood
for i, solution in enumerate(neighborhood):
solution_tuple = tuple(solution) # Convert the list to a tuple
if solution_tuple not in tabu_list or tabu_list[solution_tuple] <= 0:
if neighborhood_fitness[i] > best_fitness:
best_fitness = neighborhood_fitness[i]
best_subset = solution
tabu_list[solution_tuple] = self.tabu_tenure # Use the tuple as the key
# break - exits early and might not find the best as these are not sorted
# Decrement tabu tenure for all solutions in the tabu list
for solution, tenure in tabu_list.items():
tabu_list[solution] = tenure - 1
return best_subset, best_fitness
def generate_neighborhood(self, subset):
"""Generate a set of new solutions by randomly changing an element from the initial random solution
A new solution (list) in the subset neighborhood space is the same subset list with one
item swapped out with a randomly selected label from the global solution space.
:param subset: current or initial solution to generate new solutions from
:return: list of new solutions / a neighbourhood
"""
neighborhood = []
for i in range(len(subset)):
# Generate solutions by making a single change to the current solution
new_solution = subset[:i] + [random.choice(list(self.embeddings.keys()))] + subset[i + 1:]
neighborhood.append(new_solution)
return neighborhood
def max_sum_fitness(self, protein_embeddings):
"""Calculate the max sum pairwise Euclidean distance between each embedding and all other embeddings
:param protein_embeddings: the protein embeddings
:return: the max sum of the embeddings
"""
total_distance = 0
# Calculate the sum of distances between embeddings in the subset
for i in range(len(protein_embeddings)):
emb1 = self.embeddings[protein_embeddings[i]]
for j in range(len(protein_embeddings)):
emb2 = self.embeddings[protein_embeddings[j]]
# Euclidean distance
distance = np.linalg.norm(emb1 - emb2)
total_distance += distance
return total_distance
class TradMaxSumTabuSearchV2:
""" Traditional tabu search to find the most diverse proteins embeddings """
def __init__(self, embeddings, num_proteins=10, max_iterations=1000,
local_iterations=100, local_sample_sizes=100):
"""Initialise the class
:param embeddings: dict, the dataset {protein:[embeddings], protein:[embeddings], ...}
:param num_proteins: int, number of top proteins to return
:param max_iterations: int, max number of global iterations
:param local_iterations: int, max number of local iterations
:param local_sample_sizes: the size of the local solution / num proteins in list
"""
self.embeddings = embeddings
self.num_proteins = num_proteins
self.max_iterations = max_iterations
self.best_subset = None
self.best_fitness = 0.0
self.tabu_list = set()
self.lss = local_sample_sizes
self.local_iterations = local_iterations
self.global_sampling_number = int(len(embeddings) / 20)
self.tabu_tenure = 10
def run_tabu_search(self):
"""Run the tabu search on the protein embeddings and find the most diverse.
Initialises on an initial subset (num_proteins) from the global solution space.
Select a smaller subset (global embeddings / 20) which serves as a local
solution space to explore. The best solution from local search is checked
for fitness against global solution set.
:return: list of the protein labels of the most diverse proteins, global history and local history
to see how often labels get picked
"""
local_hist = defaultdict(int) # for analysis of how many times labels are selected
global_hist = defaultdict(int)
progress_bar = tqdm(total=self.max_iterations, desc="Running Tabu Search")
# Generate an initial solution
initial_subset = random.sample(list(self.embeddings.keys()), self.num_proteins)
# Calculate fitness
fitness = self.max_sum_fitness(initial_subset)
self.check_fitness(fitness, initial_subset, global_hist)
# Get the start time
start_time = time.time()
for _ in range(self.max_iterations):
# get a large sample from the global embeddings to subsample in local search from
global_subset = random.sample(list(self.embeddings.keys()), self.global_sampling_number)
# Multi-start random sample subset of 1000 proteins over 1000 iterations
improved_subset, fitness, local_hist = self.local_search(global_subset, local_hist, self.lss)
# Update the best solution if a better one is found
self.check_fitness(fitness, improved_subset, global_hist)
progress_bar.update(1) # Update the progress bar
# Get the end time
end_time = time.time()
# Calculate the total elapsed time
total_time = end_time - start_time
print("Total time:", total_time)
progress_bar.close() # Close the progress bar after finishing
return self.rank_proteins(self.best_subset), global_hist, local_hist
def rank_proteins(self, subset):
"""Calculate the sum of the pairwise Euclidean distance and rank the proteins based on the max sum
:param subset: list of proteins labels unsorted
:return: ranked list of proteins labels
"""
ranked_proteins = sorted(subset, reverse=True)
return ranked_proteins[:self.num_proteins]
def check_fitness(self, fitness, subset, history):
"""check the fitness of the subset and update the best fitness
and history tracker
:param fitness: max sum euclidean distance
:param subset: dictionary of proteins embeddings
:param history: global history stored in a dictionary
"""
if fitness > self.best_fitness:
print("improved the subset")
self.best_fitness = fitness
self.best_subset = subset
for item in self.best_subset:
history[item] += 1
def local_search(self, subset, histo, samp_size):
"""Explore the local search space for the best solution subset (list of most diverse proteins)
Create a solution subset of local sample size, randomly sampled from the local subset and
evaluate its fitness, do this for x iterations. Only uses a local tabu list
:param subset: the local subset of embeddings to explore, randomly selected from the global solution space
:param histo: dict, local history of previously selected proteins - for analysis of random selection
:param samp_size: the local solution list size
:return: tuple(the best subset, the best fitness, selection history)
"""
best_fitness = 0
best_subset = None
tabu_list = defaultdict(int)
for _ in range(self.local_iterations):
# Get a random sample of the local subset dataset
random_sampled_prot = random.sample(subset, samp_size)
# Calculate fitness for the current random sample subset
fitness = self.max_sum_fitness(random_sampled_prot)
# Update the best solution if a better one is found
if fitness > best_fitness:
if tuple(random_sampled_prot) not in tabu_list.keys() or tabu_list[tuple(random_sampled_prot)] == 0:
best_fitness = fitness
best_subset = random_sampled_prot
tabu_list[tuple(best_subset)] = self.tabu_tenure # Set the tabu tenure for the chosen solution
# Decrement tabu tenure for all solutions in the tabu list
for solution in tabu_list:
tabu_list[solution] -= 1
return best_subset, best_fitness, histo
def max_sum_fitness(self, protein_labels):
"""Sum the distances and return the total distance
:param protein_labels: dictionary where keys are protein labels and values are a list of distances
:return: total distance
"""
total_distance = 0
# Calculate the sum of distances between embeddings in the subset
for i in range(len(protein_labels)):
emb1 = self.embeddings[protein_labels[i]]
for j in range(len(protein_labels)):
emb2 = self.embeddings[protein_labels[j]]
# Euclidean distance
distance = np.linalg.norm(emb1 - emb2)
total_distance += distance
return total_distance
@dataclass
class ProteinSolution:
"""Encapsulate the solution list and its fitness in an object for
easier genetic algorithm handling"""
fitness: float
proteins: List[str] # list of protein names
class MemeticGLS:
def __init__(self, embeddings, dna_size=20, max_epochs=1000, local_iterations=20, population_size=1000,
retain_percent=0.05):
"""Genetic Local Search to solve the Maximum Diversity Problem in relation to finding the list
of x most diverse protein embeddings
Globals search with genetic algorithm, local search with Tabu search
:param embeddings: dict, {protein_name: embedding}
:param dna_size: int, solution (most diverse protein list size) how many max diverse proteins to find
:param max_epochs: int, maximum number of iterations to run
:param local_iterations: int, maximum number of local refinement iterations
:param population_size: int, number of individuals in a population
"""
self.embeddings = embeddings
self.dna_sze = dna_size
self.max_epochs = max_epochs
self.best_subset = None
self.best_fitness = 0.0
self.local_iters = local_iterations
self.pop_size = population_size
self.tabu_tenure = 10
self.best_retain = math.ceil(retain_percent * self.pop_size)
def evolve_solution(self):
"""Search for most diverse proteins in the global search space using a genetic algorithm approach
creates new population by:
1) selecting the best x num from existing population,
2) crossing over the best x num from existing population with each other by taking the first half
of parent 1's dna (solution list) and the second half of parent 2's dna to make a new individual,
3) mutating the next best individuals (lower ranked individuals and twice the x best to retain),
mutate from the global solution space,
4) refine the next x individuals lower down the rankings. Refining mutates the individuals with
randomly selected proteins from the local search space and uses a tabu list to ensure that
no duplicates are added to the new population
6) the rest of the new population is randomly selected from the global space
:return: The best solution, list of all best individuals
"""
best_individuals = []
for epoch in range(self.max_epochs):
print(f"** EPOCH {epoch} out of {self.max_epochs}")
# update the size of the new population by subtracting the number
# of the best individuals (selected in the previous epoch) from the new population size
pop_size = self.pop_size - len(best_individuals)
population, local_space = self.generate_population(pop_size)
# now add the best individuals to population
population.extend(best_individuals)
# calculate the MaxSum of each individual in the population
self.calc_fitness(population)
# Rank individuals in the population based on their fitness
ranked_individuals = self.rank_individuals(population)
# terminate once epochs have come to an end - no need to do anymore evolving
if (epoch + 1) == self.max_epochs:
best_proteins = ranked_individuals[0].proteins
return best_proteins, ranked_individuals
# new population creation
# select best x individuals from this population for next epoch
new_population = ranked_individuals[:self.best_retain]
# select x individuals for crossover - crossover from global embedding space
new_population.extend(self.crossover(ranked_individuals[:self.best_retain]))
# select x individuals for mutation - mutate from global embedding space
mutate_start = self.best_retain
mutate_end = self.best_retain * 2
new_population.extend(self.mutate(ranked_individuals[mutate_start:mutate_end],
list(self.embeddings.keys())))
# do some refining from local space
# refine x individuals with local search - select individuals a bit further down the list
mutate_start = mutate_end
mutate_end = mutate_end + self.best_retain * 2
new_population.extend(self.tabu_refine(ranked_individuals[mutate_start:mutate_end],
local_space, ranked_individuals[:self.best_retain]))
best_individuals = new_population
def generate_population(self, population_size):
"""Randomly create a population of solutions, these are a sublist of protein embeddings selected from the
global embedding space
:param population_size: int the population size to generate
:return list[ProteinSolutions]
"""
population = []
# a set to store the unique keys of the current solution space
local_space = set()
for i in range(population_size):
embeddings_list = random.sample(list(self.embeddings.keys()), self.dna_sze)
population.append(ProteinSolution(fitness=0.0, proteins=embeddings_list))
local_space.update(embeddings_list)
return population, local_space
def calc_fitness(self, population):
"""Calculate the max sum of the distances of all the embeddings in each individual solution
from each other and set the fitness on the individual
:param population: the list of protein solutions in the population for a single epoch
:return: None
"""
# print("In calf fitness and population is ", population)
for individual in population:
# list of protein labels
solution_subset = individual.proteins
for i in range(len(solution_subset)):
# get the actual embeddings array using the protein label
chromosome1 = self.embeddings[solution_subset[i]]
for j in range(len(solution_subset)):
chromosome2 = self.embeddings[solution_subset[j]]
# Euclidean distance
distance = np.linalg.norm(chromosome1 - chromosome2)
individual.fitness += distance
def rank_individuals(self, population):
"""Sort the population by fitness
:param population: current population
:return: the sorted population
"""
return sorted(population, key=lambda x: x.fitness, reverse=True)
def crossover(self, individuals):
"""Create a new set of solutions by performing cross over on individuals in the list
Should end up with only 1 child per two parents - can also do the reverse and end up with two children
take first half of parent 1's dna and add second half of parent 2's dna, can do vice versa
:param individuals: set of individuals to perform cross over on
:return: the list of new child individuals
"""
# assert that the list is even
new_individuals = []
for i in range(len(individuals), 2):
protein_parent_1 = individuals[i]
if i + 1 < len(individuals):
protein_parent_2 = individuals[i + 1]
else:
# end of the uneven list, use the first protein solution for crossover again
protein_parent_2 = individuals[0]
proteins_cut_off = self.dna_sze / 2
new_proteins_list = protein_parent_1.proteins[:proteins_cut_off] + protein_parent_2[proteins_cut_off:]
new_individuals.append(ProteinSolution(fitness=0, proteins=new_proteins_list))
return new_individuals
def mutate(self, individuals, emb):
"""Create a new set of solutions by mutation
Only ever mutate to a maximum of 75% of the current individual's dna
Randomly mutate each individual's dna - so the rate that each individual
is mutated by is different and randomly chosen between 1 and max limit
:param individuals: the individuals to mutate
:param emb: the embedding space to randomly sample from
:return:
"""
# mutate no more that 75% of the current solution
# math.ciel rounds it up to the next nearest int
mutation_limit = math.ceil(self.dna_sze * 0.75)
for individual in individuals:
# select a random number of chromosomes to mutate
proteins_to_mutate = random.randint(1, mutation_limit)
# Select where along the chromosomes to snip/ not mutate
protein_cut_off = self.dna_sze - proteins_to_mutate
new_proteins = random.sample(emb, proteins_to_mutate)
# mutate
individual.proteins = individual.proteins[:protein_cut_off] + new_proteins
# get ready for new fitness check
individual.fitness = 0
return individuals
def tabu_refine(self, individuals, local_space, tabu_list):
"""Refine the list of individuals to create a new set of solutions based on mutation
with x number of proteins from the local space, ensure that the new solutions are not in the tabu list
the tabu list is the list of best individuals
:param individuals: the list of solutions to refine
:param local_space: the randomly selected subset for this epoch
:param tabu_list: the list of the best individuals already in the new population
:return refined_individuals: list of refined, through local mutation, individuals
"""
tabu_list = [x.proteins for x in tabu_list]
recreate_list = individuals
return_size = len(recreate_list)
max_retries = self.local_iters
new_population = []
while max_retries > 0:
refined_individuals = self.mutate(recreate_list, local_space)
recreate_list = []
for individual in refined_individuals:
if individual.proteins in tabu_list:
recreate_list.append(individual)
else:
new_population.append(individual)
if len(new_population) == return_size:
return new_population
max_retries -= 1
return new_population