-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Jonas Verhellen
authored and
Jonas Verhellen
committed
Jul 2, 2020
1 parent
63ddcba
commit f07d17a
Showing
28 changed files
with
252,705 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
import csv | ||
import random | ||
import itertools | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from typing import List, Tuple | ||
|
||
from datetime import datetime | ||
from sklearn.cluster import KMeans | ||
from sklearn.neighbors import KDTree | ||
|
||
from rdkit import Chem | ||
from rdkit import rdBase | ||
from rdkit.Chem import AllChem | ||
rdBase.DisableLog('rdApp.error') | ||
from rdkit.Chem import Lipinski | ||
|
||
class elite(): | ||
def __init__(self, index, descriptor): | ||
self.index = index | ||
self.fitness = 0.0 | ||
self.molecule = None | ||
self.descriptor = descriptor | ||
|
||
def update(self, fitness, molecule, descriptor): | ||
if self.fitness < fitness: | ||
self.fitness = fitness | ||
self.molecule = molecule | ||
self.descriptor = descriptor | ||
return None | ||
|
||
class archive: | ||
def __init__(self, archive_config, descriptor_config) -> None: | ||
self.archive_name = archive_config.name | ||
self.archive_size = archive_config.size | ||
kmeans = KMeans(n_clusters=self.archive_size, n_jobs=-1) | ||
kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, len(descriptor_config.properties))) | ||
self.cvt_centers = kmeans.cluster_centers_ | ||
self.cvt = KDTree(self.cvt_centers, metric='euclidean') | ||
self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)] | ||
with open('{}/statistics.csv'.format(self.archive_name), 'w') as file: | ||
file.write("## Argenomic Statistics File: {}".format(datetime.now())) | ||
file.close() | ||
return None | ||
|
||
def cvt_index(self, descriptor: List[float]) -> int: | ||
return self.cvt.query([descriptor], k=1)[1][0][0] | ||
|
||
def add_to_archive(self, molecules: List[Chem.Mol], descriptors: List[List[float]], fitnesses: List[float]) -> None: | ||
for molecule, descriptor, fitness in zip(molecules, descriptors, fitnesses): | ||
self.elites[self.cvt_index(descriptor)].update(fitness, molecule, descriptor) | ||
return None | ||
|
||
def sample(self, size: int) -> List[Chem.Mol]: | ||
pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0] | ||
molecules, weights = map(list, zip(*pairs)) | ||
return random.choices(molecules, k=size, weights=weights) | ||
|
||
def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]: | ||
pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0] | ||
molecules, weights = map(list, zip(*pairs)) | ||
sample_molecules = random.choices(molecules, k=size, weights=weights) | ||
sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True) | ||
sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs] | ||
return sample_pairs | ||
|
||
def store_archive(self, generation: float) -> None: | ||
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data() | ||
data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses} | ||
pd.DataFrame(data=data).to_csv("{}/archive_{}.csv".format(self.archive_name, generation), index=False) | ||
return None | ||
|
||
def store_statistics(self, generation: float) -> None: | ||
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data() | ||
fractional_size = len(elites_smiles)/self.archive_size | ||
statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size] | ||
with open('{}/statistics.csv'.format(self.archive_name), 'a') as file: | ||
csv.writer(file).writerow(statistics) | ||
print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4])) | ||
print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3])) | ||
return None | ||
|
||
def elites_data(self) -> Tuple[List[str], List[float], List[float]]: | ||
elites_list = [elite for elite in self.elites if elite.molecule] | ||
elites_smiles = [Chem.MolToSmiles(elite.molecule) for elite in elites_list] | ||
elites_descriptors = [elite.descriptor for elite in elites_list] | ||
elites_fitnesses = [elite.fitness for elite in elites_list] | ||
return elites_smiles, elites_descriptors, elites_fitnesses | ||
|
||
|
||
class arbiter: | ||
""" | ||
A catalog class containing different druglike filters for small molecules. | ||
Includes the option to run the structural filters from ChEMBL. | ||
""" | ||
def __init__(self, arbiter_config) -> None: | ||
self.rules_dict = pd.read_csv("../data/smarts/alert_collection.csv") | ||
self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)] | ||
self.rules_list = self.rules_dict["smarts"].values.tolist() | ||
self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist() | ||
self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list] | ||
|
||
def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]: | ||
""" | ||
Applies the chosen filters (hologenicity, veber_infractions, | ||
ChEMBL structural alerts, ...) to a list of molecules. | ||
""" | ||
filtered_molecules = [] | ||
for molecule in molecules: | ||
if self.molecule_validity(molecule): | ||
filtered_molecules.append(molecule) | ||
return filtered_molecules | ||
|
||
def molecule_validity(self, molecule: Chem.Mol) -> bool: | ||
""" | ||
Checks if a given molecule passes through the chosen filters (hologenicity, | ||
veber_infractions, ChEMBL structural alerts, ...). | ||
""" | ||
toxicity = self.toxicity(molecule) | ||
hologenicity = self.hologenicity(molecule) | ||
veber_infraction = self.veber_infraction(molecule) | ||
validity = not (toxicity or hologenicity or veber_infraction) | ||
if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')): | ||
ring_infraction = self.ring_infraction(molecule) | ||
validity = validity and not (ring_infraction) | ||
return validity | ||
|
||
def toxicity(self, molecule: Chem.Mol) -> bool: | ||
""" | ||
Checks if a given molecule fails the structural filters. | ||
""" | ||
for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list): | ||
if len(molecule.GetSubstructMatches(pattern)) > tolerance: | ||
return True | ||
return False | ||
|
||
@staticmethod | ||
def hologenicity(molecule: Chem.Mol) -> bool: | ||
""" | ||
Checks if a given molecule fails the hologenicity filters. | ||
""" | ||
fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6 | ||
bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3 | ||
chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3 | ||
return chlorine_saturation or bromide_saturation or fluorine_saturation | ||
|
||
@staticmethod | ||
def ring_infraction(molecule: Chem.Mol) -> bool: | ||
""" | ||
Checks if a given molecule fails the ring infraction filters. | ||
""" | ||
ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]')) | ||
macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6 | ||
double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]')) | ||
return ring_allene or macro_cycle or double_bond_in_small_ring | ||
|
||
@staticmethod | ||
def veber_infraction(molecule: Chem.Mol) -> bool: | ||
""" | ||
Checks if a given molecule fails the veber infraction filters. | ||
""" | ||
rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10 | ||
hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10 | ||
return rotatable_bond_saturation or hydrogen_bond_saturation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import sys | ||
import numpy as np | ||
import pandas as pd | ||
from typing import List, Tuple | ||
|
||
from rdkit import Chem | ||
from rdkit import rdBase | ||
from rdkit import RDConfig | ||
rdBase.DisableLog('rdApp.error') | ||
|
||
from rdkit.Chem import AllChem | ||
from rdkit.Chem import Crippen | ||
from rdkit.Chem import Lipinski | ||
from rdkit.Chem import Descriptors | ||
from rdkit.Chem import rdMolDescriptors | ||
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity | ||
|
||
class descriptor: | ||
""" | ||
A strategy class for calculating the descriptor vector of a molecule. | ||
""" | ||
def __init__(self, config_descriptor) -> None: | ||
self.properties = [] | ||
self.ranges = config_descriptor.ranges | ||
self.property_names = config_descriptor.properties | ||
for name in self.property_names: | ||
module, fuction = name.split(".") | ||
module = getattr(sys.modules[__name__], module) | ||
self.properties.append(getattr(module, fuction)) | ||
return None | ||
|
||
def __call__(self, molecule: Chem.Mol) -> List[float]: | ||
""" | ||
Calculating the descriptor vector of a molecule. | ||
""" | ||
descriptor = [] | ||
for property, range in zip(self.properties, self.ranges): | ||
descriptor.append(self.rescale(property(molecule), range)) | ||
return descriptor | ||
|
||
@staticmethod | ||
def rescale(feature: List[float], range: List[float]) -> List[float]: | ||
""" | ||
Rescaling the feature to the unit range. | ||
""" | ||
rescaled_feature = (feature - range[0])/(range[1] - range[0]) | ||
return rescaled_feature | ||
|
||
class fitness: | ||
""" | ||
A strategy class for calculating the fitness of a molecule. | ||
""" | ||
def __init__(self, config_fitness) -> None: | ||
self.memoized_cache = dict() | ||
self.fingerprint_type = config_fitness.type | ||
self.target = Chem.MolFromSmiles(config_fitness.target) | ||
self.target_fingerprint = self.get_fingerprint(self.target, self.fingerprint_type) | ||
return None | ||
|
||
def __call__(self, molecule: Chem.Mol) -> float: | ||
smiles = Chem.MolToSmiles(molecule) | ||
if smiles in self.memoized_cache: | ||
fitness = self.memoized_cache[smiles] | ||
else: | ||
molecule_fingerprint = self.get_fingerprint(molecule, self.fingerprint_type) | ||
fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint) | ||
self.memoized_cache[smiles] = fitness | ||
return fitness | ||
|
||
def get_fingerprint(self, molecule: Chem.Mol, fingerprint_type: str): | ||
method_name = 'get_' + fingerprint_type | ||
method = getattr(self, method_name) | ||
if method is None: | ||
raise Exception('{} is not a supported fingerprint type.'.format(fingerprint_type)) | ||
return method(molecule) | ||
|
||
def get_ECFP4(self, molecule: Chem.Mol): | ||
return AllChem.GetMorganFingerprint(molecule, 2) | ||
|
||
def get_ECFP6(self, molecule: Chem.Mol): | ||
return AllChem.GetMorganFingerprint(molecule, 3) | ||
|
||
def get_FCFP4(self, molecule: Chem.Mol): | ||
return AllChem.GetMorganFingerprint(molecule, 2, useFeatures=True) | ||
|
||
def get_FCFP6(self, molecule: Chem.Mol): | ||
return AllChem.GetMorganFingerprint(molecule, 3, useFeatures=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import random | ||
import logging | ||
import numpy as np | ||
import pandas as pd | ||
from typing import List, Tuple | ||
|
||
from rdkit import Chem | ||
from rdkit import rdBase | ||
rdBase.DisableLog('rdApp.error') | ||
|
||
from rdkit.Chem import AllChem | ||
from rdkit.Chem import rdMMPA | ||
|
||
class mutator: | ||
""" | ||
A catalog class containing and implementing mutations to small molecules | ||
according to the principles of positional analogue scanning. | ||
""" | ||
def __init__(self) -> None: | ||
self.mutation_data = pd.read_csv("../data/smarts/mutation_collection.tsv", sep='\t') | ||
|
||
def __call__(self, molecule:Chem.Mol) -> List[Chem.Mol]: | ||
sampled_mutation = self.mutation_data.sample(n=1, weights='probability').iloc[0] | ||
reaction = AllChem.ReactionFromSmarts(sampled_mutation['smarts']) | ||
try: | ||
molecules = [products[0] for products in reaction.RunReactants([molecule])] | ||
except: | ||
molecules = [] | ||
return molecules | ||
|
||
class crossover: | ||
""" | ||
A strategy class implementing a parent-centric crossover of small molecules. | ||
""" | ||
def __init__(self): | ||
pass | ||
|
||
def __call__(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> List[Chem.Mol]: | ||
molecule_cores, molecule_sidechains = self.fragmentate(molecule_pair) | ||
molecules = self.merge(molecule_cores, molecule_sidechains) | ||
return molecules | ||
|
||
def merge(self, molecule_cores:List[Chem.Mol], molecule_sidechains:List[Chem.Mol]) -> List[Chem.Mol]: | ||
molecules = [] | ||
random.shuffle(molecule_sidechains) | ||
reaction = AllChem.ReactionFromSmarts('[*:1]-[1*].[1*]-[*:2]>>[*:1]-[*:2]') | ||
for core, sidechain in zip(molecule_cores, molecule_sidechains): | ||
molecules.append(reaction.RunReactants((core, sidechain))[0][0]) | ||
return molecules | ||
|
||
def fragmentate(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> Tuple[List[Chem.Mol], List[Chem.Mol]]: | ||
molecule_cores = [] | ||
molecule_sidechains = [] | ||
for molecule in molecule_pair: | ||
molecule_frags = rdMMPA.FragmentMol(molecule, maxCuts=1, resultsAsMols=False) | ||
_, molecule_frags = map(list, zip(*molecule_frags)) | ||
for molecule_pair in molecule_frags: | ||
core, sidechain = molecule_pair.split(".") | ||
molecule_cores.append(Chem.MolFromSmiles(core.replace("[*:1]", "[1*]"))) | ||
molecule_sidechains.append(Chem.MolFromSmiles(sidechain.replace("[*:1]", "[1*]"))) | ||
return molecule_cores, molecule_sidechains |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"data_file":"../data/smiles/ZINC_first_1000.smi", | ||
"batch_size":40, | ||
"initial_size":150, | ||
"archive":{ | ||
"name":"../results/test", | ||
"size":150, | ||
"accuracy":25000 | ||
}, | ||
"descriptor":{ | ||
"properties":["Descriptors.ExactMolWt", "Descriptors.MolLogP", "Descriptors.TPSA", "Crippen.MolMR"], | ||
"ranges":[[225.0, 555.0], [-0.5, 5.5], [0.0, 140.0], [40.0, 130.0]] | ||
}, | ||
"scoring":{ | ||
"cache_size":2500, | ||
"cores":4 | ||
}, | ||
"fitness":{ | ||
"target":"Cc1c(C)c2OC(C)(COc3ccc(CC4SC(=O)NC4=O)cc3)CCc2c(C)c1O", | ||
"type":"ECFP4" | ||
}, | ||
"arbiter":{ | ||
"rules":["Glaxo"] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
,jonas,jonas-K55VD,26.03.2020 15:29,file:///home/jonas/.config/libreoffice/4; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Sample Package Data | ||
|
||
This directory contains sample additional data you may want to include with your package. | ||
This is a place where non-code related additional information (such as data files, molecular structures, etc.) can | ||
go that you want to ship alongside your code. | ||
|
||
Please note that it is not recommended to place large files in your git directory. If your project requires files larger | ||
than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files | ||
as the `git` structure is unable to correctly take updates to these files and will store a complete copy of every version | ||
in your `git` history which can quickly add up. As a note most `git` hosting services like GitHub have a 1 GB per repository | ||
cap. | ||
|
||
## Including package data | ||
|
||
Modify your package's `setup.py` file and the `setup()` command. Include the | ||
[`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the | ||
correct files. | ||
|
||
## Manifest | ||
|
||
* `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150) |
Oops, something went wrong.