Skip to content

Commit

Permalink
Initial upload (part 1)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonas Verhellen authored and Jonas Verhellen committed Jul 2, 2020
1 parent 63ddcba commit f07d17a
Show file tree
Hide file tree
Showing 28 changed files with 252,705 additions and 0 deletions.
Binary file not shown.
Binary file added argenomic/__pycache__/mechanism.cpython-36.pyc
Binary file not shown.
Binary file added argenomic/__pycache__/mechanism.cpython-37.pyc
Binary file not shown.
Binary file added argenomic/__pycache__/operations.cpython-36.pyc
Binary file not shown.
165 changes: 165 additions & 0 deletions argenomic/infrastructure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import csv
import random
import itertools

import numpy as np
import pandas as pd
from typing import List, Tuple

from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree

from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import AllChem
rdBase.DisableLog('rdApp.error')
from rdkit.Chem import Lipinski

class elite():
def __init__(self, index, descriptor):
self.index = index
self.fitness = 0.0
self.molecule = None
self.descriptor = descriptor

def update(self, fitness, molecule, descriptor):
if self.fitness < fitness:
self.fitness = fitness
self.molecule = molecule
self.descriptor = descriptor
return None

class archive:
def __init__(self, archive_config, descriptor_config) -> None:
self.archive_name = archive_config.name
self.archive_size = archive_config.size
kmeans = KMeans(n_clusters=self.archive_size, n_jobs=-1)
kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, len(descriptor_config.properties)))
self.cvt_centers = kmeans.cluster_centers_
self.cvt = KDTree(self.cvt_centers, metric='euclidean')
self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)]
with open('{}/statistics.csv'.format(self.archive_name), 'w') as file:
file.write("## Argenomic Statistics File: {}".format(datetime.now()))
file.close()
return None

def cvt_index(self, descriptor: List[float]) -> int:
return self.cvt.query([descriptor], k=1)[1][0][0]

def add_to_archive(self, molecules: List[Chem.Mol], descriptors: List[List[float]], fitnesses: List[float]) -> None:
for molecule, descriptor, fitness in zip(molecules, descriptors, fitnesses):
self.elites[self.cvt_index(descriptor)].update(fitness, molecule, descriptor)
return None

def sample(self, size: int) -> List[Chem.Mol]:
pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
molecules, weights = map(list, zip(*pairs))
return random.choices(molecules, k=size, weights=weights)

def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
molecules, weights = map(list, zip(*pairs))
sample_molecules = random.choices(molecules, k=size, weights=weights)
sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True)
sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]
return sample_pairs

def store_archive(self, generation: float) -> None:
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
pd.DataFrame(data=data).to_csv("{}/archive_{}.csv".format(self.archive_name, generation), index=False)
return None

def store_statistics(self, generation: float) -> None:
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
fractional_size = len(elites_smiles)/self.archive_size
statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size]
with open('{}/statistics.csv'.format(self.archive_name), 'a') as file:
csv.writer(file).writerow(statistics)
print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4]))
print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3]))
return None

def elites_data(self) -> Tuple[List[str], List[float], List[float]]:
elites_list = [elite for elite in self.elites if elite.molecule]
elites_smiles = [Chem.MolToSmiles(elite.molecule) for elite in elites_list]
elites_descriptors = [elite.descriptor for elite in elites_list]
elites_fitnesses = [elite.fitness for elite in elites_list]
return elites_smiles, elites_descriptors, elites_fitnesses


class arbiter:
"""
A catalog class containing different druglike filters for small molecules.
Includes the option to run the structural filters from ChEMBL.
"""
def __init__(self, arbiter_config) -> None:
self.rules_dict = pd.read_csv("../data/smarts/alert_collection.csv")
self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
self.rules_list = self.rules_dict["smarts"].values.tolist()
self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]

def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]:
"""
Applies the chosen filters (hologenicity, veber_infractions,
ChEMBL structural alerts, ...) to a list of molecules.
"""
filtered_molecules = []
for molecule in molecules:
if self.molecule_validity(molecule):
filtered_molecules.append(molecule)
return filtered_molecules

def molecule_validity(self, molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule passes through the chosen filters (hologenicity,
veber_infractions, ChEMBL structural alerts, ...).
"""
toxicity = self.toxicity(molecule)
hologenicity = self.hologenicity(molecule)
veber_infraction = self.veber_infraction(molecule)
validity = not (toxicity or hologenicity or veber_infraction)
if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
ring_infraction = self.ring_infraction(molecule)
validity = validity and not (ring_infraction)
return validity

def toxicity(self, molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the structural filters.
"""
for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
if len(molecule.GetSubstructMatches(pattern)) > tolerance:
return True
return False

@staticmethod
def hologenicity(molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the hologenicity filters.
"""
fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
return chlorine_saturation or bromide_saturation or fluorine_saturation

@staticmethod
def ring_infraction(molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the ring infraction filters.
"""
ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
return ring_allene or macro_cycle or double_bond_in_small_ring

@staticmethod
def veber_infraction(molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the veber infraction filters.
"""
rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
return rotatable_bond_saturation or hydrogen_bond_saturation
87 changes: 87 additions & 0 deletions argenomic/mechanism.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import sys
import numpy as np
import pandas as pd
from typing import List, Tuple

from rdkit import Chem
from rdkit import rdBase
from rdkit import RDConfig
rdBase.DisableLog('rdApp.error')

from rdkit.Chem import AllChem
from rdkit.Chem import Crippen
from rdkit.Chem import Lipinski
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity

class descriptor:
"""
A strategy class for calculating the descriptor vector of a molecule.
"""
def __init__(self, config_descriptor) -> None:
self.properties = []
self.ranges = config_descriptor.ranges
self.property_names = config_descriptor.properties
for name in self.property_names:
module, fuction = name.split(".")
module = getattr(sys.modules[__name__], module)
self.properties.append(getattr(module, fuction))
return None

def __call__(self, molecule: Chem.Mol) -> List[float]:
"""
Calculating the descriptor vector of a molecule.
"""
descriptor = []
for property, range in zip(self.properties, self.ranges):
descriptor.append(self.rescale(property(molecule), range))
return descriptor

@staticmethod
def rescale(feature: List[float], range: List[float]) -> List[float]:
"""
Rescaling the feature to the unit range.
"""
rescaled_feature = (feature - range[0])/(range[1] - range[0])
return rescaled_feature

class fitness:
"""
A strategy class for calculating the fitness of a molecule.
"""
def __init__(self, config_fitness) -> None:
self.memoized_cache = dict()
self.fingerprint_type = config_fitness.type
self.target = Chem.MolFromSmiles(config_fitness.target)
self.target_fingerprint = self.get_fingerprint(self.target, self.fingerprint_type)
return None

def __call__(self, molecule: Chem.Mol) -> float:
smiles = Chem.MolToSmiles(molecule)
if smiles in self.memoized_cache:
fitness = self.memoized_cache[smiles]
else:
molecule_fingerprint = self.get_fingerprint(molecule, self.fingerprint_type)
fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
self.memoized_cache[smiles] = fitness
return fitness

def get_fingerprint(self, molecule: Chem.Mol, fingerprint_type: str):
method_name = 'get_' + fingerprint_type
method = getattr(self, method_name)
if method is None:
raise Exception('{} is not a supported fingerprint type.'.format(fingerprint_type))
return method(molecule)

def get_ECFP4(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 2)

def get_ECFP6(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 3)

def get_FCFP4(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 2, useFeatures=True)

def get_FCFP6(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 3, useFeatures=True)
61 changes: 61 additions & 0 deletions argenomic/operations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import random
import logging
import numpy as np
import pandas as pd
from typing import List, Tuple

from rdkit import Chem
from rdkit import rdBase
rdBase.DisableLog('rdApp.error')

from rdkit.Chem import AllChem
from rdkit.Chem import rdMMPA

class mutator:
"""
A catalog class containing and implementing mutations to small molecules
according to the principles of positional analogue scanning.
"""
def __init__(self) -> None:
self.mutation_data = pd.read_csv("../data/smarts/mutation_collection.tsv", sep='\t')

def __call__(self, molecule:Chem.Mol) -> List[Chem.Mol]:
sampled_mutation = self.mutation_data.sample(n=1, weights='probability').iloc[0]
reaction = AllChem.ReactionFromSmarts(sampled_mutation['smarts'])
try:
molecules = [products[0] for products in reaction.RunReactants([molecule])]
except:
molecules = []
return molecules

class crossover:
"""
A strategy class implementing a parent-centric crossover of small molecules.
"""
def __init__(self):
pass

def __call__(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> List[Chem.Mol]:
molecule_cores, molecule_sidechains = self.fragmentate(molecule_pair)
molecules = self.merge(molecule_cores, molecule_sidechains)
return molecules

def merge(self, molecule_cores:List[Chem.Mol], molecule_sidechains:List[Chem.Mol]) -> List[Chem.Mol]:
molecules = []
random.shuffle(molecule_sidechains)
reaction = AllChem.ReactionFromSmarts('[*:1]-[1*].[1*]-[*:2]>>[*:1]-[*:2]')
for core, sidechain in zip(molecule_cores, molecule_sidechains):
molecules.append(reaction.RunReactants((core, sidechain))[0][0])
return molecules

def fragmentate(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> Tuple[List[Chem.Mol], List[Chem.Mol]]:
molecule_cores = []
molecule_sidechains = []
for molecule in molecule_pair:
molecule_frags = rdMMPA.FragmentMol(molecule, maxCuts=1, resultsAsMols=False)
_, molecule_frags = map(list, zip(*molecule_frags))
for molecule_pair in molecule_frags:
core, sidechain = molecule_pair.split(".")
molecule_cores.append(Chem.MolFromSmiles(core.replace("[*:1]", "[1*]")))
molecule_sidechains.append(Chem.MolFromSmiles(sidechain.replace("[*:1]", "[1*]")))
return molecule_cores, molecule_sidechains
25 changes: 25 additions & 0 deletions configuration/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"data_file":"../data/smiles/ZINC_first_1000.smi",
"batch_size":40,
"initial_size":150,
"archive":{
"name":"../results/test",
"size":150,
"accuracy":25000
},
"descriptor":{
"properties":["Descriptors.ExactMolWt", "Descriptors.MolLogP", "Descriptors.TPSA", "Crippen.MolMR"],
"ranges":[[225.0, 555.0], [-0.5, 5.5], [0.0, 140.0], [40.0, 130.0]]
},
"scoring":{
"cache_size":2500,
"cores":4
},
"fitness":{
"target":"Cc1c(C)c2OC(C)(COc3ccc(CC4SC(=O)NC4=O)cc3)CCc2c(C)c1O",
"type":"ECFP4"
},
"arbiter":{
"rules":["Glaxo"]
}
}
1 change: 1 addition & 0 deletions data/.~lock.smiles_targets.ods#
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
,jonas,jonas-K55VD,26.03.2020 15:29,file:///home/jonas/.config/libreoffice/4;
21 changes: 21 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Sample Package Data

This directory contains sample additional data you may want to include with your package.
This is a place where non-code related additional information (such as data files, molecular structures, etc.) can
go that you want to ship alongside your code.

Please note that it is not recommended to place large files in your git directory. If your project requires files larger
than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files
as the `git` structure is unable to correctly take updates to these files and will store a complete copy of every version
in your `git` history which can quickly add up. As a note most `git` hosting services like GitHub have a 1 GB per repository
cap.

## Including package data

Modify your package's `setup.py` file and the `setup()` command. Include the
[`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the
correct files.

## Manifest

* `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150)
Loading

0 comments on commit f07d17a

Please sign in to comment.