Initial upload (part 1)

Jonas-Verhellen · Jul 2, 2020 · f07d17a · f07d17a
1 parent 63ddcba
commit f07d17a
Show file tree

Hide file tree

Showing 28 changed files with 252,705 additions and 0 deletions.
diff --git a/argenomic/__pycache__/infrastructure.cpython-36.pyc b/argenomic/__pycache__/infrastructure.cpython-36.pyc
diff --git a/argenomic/__pycache__/mechanism.cpython-36.pyc b/argenomic/__pycache__/mechanism.cpython-36.pyc
diff --git a/argenomic/__pycache__/mechanism.cpython-37.pyc b/argenomic/__pycache__/mechanism.cpython-37.pyc
diff --git a/argenomic/__pycache__/operations.cpython-36.pyc b/argenomic/__pycache__/operations.cpython-36.pyc
diff --git a/argenomic/infrastructure.py b/argenomic/infrastructure.py
@@ -0,0 +1,165 @@
+import csv
+import random
+import itertools
+
+import numpy as np
+import pandas as pd
+from typing import List, Tuple
+
+from datetime import datetime
+from sklearn.cluster import KMeans
+from sklearn.neighbors import KDTree
+
+from rdkit import Chem
+from rdkit import rdBase
+from rdkit.Chem import AllChem
+rdBase.DisableLog('rdApp.error')
+from rdkit.Chem import Lipinski
+
+class elite():
+    def __init__(self, index, descriptor):
+        self.index = index
+        self.fitness = 0.0
+        self.molecule = None
+        self.descriptor = descriptor
+
+    def update(self, fitness, molecule, descriptor):
+        if self.fitness < fitness:
+            self.fitness = fitness
+            self.molecule = molecule
+            self.descriptor = descriptor
+        return None
+
+class archive:
+    def __init__(self, archive_config, descriptor_config) -> None:
+        self.archive_name = archive_config.name
+        self.archive_size = archive_config.size
+        kmeans = KMeans(n_clusters=self.archive_size, n_jobs=-1)
+        kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, len(descriptor_config.properties)))
+        self.cvt_centers = kmeans.cluster_centers_
+        self.cvt = KDTree(self.cvt_centers, metric='euclidean')
+        self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)]
+        with open('{}/statistics.csv'.format(self.archive_name), 'w') as file:
+            file.write("## Argenomic Statistics File: {}".format(datetime.now()))
+            file.close()
+        return None
+
+    def cvt_index(self, descriptor: List[float]) -> int:
+        return self.cvt.query([descriptor], k=1)[1][0][0]
+
+    def add_to_archive(self, molecules: List[Chem.Mol], descriptors: List[List[float]], fitnesses: List[float]) -> None:
+        for molecule, descriptor, fitness in zip(molecules, descriptors, fitnesses):
+            self.elites[self.cvt_index(descriptor)].update(fitness, molecule, descriptor)
+        return None
+
+    def sample(self, size: int) -> List[Chem.Mol]:
+        pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
+        molecules, weights = map(list, zip(*pairs))
+        return random.choices(molecules, k=size, weights=weights)
+
+    def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
+        pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
+        molecules, weights = map(list, zip(*pairs))
+        sample_molecules = random.choices(molecules, k=size, weights=weights)
+        sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True)
+        sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]
+        return sample_pairs
+
+    def store_archive(self, generation: float) -> None:
+        elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
+        data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
+        pd.DataFrame(data=data).to_csv("{}/archive_{}.csv".format(self.archive_name, generation), index=False)
+        return None
+
+    def store_statistics(self, generation: float) -> None:
+        elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
+        fractional_size = len(elites_smiles)/self.archive_size
+        statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size]
+        with open('{}/statistics.csv'.format(self.archive_name), 'a') as file:
+            csv.writer(file).writerow(statistics)
+        print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4]))
+        print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3]))
+        return None
+
+    def elites_data(self) -> Tuple[List[str], List[float], List[float]]:
+        elites_list = [elite for elite in self.elites if elite.molecule]
+        elites_smiles = [Chem.MolToSmiles(elite.molecule) for elite in elites_list]
+        elites_descriptors = [elite.descriptor for elite in elites_list]
+        elites_fitnesses = [elite.fitness for elite in elites_list]
+        return elites_smiles, elites_descriptors, elites_fitnesses
+
+
+class arbiter:
+    """
+    A catalog class containing different druglike filters for small molecules.
+    Includes the option to run the structural filters from ChEMBL.
+    """
+    def __init__(self, arbiter_config) -> None:
+      self.rules_dict = pd.read_csv("../data/smarts/alert_collection.csv")
+      self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
+      self.rules_list = self.rules_dict["smarts"].values.tolist()
+      self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
+      self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]
+
+    def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]:
+      """
+      Applies the chosen filters (hologenicity, veber_infractions,
+      ChEMBL structural alerts, ...) to a list of molecules.
+      """
+      filtered_molecules = []
+      for molecule in molecules:
+        if self.molecule_validity(molecule):
+          filtered_molecules.append(molecule)
+      return filtered_molecules
+
+    def molecule_validity(self, molecule: Chem.Mol) -> bool:
+      """
+      Checks if a given molecule passes through the chosen filters (hologenicity,
+      veber_infractions, ChEMBL structural alerts, ...).
+      """
+      toxicity = self.toxicity(molecule)
+      hologenicity = self.hologenicity(molecule)
+      veber_infraction = self.veber_infraction(molecule)
+      validity = not (toxicity or hologenicity or veber_infraction)
+      if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
+        ring_infraction = self.ring_infraction(molecule)
+        validity = validity and not (ring_infraction)
+      return validity
+
+    def toxicity(self, molecule: Chem.Mol) -> bool:
+      """
+      Checks if a given molecule fails the structural filters.
+      """
+      for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
+            if len(molecule.GetSubstructMatches(pattern)) > tolerance:
+              return True
+      return False
+
+    @staticmethod
+    def hologenicity(molecule: Chem.Mol) -> bool:
+      """
+      Checks if a given molecule fails the hologenicity filters.
+      """
+      fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
+      bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
+      chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
+      return chlorine_saturation or bromide_saturation or fluorine_saturation
+
+    @staticmethod
+    def ring_infraction(molecule: Chem.Mol) -> bool:
+      """
+      Checks if a given molecule fails the ring infraction filters.
+      """
+      ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
+      macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
+      double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
+      return ring_allene or macro_cycle or double_bond_in_small_ring
+
+    @staticmethod
+    def veber_infraction(molecule: Chem.Mol) -> bool:
+      """
+      Checks if a given molecule fails the veber infraction filters.
+      """
+      rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
+      hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
+      return rotatable_bond_saturation or hydrogen_bond_saturation
diff --git a/argenomic/mechanism.py b/argenomic/mechanism.py
@@ -0,0 +1,87 @@
+import sys
+import numpy as np
+import pandas as pd
+from typing import List, Tuple
+
+from rdkit import Chem
+from rdkit import rdBase
+from rdkit import RDConfig
+rdBase.DisableLog('rdApp.error')
+
+from rdkit.Chem import AllChem
+from rdkit.Chem import Crippen
+from rdkit.Chem import Lipinski
+from rdkit.Chem import Descriptors
+from rdkit.Chem import rdMolDescriptors
+from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
+
+class descriptor:
+    """
+    A strategy class for calculating the descriptor vector of a molecule.
+    """
+    def __init__(self, config_descriptor) -> None:
+        self.properties = []
+        self.ranges = config_descriptor.ranges
+        self.property_names = config_descriptor.properties
+        for name in self.property_names:
+            module, fuction = name.split(".")
+            module = getattr(sys.modules[__name__], module)
+            self.properties.append(getattr(module, fuction))
+        return None
+
+    def __call__(self, molecule: Chem.Mol) -> List[float]:
+        """
+        Calculating the descriptor vector of a molecule.
+        """
+        descriptor = []
+        for property, range in zip(self.properties, self.ranges):
+            descriptor.append(self.rescale(property(molecule), range))
+        return descriptor
+
+    @staticmethod
+    def rescale(feature: List[float], range: List[float]) -> List[float]:
+        """
+        Rescaling the feature to the unit range.
+        """
+        rescaled_feature = (feature - range[0])/(range[1] - range[0])
+        return rescaled_feature
+
+class fitness:
+    """
+    A strategy class for calculating the fitness of a molecule.
+    """
+    def __init__(self, config_fitness) -> None:
+        self.memoized_cache = dict()
+        self.fingerprint_type = config_fitness.type
+        self.target = Chem.MolFromSmiles(config_fitness.target)
+        self.target_fingerprint = self.get_fingerprint(self.target, self.fingerprint_type)
+        return None
+
+    def __call__(self, molecule: Chem.Mol) -> float:
+        smiles = Chem.MolToSmiles(molecule)
+        if smiles in self.memoized_cache:
+            fitness = self.memoized_cache[smiles]
+        else:
+            molecule_fingerprint = self.get_fingerprint(molecule, self.fingerprint_type)
+            fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
+            self.memoized_cache[smiles] = fitness
+        return fitness
+
+    def get_fingerprint(self, molecule: Chem.Mol, fingerprint_type: str):
+        method_name = 'get_' + fingerprint_type
+        method = getattr(self, method_name)
+        if method is None:
+            raise Exception('{} is not a supported fingerprint type.'.format(fingerprint_type))
+        return method(molecule)
+
+    def get_ECFP4(self, molecule: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecule, 2)
+
+    def get_ECFP6(self, molecule: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecule, 3)
+
+    def get_FCFP4(self, molecule: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecule, 2, useFeatures=True)
+
+    def get_FCFP6(self, molecule: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecule, 3, useFeatures=True)
diff --git a/argenomic/operations.py b/argenomic/operations.py
@@ -0,0 +1,61 @@
+import random
+import logging
+import numpy as np
+import pandas as pd
+from typing import List, Tuple
+
+from rdkit import Chem
+from rdkit import rdBase
+rdBase.DisableLog('rdApp.error')
+
+from rdkit.Chem import AllChem
+from rdkit.Chem import rdMMPA
+
+class mutator:
+    """
+    A catalog class containing and implementing mutations to small molecules
+    according to the principles of positional analogue scanning.
+    """
+    def __init__(self) -> None:
+        self.mutation_data = pd.read_csv("../data/smarts/mutation_collection.tsv", sep='\t')
+
+    def __call__(self, molecule:Chem.Mol) -> List[Chem.Mol]:
+        sampled_mutation = self.mutation_data.sample(n=1, weights='probability').iloc[0]
+        reaction = AllChem.ReactionFromSmarts(sampled_mutation['smarts'])
+        try:
+            molecules = [products[0] for products in reaction.RunReactants([molecule])]
+        except:
+            molecules = []
+        return molecules
+
+class crossover:
+    """
+    A strategy class implementing a parent-centric crossover of small molecules.
+    """
+    def __init__(self):
+        pass
+
+    def __call__(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> List[Chem.Mol]:
+        molecule_cores, molecule_sidechains = self.fragmentate(molecule_pair)
+        molecules = self.merge(molecule_cores, molecule_sidechains)
+        return molecules
+
+    def merge(self, molecule_cores:List[Chem.Mol], molecule_sidechains:List[Chem.Mol]) -> List[Chem.Mol]:
+        molecules = []
+        random.shuffle(molecule_sidechains)
+        reaction = AllChem.ReactionFromSmarts('[*:1]-[1*].[1*]-[*:2]>>[*:1]-[*:2]')
+        for core, sidechain in zip(molecule_cores, molecule_sidechains):
+            molecules.append(reaction.RunReactants((core, sidechain))[0][0])
+        return molecules
+
+    def fragmentate(self, molecule_pair:Tuple[Chem.Mol, Chem.Mol]) -> Tuple[List[Chem.Mol], List[Chem.Mol]]:
+        molecule_cores = []
+        molecule_sidechains = []
+        for molecule in molecule_pair:
+            molecule_frags = rdMMPA.FragmentMol(molecule, maxCuts=1, resultsAsMols=False)
+            _, molecule_frags = map(list, zip(*molecule_frags))
+            for molecule_pair in molecule_frags:
+                core, sidechain = molecule_pair.split(".")
+                molecule_cores.append(Chem.MolFromSmiles(core.replace("[*:1]", "[1*]")))
+                molecule_sidechains.append(Chem.MolFromSmiles(sidechain.replace("[*:1]", "[1*]")))
+        return molecule_cores, molecule_sidechains
diff --git a/configuration/config.json b/configuration/config.json
@@ -0,0 +1,25 @@
+{
+    "data_file":"../data/smiles/ZINC_first_1000.smi",
+    "batch_size":40,
+    "initial_size":150,
+    "archive":{
+        "name":"../results/test",
+        "size":150,
+        "accuracy":25000
+    },
+    "descriptor":{
+        "properties":["Descriptors.ExactMolWt", "Descriptors.MolLogP", "Descriptors.TPSA", "Crippen.MolMR"],
+        "ranges":[[225.0, 555.0], [-0.5, 5.5], [0.0, 140.0], [40.0, 130.0]]
+    },
+    "scoring":{
+        "cache_size":2500,
+        "cores":4
+    },
+    "fitness":{
+        "target":"Cc1c(C)c2OC(C)(COc3ccc(CC4SC(=O)NC4=O)cc3)CCc2c(C)c1O",
+        "type":"ECFP4"
+    },
+    "arbiter":{
+        "rules":["Glaxo"]
+    }
+}
diff --git a/data/.~lock.smiles_targets.ods# b/data/.~lock.smiles_targets.ods#
@@ -0,0 +1 @@
+,jonas,jonas-K55VD,26.03.2020 15:29,file:///home/jonas/.config/libreoffice/4;
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,21 @@
+# Sample Package Data
+
+This directory contains sample additional data you may want to include with your package.
+This is a place where non-code related additional information (such as data files, molecular structures,  etc.) can 
+go that you want to ship alongside your code.
+
+Please note that it is not recommended to place large files in your git directory. If your project requires files larger
+than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files
+as the `git` structure is unable to correctly take updates to these files and will store a complete copy of every version
+in your `git` history which can quickly add up. As a note most `git` hosting services like GitHub have a 1 GB per repository
+cap.
+
+## Including package data
+
+Modify your package's `setup.py` file and the `setup()` command. Include the 
+[`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the 
+correct files.
+
+## Manifest
+
+* `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		,jonas,jonas-K55VD,26.03.2020 15:29,file:///home/jonas/.config/libreoffice/4;