feat: adding SMILES standardization utilities. [bump minor]

Signed-off-by: Matteo Manica <tte@zurich.ibm.com>
rxn4chemistry · Feb 9, 2024 · c3d9993 · c3d9993
1 parent 27e97cb
commit c3d9993
Show file tree

Hide file tree

Showing 6 changed files with 234 additions and 3 deletions.
diff --git a/src/rxn/chemutils/multicomponent_smiles.py b/src/rxn/chemutils/multicomponent_smiles.py
@@ -2,6 +2,7 @@
 Utilities related to "multi-component SMILES", i.e. strings containing multiple compounds
 in SMILES notation, which may include fragment bonds.
 """
+
 from functools import partial
 from typing import Callable, Iterable, List, Optional
 

diff --git a/src/rxn/chemutils/reaction_smiles.py b/src/rxn/chemutils/reaction_smiles.py
@@ -4,6 +4,7 @@
 In a separate file than miscellaneous.py or conversion.py in order to avoid
 cyclic dependencies.
 """
+
 from enum import auto
 
 from rxn.utilities.types import RxnEnum

diff --git a/src/rxn/chemutils/smiles_standardization.py b/src/rxn/chemutils/smiles_standardization.py
@@ -0,0 +1,146 @@
+import logging
+from typing import Optional
+
+from .conversion import inchi_to_mol, mol_to_inchi, mol_to_smiles, smiles_to_mol
+from .exceptions import InvalidInchi, InvalidSmiles
+
+# Module-level logger. The NullHandler is a no-op handler; records still
+# propagate to whatever handlers the importing application configures.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+
+# Separator between the two sides of a reaction SMILES.
+# NOTE(review): not referenced by the functions visible in this hunk.
+RXN_SMILES_SEPARATOR = ">>"
+
+
+def standardize_smiles(
+    smiles: str,
+    canonicalize: bool = True,
+    sanitize: bool = True,
+    find_radicals: bool = True,
+    inchify: bool = False,
+) -> str:
+    """Ensure that a SMILES follows a desired standard.
+
+    It allows canonicalization, sanitization and inchification, keeping
+    stereochemistry with isomericSmiles=True.
+    It can process multiple molecules separated by ".".
+    Note that inchify set to True will also canonicalize the molecule, even
+    when canonicalize is False.
+
+    The input is always parsed first, so an invalid SMILES raises even when
+    both canonicalize and inchify are False.
+
+    Args:
+        smiles (str): SMILES representation of a molecule.
+        canonicalize (bool): canonicalize SMILES. Defaults to True.
+        sanitize (bool): sanitize SMILES; forwarded to `smiles_to_mol`.
+            Defaults to True.
+        find_radicals (bool): forwarded unchanged to `smiles_to_mol`.
+            Defaults to True.
+        inchify (bool): inchify the SMILES, i.e. round-trip the molecule
+            through InChI before writing it back as SMILES. Defaults to False.
+
+    Returns:
+        a SMILES following the desired standard:
+            - inchify=True: the SMILES of the molecule rebuilt from its InChI;
+              if either InChI conversion fails, the error is logged and the
+              SMILES of the parsed molecule is returned instead.
+            - inchify=False, canonicalize=True: the SMILES written from the
+              parsed molecule.
+            - both False: the input string, unchanged.
+
+    Raises:
+        InvalidSmiles: if the input cannot be parsed (logged, then re-raised).
+    """
+    try:
+        molecule = smiles_to_mol(smiles, sanitize=sanitize, find_radicals=find_radicals)
+    except InvalidSmiles:
+        logger.error(f"SMILES parsing failure: {smiles}.")
+        raise
+
+    if inchify:
+        try:
+            inchi_string = mol_to_inchi(molecule)
+        except InvalidInchi:
+            # Best effort: fall back to the SMILES of the parsed molecule
+            # instead of failing the whole call.
+            logger.error(
+                f"Inchification failure for SMILES: {smiles}. Returning its canonical version."
+            )
+            return mol_to_smiles(molecule, isomericSmiles=True)
+        else:
+            # canonical=True is passed explicitly: after the InChI round trip
+            # a non-canonical (input-ordered) output cannot be guaranteed.
+            try:
+                molecule_from_inchi = inchi_to_mol(inchi_string)
+            except InvalidInchi:
+                # Same fallback as above, using the molecule parsed from the
+                # original SMILES.
+                logger.error(
+                    f"De-inchification failure for InChi: {inchi_string}. Returning its canonical version."
+                )
+                return mol_to_smiles(molecule, isomericSmiles=True)
+            return mol_to_smiles(molecule_from_inchi, canonical=True)
+    if canonicalize:
+        return mol_to_smiles(molecule, isomericSmiles=True)
+    else:
+        # Nothing to rewrite: the parse above only served as validation.
+        return smiles
+
+
+def standardize_molecules(
+    molecules: str,
+    canonicalize: bool = True,
+    sanitize: bool = True,
+    inchify: bool = False,
+    fragment_bond: str = "~",
+    ordered_precursors: bool = True,
+    molecule_token_delimiter: Optional[str] = None,
+    is_enzymatic: bool = False,
+    enzyme_separator: str = "|",
+) -> str:
+    """Ensure that a set of molecules represented by a string follows a desired standard.
+
+    Processing steps, in order:
+        1. If `is_enzymatic`, split off the enzyme part after `enzyme_separator`
+           and set it aside; it is appended back, untouched, at the end.
+        2. If `molecule_token_delimiter` is given, remove every occurrence of it.
+        3. If the string contains `fragment_bond`, split it on ".", standardize
+           each molecule on its own (with the fragment bond temporarily replaced
+           by "."), and optionally sort the results.
+           Otherwise, either standardize the whole string in one call
+           (`ordered_precursors=True`) or molecule by molecule, keeping the
+           input order (`ordered_precursors=False`).
+
+    `standardize_smiles` is always called with its default `find_radicals`.
+
+    Args:
+        molecules: molecules SMILES. Molecules can be separated via a ".".
+            Fragments are supported with a custom `fragment_bond`.
+        canonicalize: canonicalize SMILES. Defaults to True.
+        sanitize: sanitize SMILES. Defaults to True.
+        inchify: inchify the SMILES. Defaults to False.
+        fragment_bond: fragment bond. Defaults to '~'.
+        ordered_precursors: order precursors. Defaults to True.
+        molecule_token_delimiter: delimiter for big molecule tokens. Defaults to None
+        is_enzymatic: the molecules are representing an enzymatic reaction. Defaults to False.
+        enzyme_separator: separator for molecules and the enzyme. Defaults to '|'.
+
+    Returns:
+        standardized molecules.
+
+    Raises:
+        InvalidSmiles: propagated from `standardize_smiles` when a molecule
+            cannot be parsed, e.g. because it still contains a fragment bond
+            or token delimiter that was not declared through the arguments.
+
+    Examples:
+        Standardize multiple molecules:
+        >>> standardize_molecules('CCO.CC')
+        'CC.CCO'
+        Standardize multiple molecules including fragment information:
+        >>> standardize_molecules('CCO.CC~C')
+        'CCO.C~CC'
+    """
+    enzyme = ""
+    if is_enzymatic:
+        splitted_molecules = molecules.split(enzyme_separator)
+        molecules = splitted_molecules[0]
+        if len(splitted_molecules) > 1:
+            # NOTE(review): only the first segment after the separator is kept;
+            # anything following a second separator is dropped.
+            enzyme = splitted_molecules[1]
+            enzyme = "{}{}".format(enzyme_separator, enzyme)
+    if molecule_token_delimiter is not None:
+        molecules = molecules.replace(molecule_token_delimiter, "")
+    if fragment_bond in molecules:
+        standardized_molecules_list = [
+            # make sure we remove the fragment to have valid SMILES;
+            # afterwards every "." in the result is turned back into the fragment bond
+            standardize_smiles(
+                molecule.replace(fragment_bond, "."),
+                canonicalize=canonicalize,
+                sanitize=sanitize,
+                inchify=inchify,
+            ).replace(".", fragment_bond)
+            for molecule in molecules.split(".")
+        ]
+        if ordered_precursors:
+            # plain lexicographic sort of the standardized strings
+            standardized_molecules_list = sorted(standardized_molecules_list)
+        standardized_molecules = ".".join(standardized_molecules_list)
+    else:
+        if ordered_precursors:
+            # RDKit guarantees ordered precursors
+            # NOTE(review): with canonicalize=False and inchify=False,
+            # standardize_smiles returns its input unchanged, so no
+            # reordering happens in that case.
+            standardized_molecules = standardize_smiles(
+                molecules,
+                canonicalize=canonicalize,
+                sanitize=sanitize,
+                inchify=inchify,
+            )
+        else:
+            # standardize one molecule at a time so the input order is preserved
+            standardized_molecules_list = [
+                standardize_smiles(
+                    molecule,
+                    canonicalize=canonicalize,
+                    sanitize=sanitize,
+                    inchify=inchify,
+                )
+                for molecule in molecules.split(".")
+            ]
+            standardized_molecules = ".".join(standardized_molecules_list)
+    # add optional enzyme information
+    standardized_molecules = "{}{}".format(standardized_molecules, enzyme)
+    return standardized_molecules
diff --git a/src/rxn/chemutils/tokenization.py b/src/rxn/chemutils/tokenization.py
@@ -28,8 +28,8 @@ def __init__(self, title: str, detail: str):
         Initialize TokenizationError.
 
         Args:
-            title (str): title of the error.
-            detail (str): decscription of the error.
+            title: title of the error.
+            detail: description of the error.
         """
         self.type = "TokenizationError"
         self.title = title

diff --git a/src/rxn/chemutils/utils.py b/src/rxn/chemutils/utils.py
@@ -1,6 +1,5 @@
 """Simple utilities not involving RDKit."""
 
-
 import re
 
 

diff --git a/tests/test_smiles_standardization.py b/tests/test_smiles_standardization.py
@@ -0,0 +1,84 @@
+import pytest
+
+from rxn.chemutils.exceptions import InvalidSmiles
+from rxn.chemutils.smiles_standardization import (
+    standardize_molecules,
+    standardize_smiles,
+)
+
+
+def test_standardize_smiles() -> None:
+    """Check standardize_smiles for each flag combination and for invalid input."""
+    smiles = "C(O)C"
+    # case 1: canonicalization and sanitization
+    assert standardize_smiles(smiles) == "CCO"
+    # case 2: disabled canonicalization (input is returned as is)
+    assert standardize_smiles(smiles, canonicalize=False) == "C(O)C"
+    # case 3: disabled canonicalization with inchification (inherent canonicalization)
+    assert standardize_smiles(smiles, canonicalize=False, inchify=True) == "CCO"
+    # case 4: canonicalization differs from inchification (tautomer interconversion)
+    smiles = "CNC(=O)C"
+    assert standardize_smiles(smiles) == "CNC(C)=O"
+    assert standardize_smiles(smiles, inchify=True) == "CN=C(C)O"
+    # case 5: inchification with metal disconnection
+    smiles = "CCCC[Li]"
+    assert standardize_smiles(smiles, inchify=True) == "[CH2]CCC.[Li]"
+    # case 6: an invalid SMILES must raise InvalidSmiles
+    invalid_smiles = "C%5%%5"
+    with pytest.raises(InvalidSmiles):
+        standardize_smiles(invalid_smiles)
+
+
+def test_standardize_molecules() -> None:
+    """Check standardize_molecules with fragment bonds, token delimiters and ordering flags."""
+    # successful cases with different standardization flavours
+    # case 1: default fragment bond
+    molecules = "C(O)C.CCO.CC~C"
+    assert standardize_molecules(molecules) == "CCO.CCO.C~CC"
+    # case 2: custom fragment bond
+    molecules = "C(O)C.CCO.CC|C"
+    assert standardize_molecules(molecules, fragment_bond="|") == "CCO.CCO.C|CC"
+    # case 3: molecule token delimiter
+    molecules = "C(O)C.CCO.CC~C._C_"
+    assert (
+        standardize_molecules(molecules, molecule_token_delimiter="_")
+        == "C.CCO.CCO.C~CC"
+    )
+    # case 4: molecule token delimiter with disabled ordering
+    # (canonicalization still orders the fragments inside "CC~C")
+    molecules = "C(O)C.CCO.CC~C._C_"
+    assert (
+        standardize_molecules(
+            molecules, molecule_token_delimiter="_", ordered_precursors=False
+        )
+        == "CCO.CCO.C~CC.C"
+    )
+    # case 5: molecule token delimiter with disabled ordering and canonicalization
+    molecules = "C(O)C.CCO.CC~C._C_"
+    assert (
+        standardize_molecules(
+            molecules,
+            canonicalize=False,
+            molecule_token_delimiter="_",
+            ordered_precursors=False,
+        )
+        == "C(O)C.CCO.CC~C.C"
+    )
+    # case 6: molecule token delimiter with disabled ordering and canonicalization, but enabled inchification
+    molecules = "C(O)C.CCO.CC~C._C_"
+    assert (
+        standardize_molecules(
+            molecules,
+            canonicalize=False,
+            inchify=True,
+            molecule_token_delimiter="_",
+            ordered_precursors=False,
+        )
+        == "CCO.CCO.C~CC.C"
+    )
+    # expected failures due to mismatch between the molecules string and the standard
+    # case 7: unexpected fragment bond ("|" is not the default "~")
+    molecules = "C(O)C.CCO.CC|C"
+    with pytest.raises(InvalidSmiles):
+        standardize_molecules(molecules)
+    # case 8: unexpected molecule token delimiter ("_" is not declared)
+    molecules = "C(O)C.CCO.CC~C._C_"
+    with pytest.raises(InvalidSmiles):
+        standardize_molecules(molecules)
Original file line number	Diff line number	Diff line change
		@@ -1,6 +1,5 @@
		"""Simple utilities not involving RDKit."""


		import re


Expand Down