Skip to content

Commit

Permalink
feat: adding SMILES standardization utilities. [bump minor]
Browse files Browse the repository at this point in the history
Signed-off-by: Matteo Manica <tte@zurich.ibm.com>
  • Loading branch information
Matteo Manica committed Feb 9, 2024
1 parent 27e97cb commit c3d9993
Show file tree
Hide file tree
Showing 6 changed files with 234 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/rxn/chemutils/multicomponent_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Utilities related to "multi-component SMILES", i.e. strings containing multiple compounds
in SMILES notation, which may include fragment bonds.
"""

from functools import partial
from typing import Callable, Iterable, List, Optional

Expand Down
1 change: 1 addition & 0 deletions src/rxn/chemutils/reaction_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
In a separate file than miscellaneous.py or conversion.py in order to avoid
cyclic dependencies.
"""

from enum import auto

from rxn.utilities.types import RxnEnum
Expand Down
146 changes: 146 additions & 0 deletions src/rxn/chemutils/smiles_standardization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import logging
from typing import Optional

from .conversion import inchi_to_mol, mol_to_inchi, mol_to_smiles, smiles_to_mol
from .exceptions import InvalidInchi, InvalidSmiles

# Module-level logger; a NullHandler is attached so that log calls made here
# do not require the importing application to have configured logging.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# NOTE(review): presumably the token between precursors and products in a
# reaction SMILES; it is not referenced by the functions visible in this
# module -- confirm against callers.
RXN_SMILES_SEPARATOR = ">>"


def standardize_smiles(
    smiles: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    find_radicals: bool = True,
    inchify: bool = False,
) -> str:
    """Bring a SMILES string to a desired standard form.

    The string is always parsed first, so an invalid SMILES raises even when
    no rewriting is requested. Afterwards it is either round-tripped through
    InChI (``inchify``), rewritten from the parsed molecule with
    ``isomericSmiles=True`` (``canonicalize``), or returned untouched.
    The input may contain multiple molecules separated by ".".

    Args:
        smiles: SMILES representation of a molecule.
        canonicalize: rewrite the SMILES from the parsed molecule.
            Defaults to True.
        sanitize: forwarded to ``smiles_to_mol``. Defaults to True.
        find_radicals: forwarded to ``smiles_to_mol``. Defaults to True.
        inchify: convert to InChI and back before writing the SMILES; this
            takes precedence over ``canonicalize`` and canonicalizes as a
            side effect. Defaults to False.

    Returns:
        a SMILES following the desired standard.

    Raises:
        InvalidSmiles: if ``smiles`` cannot be parsed (logged, then re-raised).
    """
    try:
        mol = smiles_to_mol(smiles, sanitize=sanitize, find_radicals=find_radicals)
    except InvalidSmiles:
        logger.error(f"SMILES parsing failure: {smiles}.")
        raise

    # Without inchification the only choice is rewrite vs. pass-through.
    if not inchify:
        if canonicalize:
            return mol_to_smiles(mol, isomericSmiles=True)
        return smiles

    # SMILES -> InChI; on failure fall back to the plain rewritten SMILES.
    try:
        inchi = mol_to_inchi(mol)
    except InvalidInchi:
        logger.error(
            f"Inchification failure for SMILES: {smiles}. Returning its canonical version."
        )
        return mol_to_smiles(mol, isomericSmiles=True)

    # InChI -> molecule; same fallback if the InChI cannot be read back.
    try:
        mol_from_inchi = inchi_to_mol(inchi)
    except InvalidInchi:
        logger.error(
            f"De-inchification failure for InChi: {inchi}. Returning its canonical version."
        )
        return mol_to_smiles(mol, isomericSmiles=True)

    # canonical set to True because we can't guarantee no canonicalization
    # once the molecule went through InChI.
    return mol_to_smiles(mol_from_inchi, canonical=True)


def standardize_molecules(
    molecules: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    inchify: bool = False,
    fragment_bond: str = "~",
    ordered_precursors: bool = True,
    molecule_token_delimiter: Optional[str] = None,
    is_enzymatic: bool = False,
    enzyme_separator: str = "|",
) -> str:
    """Ensure that a set of molecules represented by a string follows a desired standard.

    Args:
        molecules: molecules SMILES. Molecules can be separated via a ".".
            Fragments are supported with a custom `fragment_bond`.
        canonicalize: canonicalize SMILES. Defaults to True.
        sanitize: sanitize SMILES. Defaults to True.
        inchify: inchify the SMILES. Defaults to False.
        fragment_bond: fragment bond. Defaults to '~'. An empty string means
            that no fragment bond is in use.
        ordered_precursors: order precursors. Defaults to True. The ordering
            is applied regardless of `canonicalize` / `inchify`.
        molecule_token_delimiter: delimiter for big molecule tokens; every
            occurrence is stripped before parsing. Defaults to None.
        is_enzymatic: the molecules are representing an enzymatic reaction. Defaults to False.
        enzyme_separator: separator for molecules and the enzyme. Defaults to '|'.

    Returns:
        standardized molecules, followed by the enzyme part (if any) exactly
        as it was given.

    Raises:
        InvalidSmiles: propagated from `standardize_smiles` when a molecule
            cannot be parsed.

    Examples:
        Standardize multiple molecules:
        >>> standardize_molecules('CCO.CC')
        'CC.CCO'
        Standardize multiple molecules including fragment information:
        >>> standardize_molecules('CCO.CC~C')
        'CCO.C~CC'
    """
    enzyme = ""
    if is_enzymatic:
        # Only the segment right after the first separator is kept as the
        # enzyme; it is re-attached verbatim at the end.
        parts = molecules.split(enzyme_separator)
        molecules = parts[0]
        if len(parts) > 1:
            enzyme = f"{enzyme_separator}{parts[1]}"

    if molecule_token_delimiter is not None:
        molecules = molecules.replace(molecule_token_delimiter, "")

    def _standardize(smiles: str) -> str:
        # Single place where the standardization options are forwarded.
        return standardize_smiles(
            smiles,
            canonicalize=canonicalize,
            sanitize=sanitize,
            inchify=inchify,
        )

    # An empty fragment bond is "contained" in every string and would turn
    # `replace("", ".")` into SMILES corruption, hence the explicit guard.
    has_fragments = bool(fragment_bond) and fragment_bond in molecules
    # The SMILES is only rewritten (and therefore ordered by the writer) when
    # one of these is set; otherwise `standardize_smiles` echoes its input.
    rewritten_by_toolkit = canonicalize or inchify

    if ordered_precursors and rewritten_by_toolkit and not has_fragments:
        # Whole-string shortcut: the rewritten multi-molecule SMILES already
        # comes back with ordered components.
        standardized_molecules = _standardize(molecules)
    else:
        if has_fragments:
            # Swap the fragment bond for "." to obtain valid SMILES, then
            # restore it on the standardized result.
            standardized_molecules_list = [
                _standardize(molecule.replace(fragment_bond, ".")).replace(
                    ".", fragment_bond
                )
                for molecule in molecules.split(".")
            ]
        else:
            standardized_molecules_list = [
                _standardize(molecule) for molecule in molecules.split(".")
            ]
        if ordered_precursors:
            standardized_molecules_list.sort()
        standardized_molecules = ".".join(standardized_molecules_list)

    # add optional enzyme information
    return f"{standardized_molecules}{enzyme}"
4 changes: 2 additions & 2 deletions src/rxn/chemutils/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def __init__(self, title: str, detail: str):
Initialize TokenizationError.
Args:
title (str): title of the error.
detail (str): description of the error.
title: title of the error.
detail: description of the error.
"""
self.type = "TokenizationError"
self.title = title
Expand Down
1 change: 0 additions & 1 deletion src/rxn/chemutils/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Simple utilities not involving RDKit."""


import re


Expand Down
84 changes: 84 additions & 0 deletions tests/test_smiles_standardization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pytest

from rxn.chemutils.exceptions import InvalidSmiles
from rxn.chemutils.smiles_standardization import (
standardize_molecules,
standardize_smiles,
)


def test_standardize_smiles() -> None:
    """Exercise ``standardize_smiles`` over its flag combinations."""
    ethanol = "C(O)C"
    amide = "CNC(=O)C"
    # (input SMILES, keyword options, expected output)
    cases = [
        # case 1: canonicalization and sanitization
        (ethanol, {}, "CCO"),
        # case 2: disabled canonicalization
        (ethanol, {"canonicalize": False}, "C(O)C"),
        # case 3: disabled canonicalization with inchification (inherent canonicalization)
        (ethanol, {"canonicalize": False, "inchify": True}, "CCO"),
        # case 4: canonicalization different from inchification (tautomers interconversion)
        (amide, {}, "CNC(C)=O"),
        (amide, {"inchify": True}, "CN=C(C)O"),
        # case 5: inchification with metal disconnection
        ("CCCC[Li]", {"inchify": True}, "[CH2]CCC.[Li]"),
    ]
    for smiles, options, expected in cases:
        assert standardize_smiles(smiles, **options) == expected

    # case 6: testing an invalid SMILES and error handling
    with pytest.raises(InvalidSmiles):
        standardize_smiles("C%5%%5")


def test_standardize_molecules() -> None:
    """Exercise ``standardize_molecules`` with and without matching delimiters."""
    # Input shared by every case that involves the "_" token delimiter.
    tokenized = "C(O)C.CCO.CC~C._C_"

    # successful cases with different standardization flavours
    # case 1: default fragment bond
    default_bond = standardize_molecules("C(O)C.CCO.CC~C")
    assert default_bond == "CCO.CCO.C~CC"

    # case 2: custom fragment bond
    custom_bond = standardize_molecules("C(O)C.CCO.CC|C", fragment_bond="|")
    assert custom_bond == "CCO.CCO.C|CC"

    # case 3: molecule token delimiter
    ordered = standardize_molecules(tokenized, molecule_token_delimiter="_")
    assert ordered == "C.CCO.CCO.C~CC"

    # case 4: molecule token delimiter with disabled ordering (canonicalization order the fragments)
    unordered = standardize_molecules(
        tokenized, molecule_token_delimiter="_", ordered_precursors=False
    )
    assert unordered == "CCO.CCO.C~CC.C"

    # case 5: molecule token delimiter with disabled ordering and canonicalization
    untouched = standardize_molecules(
        tokenized,
        canonicalize=False,
        molecule_token_delimiter="_",
        ordered_precursors=False,
    )
    assert untouched == "C(O)C.CCO.CC~C.C"

    # case 6: molecule token delimiter with disabled ordering and canonicalization, but enabled inchification
    inchified = standardize_molecules(
        tokenized,
        canonicalize=False,
        inchify=True,
        molecule_token_delimiter="_",
        ordered_precursors=False,
    )
    assert inchified == "CCO.CCO.C~CC.C"

    # expected failures due to mismatch between the molecules string and the standard
    # case 7: unexpected fragment bond
    # case 8: unexpected molecule token delimiter
    for mismatched in ("C(O)C.CCO.CC|C", tokenized):
        with pytest.raises(InvalidSmiles):
            standardize_molecules(mismatched)

0 comments on commit c3d9993

Please sign in to comment.