-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adding SMILES standardization utilities. [bump minor]
Signed-off-by: Matteo Manica <tte@zurich.ibm.com>
- Loading branch information
Matteo Manica
committed
Feb 9, 2024
1 parent
27e97cb
commit c3d9993
Showing
6 changed files
with
234 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
import logging | ||
from typing import Optional | ||
|
||
from .conversion import inchi_to_mol, mol_to_inchi, mol_to_smiles, smiles_to_mol | ||
from .exceptions import InvalidInchi, InvalidSmiles | ||
|
||
# Module-level logger named after this module.
logger = logging.getLogger(__name__) | ||
# Attach a no-op handler so that this module does not emit log output on its
# own; applications that want the messages configure their own handlers.
logger.addHandler(logging.NullHandler()) | ||
|
||
# Separator constant for reaction SMILES; it is not referenced by the
# functions visible in this part of the file.
RXN_SMILES_SEPARATOR = ">>" | ||
|
||
|
||
def standardize_smiles(
    smiles: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    find_radicals: bool = True,
    inchify: bool = False,
) -> str:
    """Bring a SMILES string into the requested standard form.

    The SMILES is always parsed first (which validates it). Depending on the
    flags it is then either returned as given, rewritten from the parsed
    molecule with ``isomericSmiles=True``, or round-tripped through InChI.
    Several molecules separated by "." can be given in one string.

    Args:
        smiles (str): SMILES representation of one or more molecules.
        canonicalize (bool): rewrite the SMILES from the parsed molecule.
            Ignored when ``inchify`` is True, since the InChI round-trip
            always regenerates the SMILES. Defaults to True.
        sanitize (bool): forwarded to ``smiles_to_mol``. Defaults to True.
        find_radicals (bool): forwarded to ``smiles_to_mol``. Defaults to True.
        inchify (bool): convert to InChI and back before writing the SMILES.
            Defaults to False.

    Returns:
        The standardized SMILES. If either direction of the InChI
        round-trip raises ``InvalidInchi``, the failure is logged and the
        SMILES written directly from the parsed molecule is returned
        instead. With ``canonicalize=False`` and ``inchify=False`` the
        input string is returned unchanged.

    Raises:
        InvalidSmiles: if the input cannot be parsed; the failure is logged
            and the exception is re-raised.
    """
    try:
        molecule = smiles_to_mol(smiles, sanitize=sanitize, find_radicals=find_radicals)
    except InvalidSmiles:
        logger.error(f"SMILES parsing failure: {smiles}.")
        raise

    if not inchify:
        # No InChI round-trip requested: either rewrite from the molecule or
        # hand the (validated) input back untouched.
        return mol_to_smiles(molecule, isomericSmiles=True) if canonicalize else smiles

    # Step 1: molecule -> InChI. On failure fall back to the direct SMILES.
    try:
        inchi_string = mol_to_inchi(molecule)
    except InvalidInchi:
        logger.error(
            f"Inchification failure for SMILES: {smiles}. Returning its canonical version."
        )
        return mol_to_smiles(molecule, isomericSmiles=True)

    # Step 2: InChI -> molecule. Same fallback if the way back fails.
    try:
        molecule_from_inchi = inchi_to_mol(inchi_string)
    except InvalidInchi:
        logger.error(
            f"De-inchification failure for InChi: {inchi_string}. Returning its canonical version."
        )
        return mol_to_smiles(molecule, isomericSmiles=True)

    # canonical is forced to True here: after the InChI round-trip the
    # original atom ordering cannot be guaranteed anyway.
    return mol_to_smiles(molecule_from_inchi, canonical=True)
|
||
|
||
def standardize_molecules(
    molecules: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    inchify: bool = False,
    fragment_bond: str = "~",
    ordered_precursors: bool = True,
    molecule_token_delimiter: Optional[str] = None,
    is_enzymatic: bool = False,
    enzyme_separator: str = "|",
) -> str:
    """Standardize a "."-separated set of molecules given as one string.

    Processing order: the optional enzyme part is split off, the optional
    token delimiter is removed, the molecules are standardized with
    ``standardize_smiles``, and the enzyme part is appended again.

    Args:
        molecules: molecules SMILES. Molecules can be separated via a ".".
            Fragments are supported with a custom `fragment_bond`.
        canonicalize: canonicalize SMILES. Defaults to True.
        sanitize: sanitize SMILES. Defaults to True.
        inchify: inchify the SMILES. Defaults to False.
        fragment_bond: fragment bond. Defaults to '~'.
        ordered_precursors: order precursors. When a fragment bond is
            present the standardized strings are sorted; otherwise the whole
            string is passed to ``standardize_smiles`` in a single call and
            the ordering is left to it. Defaults to True.
        molecule_token_delimiter: delimiter for big molecule tokens; every
            occurrence is removed before standardization. Defaults to None.
        is_enzymatic: the molecules are representing an enzymatic reaction.
            Defaults to False.
        enzyme_separator: separator for molecules and the enzyme. Only the
            first segment following the molecules is kept as the enzyme.
            Defaults to '|'.

    Returns:
        standardized molecules, followed by the separator and enzyme when
        one was found.

    Examples:
        Standardize multiple molecules:

        >>> standardize_molecules('CCO.CC')
        'CC.CCO'

        Standardize multiple molecules including fragment information:

        >>> standardize_molecules('CCO.CC~C')
        'CCO.C~CC'
    """

    def _standardize(candidate: str) -> str:
        # Single place where the shared standardization flags are forwarded.
        return standardize_smiles(
            candidate,
            canonicalize=canonicalize,
            sanitize=sanitize,
            inchify=inchify,
        )

    # Split off the optional enzyme part; it is re-attached verbatim at the end.
    enzyme_suffix = ""
    if is_enzymatic:
        head, *tail = molecules.split(enzyme_separator)
        molecules = head
        if tail:
            enzyme_suffix = f"{enzyme_separator}{tail[0]}"

    if molecule_token_delimiter is not None:
        molecules = molecules.replace(molecule_token_delimiter, "")

    if fragment_bond in molecules:
        per_molecule = []
        for molecule in molecules.split("."):
            # The fragment bond is not valid SMILES: swap it for "." while
            # standardizing and restore it afterwards.
            as_plain_smiles = molecule.replace(fragment_bond, ".")
            per_molecule.append(
                _standardize(as_plain_smiles).replace(".", fragment_bond)
            )
        if ordered_precursors:
            per_molecule.sort()
        return f"{'.'.join(per_molecule)}{enzyme_suffix}"

    if ordered_precursors:
        # One call on the full string; per the original author, RDKit
        # guarantees ordered precursors in this case.
        return f"{_standardize(molecules)}{enzyme_suffix}"

    # Keep the input order: standardize each molecule on its own.
    in_input_order = ".".join(_standardize(molecule) for molecule in molecules.split("."))
    return f"{in_input_order}{enzyme_suffix}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,5 @@ | ||
"""Simple utilities not involving RDKit.""" | ||
|
||
|
||
import re | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import pytest | ||
|
||
from rxn.chemutils.exceptions import InvalidSmiles | ||
from rxn.chemutils.smiles_standardization import ( | ||
standardize_molecules, | ||
standardize_smiles, | ||
) | ||
|
||
|
||
def test_standardize_smiles() -> None:
    """Check standardize_smiles across its flag combinations and on bad input."""
    non_canonical = "C(O)C"
    # case 1: canonicalization and sanitization
    assert standardize_smiles(non_canonical) == "CCO"
    # case 2: disabled canonicalization
    assert standardize_smiles(non_canonical, canonicalize=False) == "C(O)C"
    # case 3: disabled canonicalization with inchification (inherent canonicalization)
    assert (
        standardize_smiles(non_canonical, canonicalize=False, inchify=True) == "CCO"
    )

    # case 4: canonicalization different from inchification (tautomers interconversion)
    amide = "CNC(=O)C"
    assert standardize_smiles(amide) == "CNC(C)=O"
    assert standardize_smiles(amide, inchify=True) == "CN=C(C)O"

    # case 5: inchification with metal disconnection
    organolithium = "CCCC[Li]"
    assert standardize_smiles(organolithium, inchify=True) == "[CH2]CCC.[Li]"

    # case 6: testing an invalid SMILES and error handling
    with pytest.raises(InvalidSmiles):
        standardize_smiles("C%5%%5")
|
||
|
||
def test_standardize_molecules() -> None:
    """Check standardize_molecules with fragment bonds, token delimiters and bad input."""
    # --- successful cases with different standardization flavours ---
    # case 1: default fragment bond
    expected = "CCO.CCO.C~CC"
    assert standardize_molecules("C(O)C.CCO.CC~C") == expected

    # case 2: custom fragment bond
    with_pipe_bond = "C(O)C.CCO.CC|C"
    expected = "CCO.CCO.C|CC"
    assert standardize_molecules(with_pipe_bond, fragment_bond="|") == expected

    # Cases 3-6 share one input that carries "_" token delimiters.
    with_token_delimiters = "C(O)C.CCO.CC~C._C_"

    # case 3: molecule token delimiter
    observed = standardize_molecules(
        with_token_delimiters, molecule_token_delimiter="_"
    )
    assert observed == "C.CCO.CCO.C~CC"

    # case 4: molecule token delimiter with disabled ordering
    # (canonicalization orders the fragments)
    observed = standardize_molecules(
        with_token_delimiters,
        molecule_token_delimiter="_",
        ordered_precursors=False,
    )
    assert observed == "CCO.CCO.C~CC.C"

    # case 5: molecule token delimiter with disabled ordering and canonicalization
    observed = standardize_molecules(
        with_token_delimiters,
        canonicalize=False,
        molecule_token_delimiter="_",
        ordered_precursors=False,
    )
    assert observed == "C(O)C.CCO.CC~C.C"

    # case 6: molecule token delimiter with disabled ordering and
    # canonicalization, but enabled inchification
    observed = standardize_molecules(
        with_token_delimiters,
        canonicalize=False,
        inchify=True,
        molecule_token_delimiter="_",
        ordered_precursors=False,
    )
    assert observed == "CCO.CCO.C~CC.C"

    # --- expected failures: the string does not match the declared standard ---
    # case 7: unexpected fragment bond
    with pytest.raises(InvalidSmiles):
        standardize_molecules(with_pipe_bond)
    # case 8: unexpected molecule token delimiter
    with pytest.raises(InvalidSmiles):
        standardize_molecules(with_token_delimiters)