Skip to content

Commit

Permalink
Merge pull request #173 from Pakman450/mol2_read_main
Browse files Browse the repository at this point in the history
request to add a mol2 file reader
  • Loading branch information
hadim authored Mar 30, 2023
2 parents a923213 + 5f1b805 commit 5740109
Show file tree
Hide file tree
Showing 5 changed files with 342 additions and 0 deletions.
1 change: 1 addition & 0 deletions datamol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
from .io import to_sdf
from .io import to_smi
from .io import read_smi
from .io import read_mol2file
from .io import read_molblock
from .io import to_molblock
from .io import to_xlsx
Expand Down
47 changes: 47 additions & 0 deletions datamol/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,53 @@ def to_sdf(
writer.close()


def read_mol2file(
    urlpath: Union[str, os.PathLike, IO],
    sanitize: bool = True,
    cleanup_substructures: bool = True,
    remove_hs: bool = True,
    fail_if_invalid: bool = False,
) -> List[Mol]:
    """Read a (multi-molecule) Mol2 file into a list of molecules.

    The file is split into records at every `@<TRIPOS>MOLECULE` tag and each
    record is parsed independently, so one damaged record does not prevent
    the others from being read.

    Args:
        urlpath: Path to the Mol2 file. Path can be remote or local; it is
            opened through `fsspec.open` with inferred compression.
        sanitize: Whether to sanitize the molecules.
        cleanup_substructures: Whether to clean up substructure in the Mol2 Files.
        remove_hs: Whether to remove the existing hydrogens in the Mol2 files.
        fail_if_invalid: If set to true, the parser will raise an exception if
            a molecule is invalid instead of returning None for it.

    Returns:
        One entry per `@<TRIPOS>MOLECULE` record, in file order. An entry is
        None when the record could not be parsed and `fail_if_invalid` is False.

    Raises:
        ValueError: If a record cannot be parsed and `fail_if_invalid` is True.
    """

    with fsspec.open(urlpath, compression="infer") as f:
        raw_lines = f.readlines()

    # A Mol2 record has no explicit terminator: it simply runs until the next
    # MOLECULE tag (or the end of the file). Start a new record on every tag.
    records: List[List[str]] = []
    for raw_line in raw_lines:
        # Only skip real comment lines; a '#' inside a data line (for example
        # in a molecule name) must not make the whole line disappear.
        if raw_line.lstrip().startswith(b"#"):
            continue

        if b"@<TRIPOS>MOLECULE" in raw_line:
            records.append([])

        # Anything before the first MOLECULE tag is header material: ignore it.
        if records:
            records[-1].append(raw_line.decode("utf-8"))

    mols = []
    for record in records:
        # Lines still carry their own line endings, so a plain concatenation
        # rebuilds the block without altering its content (commas included).
        mol2block = "".join(record)
        mol = rdmolfiles.MolFromMol2Block(
            mol2block,
            sanitize=sanitize,
            removeHs=remove_hs,
            cleanupSubstructures=cleanup_substructures,
        )
        if mol is None and fail_if_invalid:
            raise ValueError(f"Invalid molecule: {mol2block}")
        mols.append(mol)

    return mols


def read_molblock(
molblock: str,
sanitize: bool = True,
Expand Down
23 changes: 23 additions & 0 deletions news/my-feature-or-branch.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
**Added:**

* A multi-molecule Mol2 file reader (`read_mol2file`) that converts each record into an RDKit molecule.

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
243 changes: 243 additions & 0 deletions tests/data/test.mol2
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
@<TRIPOS>MOLECULE
mol_first
11 11 1 0 0
SMALL
AMBER ff14SB

@<TRIPOS>ATOM
1 C1 -0.0167 1.3778 0.0096 C.ar 1 UNK 0.0267
2 C2 0.0021 -0.0041 0.0020 C.ar 1 UNK -0.0438
3 C3 1.2218 -0.6631 -0.0131 C.ar 1 UNK -0.0592
4 C4 2.3820 0.0960 -0.0201 C.ar 1 UNK -0.0438
5 C5 2.2849 1.4746 -0.0118 C.ar 1 UNK 0.0267
6 N6 1.1072 2.0677 0.0026 N.ar 1 UNK -0.2647
7 H7 -0.9627 1.8988 0.0169 H 1 UNK 0.0840
8 H8 -0.9217 -0.5635 0.0075 H 1 UNK 0.0639
9 H9 1.2671 -1.7422 -0.0190 H 1 UNK 0.0624
10 H10 3.3495 -0.3839 -0.0316 H 1 UNK 0.0639
11 H11 3.1838 2.0731 -0.0171 H 1 UNK 0.0840
@<TRIPOS>BOND
1 1 6 ar
2 1 2 ar
3 1 7 1
4 2 3 ar
5 2 8 1
6 3 4 ar
7 3 9 1
8 4 5 ar
9 4 10 1
10 5 6 ar
11 5 11 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_sec
9 9 1 0 0
SMALL
AMBER ff14SB


@<TRIPOS>ATOM
1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
@<TRIPOS>BOND
1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_third
9 9 1 0 0
SMALL
AMBER ff14SB


@<TRIPOS>ATOM
1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838
2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106
3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120
5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422
6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480
7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
@<TRIPOS>BOND
1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_sec_f
9 9 1 0 0
SMALL
AMBER ff14SB


@<TRIPOS>ATOM
1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854

1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_sec_f1
9 9 1 0 0
SMALL
AMBER ff14SB



1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
@<TRIPOS>BOND
1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_sec_f3
9 9 1 0 0
SMALL
AMBER ff14SB

@<TRIPOS>ATOM
@<TRIPOS>BOND
1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_sec_f4
9 9 1 0 0
SMALL
AMBER ff14SB

@<TRIPOS>ATOM
@<TRIPOS>BOND
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT



@<TRIPOS>MOLECULE



@<TRIPOS>ATOM
1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854

1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

@<TRIPOS>MOLECULE
mol_sec
9 9 1 0 0
SMALL
AMBER ff14SB


@<TRIPOS>ATOM
1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838
2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106
3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120
5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422
6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480
7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
@<TRIPOS>BOND
1 1 6 2
2 1 2 1
3 1 7 1
4 2 3 1
5 2 4 1
6 4 5 2
7 4 8 1
8 5 6 1
9 5 9 1
@<TRIPOS>SUBSTRUCTURE
1 UNK 1 RESIDUE 4 A UNK 0 ROOT

28 changes: 28 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,34 @@ def test_sdf_props_and_conformer_preserved(tmp_path):
np.testing.assert_almost_equal(conf.GetPositions(), pos, decimal=4)


def test_read_mol2(datadir):
    """Check `dm.read_mol2file` on a fixture mixing valid and damaged records."""
    mol2_path = datadir / "test.mol2"

    # Default mode: every record yields an entry, damaged ones come back as None.
    parsed = dm.read_mol2file(mol2_path)

    for valid_idx in range(3):
        assert isinstance(parsed[valid_idx], Chem.rdchem.Mol)

    # Records 3 to 7 of the fixture have damaged Mol2 formatting.
    for damaged_idx in range(3, 8):
        assert parsed[damaged_idx] is None

    pyridine = dm.to_mol("c1ccncc1")
    imidazole = dm.to_mol("c1c[nH]cn1")

    expected_mols = (pyridine, imidazole, imidazole)
    for found, expected in zip(parsed[:3], expected_mols):
        assert dm.same_mol(found, expected)

    # Strict mode: the damaged records must trigger an exception instead.
    with pytest.raises(ValueError):
        dm.read_mol2file(mol2_path, fail_if_invalid=True)


def test_read_save_molblock():
mol = dm.to_mol("Cn1c(=O)c2c(ncn2C)n(C)c1=O")

Expand Down

0 comments on commit 5740109

Please sign in to comment.