-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmol_prep.py
130 lines (100 loc) · 4.25 KB
/
mol_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
## Work-in-progress
## Functions:
# 1) preprocess() - preprocess compounds into RDKit molecules
# 2) rdkit_2d_descriptors() - generate RDKit 2D descriptors
# 3) feat_imp_plot() - convert feature importance array into a dataframe and bar plot
from rdkit.Chem import Descriptors
import datamol as dm
import pandas as pd
import seaborn as sns
#class preprocess_mol:
# def __init__(self, row) -> None:
# self.row = pd.DataFrame
# ?use self.smiles = smiles
# then generate mols from smiles might be easier
# Disable RDKit log messages; this runs once, when the module is imported
dm.disable_rdkit_log()
# The following function code was adapted, with thanks, from datamol.io
def preprocess(row, smiles_column="canonical_smiles"):
    """
    Function to preprocess, fix, sanitise and standardise the compound in one row

    Written for row-wise use on a dataframe, e.g. df.apply(preprocess, axis=1).
    The row passed in is modified in place and also returned.

    :param row: a dataframe row (or any mapping supporting item get/set)
    that holds a SMILES string under smiles_column
    :param smiles_column: name of the column holding the SMILES string,
    defaults to "canonical_smiles" (the column name used for ChEMBL-derived data)
    :return: the same row with the preprocessed RDKit molecule ("rdkit_mol"),
    standardised SMILES ("standard_smiles"), SELFIES ("selfies"),
    InChI ("inchi") and InChI key ("inchikey") added
    """
    # Convert the SMILES string of the compound into a RDKit molecule
    mol = dm.to_mol(row[smiles_column], ordered=True)

    # Fix common errors in the molecule
    mol = dm.fix_mol(mol)

    # Sanitise the molecule
    mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)

    # Standardise the molecule
    mol = dm.standardize_mol(
        mol,
        # Switch on to disconnect metal ions
        disconnect_metals=True,
        normalize=True,
        reionize=True,
        # Switch on "uncharge" to neutralise charges
        uncharge=True,
        # Taking care of stereochemistries of compounds
        # Note: this uses the older approach of "AssignStereochemistry()" from RDKit
        # https://github.com/datamol-io/datamol/blob/main/datamol/mol.py#L488
        stereo=True,
    )

    # Add the different molecular representations to the row
    row["rdkit_mol"] = dm.to_mol(mol)
    row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
    row["selfies"] = dm.to_selfies(mol)
    row["inchi"] = dm.to_inchi(mol)
    row["inchikey"] = dm.to_inchikey(mol)

    return row
def rdkit_2d_descriptors(df):
    """
    Function to compute RDKit 2D descriptors for every molecule in a dataframe

    :param df: an input dataframe holding RDKit molecules in its "rdkit_mol" column
    :return: a new dataframe of RDKit 2D descriptors, one entry per molecule in
    input order (built from a plain list, so it does not carry over df's index)
    """
    # Walk the "rdkit_mol" column directly, calculate the 2D descriptors of
    # each molecule, then gather the per-molecule results into a dataframe
    return pd.DataFrame(
        [Descriptors.CalcMolDescriptors(molecule) for molecule in df["rdkit_mol"]]
    )
def feat_imp_plot(feat_imp_array, X_df):
    """
    Function to convert feature importance array into a dataframe,
    which is then used to plot a bar graph showing feature importance ranking
    in the random forest (RF) model

    :param feat_imp_array: array obtained from feature_importances_ attribute
    or permutation_importance function in scikit-learn
    (assumed to be one-dimensional and ordered like X_df.columns, since its
    values are read from column 0 and paired with the feature names by position)
    :param X_df: X variable dataframe whose column names are used as
    the feature names in the plot
    :return: the object returned by seaborn's barplot (a matplotlib Axes),
    showing the feature importances ranking in the RF model, so that callers
    can customise or save the figure
    """
    # Convert the feat_imp array into dataframe
    feat_imp_df = pd.DataFrame(feat_imp_array)

    # Obtain feature names via column names of dataframe,
    # rename the index as "features" and convert it to a dataframe
    feature_name_df = X_df.columns.rename("features").to_frame(index=False)

    # Concatenate feat_imp_df & feature_name_df side by side
    feature_df = pd.concat([feat_imp_df, feature_name_df], axis=1)

    # Rename the column for feature importances
    feature_df = feature_df.rename(columns={0: "feature_importances"})

    # Sort values of feature importances in descending order
    feature_df = feature_df.sort_values("feature_importances", ascending=False)

    # Seaborn bar plot; the dataframe is passed by keyword so the call does not
    # depend on the position of "data" in the barplot signature
    return sns.barplot(
        data=feature_df,
        x="feature_importances",
        y="features",
    )