-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathext_chem.py
173 lines (147 loc) · 6.4 KB
/
ext_chem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import random
from rdkit import Chem
from rdkit.Chem import Draw
from pubchempy import get_compounds, PubChemPyError
import io
import base64
from PIL import Image
# for debug
from colorama import init, Fore, Style
# Initialise colorama once so the debug colour helpers below can be used.
init()


def red(txt: str) -> str:
    """Return ``txt`` prefixed with ``Fore.RED`` and followed by a style reset."""
    return Fore.RED + txt + Style.RESET_ALL


def blue(txt: str) -> str:
    """Return ``txt`` prefixed with ``Fore.BLUE`` and followed by a style reset."""
    return Fore.BLUE + txt + Style.RESET_ALL


def green(txt: str) -> str:
    """Return ``txt`` prefixed with ``Fore.GREEN`` and followed by a style reset."""
    return Fore.GREEN + txt + Style.RESET_ALL
def general_append(smiles: list[str], longest_carbon_chain: int) -> None:
    """Randomly add double bonds and substituents to a SMILES fragment list.

    ``smiles`` is a list of SMILES fragments and is modified in place; nothing
    is returned.  Two steps run one after the other:

    1. Alkene step - only when ``longest_carbon_chain`` is at least 2, and
       then with probability 1/3.  Between 1 and ``longest_carbon_chain - 1``
       attempts are made; each attempt picks two neighbouring fragments and
       fuses them into a single ``"<left>=<right>"`` fragment unless one of
       them is already ``"C=C"``.
    2. Substituent step - a count is drawn from 0..4 and that many
       parenthesised substituents are inserted.  NOTE: the weights for 0 and
       4 are both zero, so the count is effectively always 1, 2 or 3.
    """
    include_alkene = longest_carbon_chain >= 2 and random.randint(1, 3) == 1
    if include_alkene:
        attempts = random.randint(1, longest_carbon_chain - 1)
        for _ in range(attempts):
            idx = random.randint(0, len(smiles) - 2)
            left, right = smiles[idx], smiles[idx + 1]
            # A double bond next to an existing one would form a cumulene;
            # skipping (instead of re-drawing) also makes extra alkenes rarer.
            if "C=C" in (left, right):
                continue
            # Replace the two selected fragments by one double-bonded fragment.
            smiles[idx:idx + 2] = [f"{left}={right}"]

    # Triangular weights over 0..4, i.e. min(k, 4 - k) for each count k.
    possible_counts = [0, 1, 2, 3, 4]
    count_weights = [0, 1, 2, 1, 0]
    branch_total = random.choices(possible_counts, weights=count_weights)[0]

    branch_pool = ["Br", "Cl", "F", "C", "CC"]
    # Index 0 is only allowed as an insertion point for a one-carbon chain.
    lowest_slot = 1 if longest_carbon_chain - 1 >= 1 else 0
    highest_slot = longest_carbon_chain - 1
    for _ in range(branch_total):
        branch = random.choice(branch_pool)
        slot = random.randint(lowest_slot, highest_slot)
        smiles.insert(slot, f"({branch})")
def random_smiles_easy() -> str:
    """Generate a random "easy" SMILES string that passes ``valid``.

    A chain of 1 to 8 carbons is drawn (weights capped at 3, so middle
    lengths are equally likely), optionally closed into a ring, decorated by
    ``general_append`` and finally given at most one oxygen-containing group.
    If the assembled string is rejected by ``valid`` the function calls itself
    again and returns that result instead.
    """
    chain_lengths = list(range(1, 9))
    length_weights = [min(n, 9 - n, 3) for n in chain_lengths]
    longest_carbon_chain = random.choices(chain_lengths, weights=length_weights)[0]

    fragments = ["C"] * longest_carbon_chain

    # Rings need at least three carbons; one in four such chains is cyclic.
    cyclic = longest_carbon_chain >= 3 and random.randint(1, 4) == 1
    if cyclic:
        # Ring-closure digit after the first atom and again at the very end.
        fragments[1:1] = ["1"]
        fragments.append("1")

    general_append(fragments, longest_carbon_chain)

    # 1 -> hydroxyl, 2 -> carboxyl (acyclic only), 3 and 4 -> no extra group.
    group = random.randint(1, 4)
    if group == 1:
        fragments.insert(random.randint(0, len(fragments) - 1), "(O)")
    elif group == 2 and not cyclic:
        at_front = random.randint(1, 2) == 1
        if at_front:
            fragments.insert(0, "OC(=O)")
        else:
            fragments.append("C(=O)O")

    candidate = "".join(fragments)
    if not valid(candidate):
        # Rejected candidate: start over from scratch.
        return random_smiles_easy()
    return candidate
def random_smiles_hard() -> str:
    """Generate a random "hard" SMILES string that passes ``valid``.

    A chain of 1 to 8 carbons is drawn with weights that favour longer
    chains, up to two double bonds may be introduced, and then exactly one
    family of functional group is applied: aldehyde, amide, amine, or (only
    when more than two fragments remain) ketone.  If the assembled string is
    rejected by ``valid`` the function calls itself again and returns that
    result instead.
    """
    chain_lengths = list(range(1, 9))
    length_weights = [min(n, 11 - n) for n in chain_lengths]
    longest_carbon_chain = random.choices(chain_lengths, weights=length_weights)[0]
    fragments = ["C"] * longest_carbon_chain

    # Alkene: needs two carbons, included two times out of three.
    if longest_carbon_chain >= 2 and random.randint(1, 3) in (1, 2):
        # At most two attempts to create a double bond.
        attempts = random.randint(1, min(2, longest_carbon_chain - 1))
        for _ in range(attempts):
            idx = random.randint(0, len(fragments) - 2)
            left, right = fragments[idx], fragments[idx + 1]
            # Consecutive double bonds (cumulene) are not wanted.
            if "C=C" in (left, right):
                continue
            fragments[idx:idx + 2] = [f"{left}={right}"]

    # Ketone (4) is only offered when there is an inner position to carry it.
    ketone_allowed = longest_carbon_chain > 2 and len(fragments) > 2
    group = random.randint(1, 4 if ketone_allowed else 3)

    if group == 1:  # aldehyde
        if fragments[0] == "C=C":
            # O=C=C is invalid, so split the leading double bond again.
            fragments[0:1] = ["C", "C"]
        fragments.insert(0, "O=")
        # Half of the time a second aldehyde is put on the other end.
        if random.randint(1, 2) == 1:
            if fragments[-1] == "C=C":
                fragments[-1:] = ["C", "C"]
            fragments.append("=O")
    elif group == 2:  # amide
        # Drop the last fragment, then prepend the amide head.
        del fragments[-1]
        fragments.insert(0, "O=C(N)")
        # Half of the time a second amide is put on the other end.
        if random.randint(1, 2) == 1:
            fragments.append("(=O)N")
    else:
        # Fewer functional groups are easier to draw, hence the 3:2:1 weights.
        repeats = random.choices([1, 2, 3], weights=[3, 2, 1])[0]
        for _ in range(repeats):
            if group == 3:  # amine
                fragments.insert(random.randint(1, len(fragments)), "(N)")
            else:  # group == 4, ketone
                # Bounded search for a slot that is not next to another
                # ketone; after 20 misses the last drawn slot is used anyway.
                for _attempt in range(20):
                    slot = random.randint(2, len(fragments) - 1)
                    if "(=O)" not in (fragments[slot - 1], fragments[slot]):
                        break
                fragments.insert(slot, "(=O)")

    candidate = "".join(fragments)
    if not valid(candidate):
        # Rejected candidate: start over from scratch.
        return random_smiles_hard()
    return candidate
def iupac_name(smiles: str) -> "str | None":
    """Look up the IUPAC name for ``smiles`` through pubchempy.

    Args:
        smiles: SMILES string passed to ``get_compounds`` with
            ``namespace="smiles"``.

    Returns:
        The ``iupac_name`` of the first returned compound with every ``(``
        and ``)`` removed, or ``None`` when the lookup raises
        ``PubChemPyError``, returns no compounds at all, or the first
        compound has no IUPAC name.
    """
    try:
        compounds = get_compounds(smiles, namespace="smiles")
        # An empty result must not be indexed: compounds[0] would raise an
        # IndexError that the except clause below does not cover.
        name = compounds[0].iupac_name if compounds else None
    except PubChemPyError:
        return None
    if name is None:
        return None
    # Strip brackets to prevent something like dibromo(chloro)methane.
    return name.replace("(", "").replace(")", "")
def valid(smiles: str) -> bool:
    """Return True when both RDKit and the PubChem lookup accept ``smiles``.

    The string is accepted only if ``Chem.MolFromSmiles`` yields a molecule
    and ``iupac_name`` yields a name.
    """
    # Run the local RDKit parse first: ``and`` short-circuits, so a string
    # RDKit cannot parse never triggers the PubChem lookup in iupac_name.
    return Chem.MolFromSmiles(smiles) is not None and iupac_name(smiles) is not None
def generate_image(smiles: str) -> Image.Image:
    """Parse ``smiles`` with RDKit and return the drawing from ``Draw.MolToImage``."""
    molecule = Chem.MolFromSmiles(smiles)
    return Draw.MolToImage(molecule)
def generate_base64_image(smiles: str) -> str:
    """Draw ``smiles`` with ``generate_image`` and return it base64-encoded."""
    picture = generate_image(smiles)
    return image_to_base64(picture)
def image_to_base64(image: Image.Image) -> str:
    """Save ``image`` as PNG into memory and return the bytes as base64 text.

    The result is the UTF-8 decoded output of ``base64.b64encode``.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
# DEBUG
if __name__ == "__main__":
    # Smoke test: print ten hard-mode SMILES strings with their looked-up names.
    for smiles in (random_smiles_hard() for _ in range(10)):
        print(smiles, iupac_name(smiles))