-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathreaction_combiner.py
196 lines (162 loc) · 7.35 KB
/
reaction_combiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
from itertools import chain, repeat, zip_longest
from typing import Iterable, Iterator, Sequence, Tuple
from rxn.utilities.misc import get_multipliers
from .miscellaneous import merge_reactions
from .reaction_equation import ReactionEquation, canonicalize_compounds, sort_compounds
from .reaction_smiles import (
ReactionFormat,
parse_any_reaction_smiles,
to_reaction_smiles,
)
from .tokenization import detokenize_smiles
class ReactionCombiner:
    """
    Pair up two collections of reaction fragments and emit full reaction SMILES.

    The fragments can either be A) precursors on one side and products on the
    other, or B) two partial reactions that are merged into one.

    The two collections do not need to have the same size: when one is a
    multiple of the other (for instance when several predictions were made
    for every line of the other file), the shorter one is repeated item by
    item so that both line up.
    """

    def __init__(
        self,
        standardize: bool = False,
        reaction_format: ReactionFormat = ReactionFormat.STANDARD_WITH_TILDE,
        fallback_reaction: str = ">>",
    ):
        """
        Args:
            standardize: if True, the compounds of each reaction are
                canonicalized and sorted before the SMILES is written.
            reaction_format: format in which the reaction SMILES are written.
            fallback_reaction: string emitted instead of the reaction SMILES
                whenever building a reaction raises an exception.
        """
        self.standardize = standardize
        self.reaction_format = reaction_format
        self.fallback_reaction = fallback_reaction

    def combine(
        self, fragments_1: Sequence[str], fragments_2: Sequence[str]
    ) -> Iterator[str]:
        """
        Alias for ``combine_sequences``; see its docstring for details.
        """
        yield from self.combine_sequences(fragments_1, fragments_2)

    def combine_sequences(
        self, fragments_1: Sequence[str], fragments_2: Sequence[str]
    ) -> Iterator[str]:
        """
        Combine two sequences of fragments, inferring the multipliers from
        their lengths.

        Args:
            fragments_1: precursors strings (such as "CC.O.[Na+]~[Cl-]"), or
                partial reactions.
            fragments_2: product(s) strings, or partial reactions.

        Returns:
            Iterator over the resulting reaction SMILES.
        """
        multiplier_1, multiplier_2 = self._get_multipliers(fragments_1, fragments_2)

        yield from self.combine_iterators(
            fragments_1=fragments_1,
            fragments_2=fragments_2,
            fragments_1_multiplier=multiplier_1,
            fragments_2_multiplier=multiplier_2,
        )

    def combine_iterators(
        self,
        fragments_1: Iterable[str],
        fragments_2: Iterable[str],
        fragments_1_multiplier: int = 1,
        fragments_2_multiplier: int = 1,
    ) -> Iterator[str]:
        """
        Combine two iterables of fragments, given explicit multipliers.

        This is a generator: the checks below are executed when the result
        is iterated, not when the method is called.

        Args:
            fragments_1: precursors strings (such as "CC.O.[Na+]~[Cl-]"), or
                partial reactions.
            fragments_2: product(s) strings, or partial reactions.
            fragments_1_multiplier: how many times each item of fragments_1
                is repeated.
            fragments_2_multiplier: how many times each item of fragments_2
                is repeated.

        Raises:
            RuntimeError: if, after repetition, one stream ends before the
                other (a ``None`` item is treated the same way).
            ValueError: if neither multiplier is equal to 1.

        Returns:
            Iterator over the resulting reaction SMILES.
        """
        self._validate_multipliers(fragments_1_multiplier, fragments_2_multiplier)

        repeated_1 = self._repeat_each(fragments_1, fragments_1_multiplier)
        repeated_2 = self._repeat_each(fragments_2, fragments_2_multiplier)

        # zip_longest pads the shorter stream with None, which is how a
        # length mismatch is detected.
        for fragment_1, fragment_2 in zip_longest(repeated_1, repeated_2):
            both_present = fragment_1 is not None and fragment_2 is not None
            if not both_present:
                raise RuntimeError("Mismatch in expected iterator length")

            yield self._to_reaction_smiles(fragment_1, fragment_2)

    @staticmethod
    def _repeat_each(fragments: Iterable[str], times: int) -> Iterator[str]:
        """Lazily repeat every item ``times`` times in a row (a, a, b, b, ...)."""
        # Item-wise repetition, see https://stackoverflow.com/a/45799320
        return chain.from_iterable(repeat(fragment, times) for fragment in fragments)

    def _to_reaction_smiles(self, fragment_1: str, fragment_2: str) -> str:
        """
        Build the reaction SMILES for one pair of fragments; any exception
        raised on the way results in the fallback reaction being returned.
        """
        try:
            reaction_smiles = self._try_to_reaction_smiles(fragment_1, fragment_2)
        except Exception:
            return self.fallback_reaction
        return reaction_smiles

    def _try_to_reaction_smiles(self, fragment_1: str, fragment_2: str) -> str:
        """
        Build the reaction SMILES for one pair of fragments, letting any
        exception propagate to the caller.
        """
        equation = self._to_raw_reaction(fragment_1, fragment_2)

        # Optional standardization: canonicalize first, then sort.
        if self.standardize:
            canonical_equation = canonicalize_compounds(equation)
            equation = sort_compounds(canonical_equation)

        return to_reaction_smiles(equation, reaction_format=self.reaction_format)

    def _to_raw_reaction(self, fragment_1: str, fragment_2: str) -> ReactionEquation:
        """
        Turn the two (possibly tokenized) strings into a ReactionEquation.

        Raises:
            ValueError: if exactly one of the two fragments contains ">",
                i.e. one looks like a reaction and the other does not.
        """
        left = detokenize_smiles(fragment_1)
        right = detokenize_smiles(fragment_2)

        left_is_reaction = ">" in left
        right_is_reaction = ">" in right

        # Mixed input (one reaction, one plain set of compounds) is ambiguous.
        if left_is_reaction != right_is_reaction:
            raise ValueError(f'Cannot determine how to combine "{left}" and "{right}"')

        # Case A: both are given in the reaction format -> merge them
        if left_is_reaction:
            partial_reaction_1 = parse_any_reaction_smiles(left)
            partial_reaction_2 = parse_any_reaction_smiles(right)
            return merge_reactions(partial_reaction_1, partial_reaction_2)

        # Case B: the first is the precursor(s), the second the product(s)
        return parse_any_reaction_smiles(left + ">>" + right)

    def _get_multipliers(
        self, fragments_1: Sequence[str], fragments_2: Sequence[str]
    ) -> Tuple[int, int]:
        """
        Derive the multipliers from the lengths of the two sequences, by
        delegating to ``get_multipliers``.

        Raises:
            ValueError: when one is not exactly a multiple of the other.

        Returns:
            Tuple: fragments_1 multiplier, fragments_2 multiplier
        """
        size_1 = len(fragments_1)
        size_2 = len(fragments_2)

        multiplier_1, multiplier_2 = get_multipliers(size_1, size_2)
        return multiplier_1, multiplier_2

    def _validate_multipliers(self, multiplier_1: int, multiplier_2: int) -> None:
        """
        Check that the multipliers are usable by the reaction combiner, which
        requires at least one of them to be equal to 1.

        Raises:
            ValueError: when neither multiplier is equal to 1.
        """
        # At least one side must be used as-is (multiplier of 1).
        if 1 in (multiplier_1, multiplier_2):
            return

        raise ValueError(
            "The number of fragments of reactions are not an exact multiple of "
            f"each other: the multipliers are {multiplier_1} and {multiplier_2}."
        )