minor wraps up edits for next beta
JacksonBurns committed Apr 2, 2024
1 parent abbda8e commit 8351850
Showing 6 changed files with 71 additions and 30 deletions.
3 changes: 2 additions & 1 deletion examples/oom_training.py
@@ -225,7 +225,8 @@ def __init__(
# mock the target scaler used for reporting some human-readable metrics
self.target_scaler = SimpleNamespace(n_features_in_=1, inverse_transform=lambda i: np.array(i))

def setup(self, stage=None): ... # skip feature scaling and dataset splitting
def setup(self, stage=None):
... # skip feature scaling and dataset splitting

def _init_dataloader(self, shuffle, idxs):
return TorchDataloader(
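One detail in this example file is worth calling out: the `SimpleNamespace` mock above provides only the two attributes the metric-reporting code touches, with an identity `inverse_transform`. A minimal self-contained sketch of the same pattern (values illustrative):

```python
import numpy as np
from types import SimpleNamespace

# stands in for a fitted sklearn scaler: only the attributes the reporting
# code reads are provided, and inverse_transform is the identity
target_scaler = SimpleNamespace(n_features_in_=1, inverse_transform=lambda i: np.array(i))

print(target_scaler.inverse_transform([[1.0], [2.0]]))  # [[1.] [2.]]
```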
89 changes: 63 additions & 26 deletions fastprop/fastprop_core.py
@@ -65,6 +65,12 @@ class ArbitraryDataset(TorchDataset):
"""

def __init__(self, data, targets):
"""Initialize a basic torch-compatible data
Args:
data (torch.tensor): features
targets (torch.tensor): targets
"""
self.data = data
self.length = len(targets)
self.targets = targets
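For context, a runnable sketch of the whole class; `__len__` and `__getitem__` are not shown in this hunk and are assumed to be the conventional implementations:

```python
from torch.utils.data import Dataset as TorchDataset

class ArbitraryDataset(TorchDataset):
    """Basic torch-compatible dataset over pre-computed features and targets."""

    def __init__(self, data, targets):
        self.data = data
        self.length = len(targets)
        self.targets = targets

    def __len__(self):  # assumed: required by torch's DataLoader
        return self.length

    def __getitem__(self, index):  # assumed pairing of feature row and target
        return self.data[index], self.targets[index]
```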
@@ -107,24 +113,24 @@ def __init__(
"""Core fastprop model.
Args:
feature_scaler (sklearn scaler): Scaler fit to the feature variables, kept for reporting metrics on a human-readable scale.
target_scaler (sklearn scaler): Scaler fit to the target variables, kept for reporting metrics on a human-readable scale.
num_epochs (int): Maximum allowed number of training epochs.
input_size (int): Number of input neurons.
hidden_size (int): Number of neurons in the hidden layers.
learning_rate (float): Learning rate.
readout_size (int): Number of targets to readout.
learning_rate (float): Learning rate for Adam.
fnn_layers (int): Number of layers in the FNN.
problem_type (str): Problem type, one of regression, multiclass, multilabel, or binary.
verbose (bool, optional): Reduces some logging if true. Defaults to False.
num_classes (int, optional): Number of classes for multiclass classification. Defaults to None.
cleaned_data (numpy.ndarray or torch.Tensor): Descriptors, already subjected to preprocessing (dropping operations)
targets (numpy.ndarray or torch.Tensor): Scaled targets in the same order as the descriptors.
batch_size (int): Samples/per batch - for small feature sets like in fastprop, set as high as possible.
random_seed (int): Seed for RNG.
train_size (float): Fraction of data for training.
val_size (float): Fraction of data for validation.
test_size (float): Fraction of data for test.
sampler (str): Type of sampler to use, see astartes for a list of implemented samplers.
smiles (list[str], optional): SMILES strings corresponding to the molecules for use in some samplers. Defaults to None.
cleaned_data (numpy.ndarray): Descriptors with no missing values.
targets (numpy.ndarray): Scaled targets in the same order as the descriptors.
target_names (sequence): Sequence of names for the targets.
batch_size (int): Number of molecules per training batch in SGD.
random_seed (int): Seed for splitting.
train_size (float): Fraction of data for training, nonzero.
val_size (float): Fraction of data for validation, nonzero.
test_size (float): Fraction of data for test, nonzero.
sampler (str): Sampling approach passed to astartes, e.g. random, scaffold.
smiles (sequence): Sequence of SMILES strings corresponding to the features.
verbose (bool, optional): Print extra information. Defaults to True.
"""
super().__init__()
# used for data preparation and training
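A hypothetical instantiation matching the documented arguments (keyword names are taken from the docstring above; the values, the variables, and the class name `fastprop` are illustrative assumptions, not confirmed by this diff):

```python
model = fastprop(
    num_epochs=100,
    input_size=1613,            # e.g. the full mordred descriptor set
    hidden_size=1800,
    readout_size=1,             # one regression target
    learning_rate=1e-4,
    fnn_layers=2,
    problem_type="regression",
    cleaned_data=descriptors,   # numpy.ndarray, already preprocessed
    targets=scaled_targets,     # numpy.ndarray, same row order
    target_names=["boiling_point"],
    batch_size=2048,            # small feature vectors allow large batches
    random_seed=42,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    sampler="random",
    smiles=smiles_list,
)
```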
@@ -167,6 +173,7 @@ def __init__(
self.save_hyperparameters(ignore=("cleaned_data", "targets", "smiles"))

def _split(self):
"""Sets self.*_idxs for training, validation, and test"""
logger.info(f"Sampling dataset with {self.sampler} sampler.")
split_kwargs = dict(
train_size=self.train_size,
@@ -197,6 +204,7 @@ def _split(self):
) = train_val_test_split_molecules(self.smiles, **split_kwargs)
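The split itself delegates to astartes; a sketch of the molecule-aware variant, under the assumption that `return_indices=True` is among the `split_kwargs` (astartes then returns index arrays rather than the data itself):

```python
from astartes.molecules import train_val_test_split_molecules

# leading *_ absorbs any extra returns some samplers produce
*_, train_idxs, val_idxs, test_idxs = train_val_test_split_molecules(
    smiles_list,          # assumed: list of SMILES strings
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    sampler="random",
    random_state=42,
    return_indices=True,
)
```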

def setup(self, stage=None):
"""Split and rescale the data."""
if stage == "fit":
self._split()
logger.info("Imputing and rescaling input features.")
@@ -228,6 +236,15 @@ def setup(self, stage=None):
self.targets = torch.tensor(self.targets, dtype=torch.float32)

def _init_dataloader(self, shuffle, idxs):
"""Helper method to initialize dataloaders.
Args:
shuffle (bool): Passed to torch's DataLoader
idxs (np.array): Indices of the overall dataset to include in the dataloader.
Returns:
torch.utils.data.DataLoader: DataLoader instance.
"""
return torch.utils.data.DataLoader(
ArbitraryDataset(
[self.data[i] for i in idxs],
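The call is truncated by the diff; a plausible completion (only `batch_size` is grounded in the docstring and `shuffle` in the method signature, the rest is assumption):

```python
def _init_dataloader(self, shuffle, idxs):
    return torch.utils.data.DataLoader(
        ArbitraryDataset(
            [self.data[i] for i in idxs],
            [self.targets[i] for i in idxs],
        ),
        batch_size=self.batch_size,
        shuffle=shuffle,
    )
```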
@@ -249,6 +266,17 @@ def test_dataloader(self):
return self._init_dataloader(False, self.test_idxs)

def get_metrics(problem_type):
"""Get the metrics for training and early stopping based on the problem type.
Args:
problem_type (str): Regression, multilabel, multiclass, or binary.
Raises:
RuntimeError: If the problem type is not supported.
Returns:
tuple[str, str]: Names of the training and early-stopping metrics.
"""
if problem_type == "regression":
return "mse", "rmse"
elif problem_type == "multilabel":
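Only the regression branch is fully visible here; its contract is a pair of metric names that callers unpack, e.g.:

```python
training_metric, early_stopping_metric = get_metrics("regression")
# training_metric == "mse", early_stopping_metric == "rmse"
```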
@@ -266,7 +294,7 @@ def forward(self, x):
return x

def log(self, name, value, **kwargs):
if (in_distributed := distributed.is_initialized()):
if in_distributed := distributed.is_initialized():
if not isinstance(value, torch.Tensor):
value = torch.tensor(value)
value = value.to(self.device)
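The branch is cut off by the diff; a plausible continuation (a method sketch, not the confirmed code) would average the value across ranks before deferring to Lightning's `log`:

```python
import torch
import torch.distributed as distributed

def log(self, name, value, **kwargs):
    if in_distributed := distributed.is_initialized():
        if not isinstance(value, torch.Tensor):
            value = torch.tensor(value)
        value = value.to(self.device)
        # average across all ranks so every process logs the same number
        distributed.all_reduce(value, op=distributed.ReduceOp.SUM)
        value = value / distributed.get_world_size()
    return super().log(name, value, **kwargs)
```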
@@ -294,6 +322,7 @@ def test_step(self, batch, batch_idx):
return loss

def predict_step(self, X):
# calls forward, but applies the appropriate transforms and activations
X = self.mean_imputer.transform(X)
X = self.feature_scaler.transform(X)
X = torch.tensor(X, dtype=torch.float32, device=self.device)
@@ -307,6 +336,7 @@ def predict_step(self, X):
return torch.nn.functional.softmax(logits, dim=1).detach().cpu()

def _machine_loss(self, batch, reduction="mean", return_all=False):
# reports the rescaled loss directly on the logits for computational efficiency
x, y = batch
y_hat = self.forward(x)
if self.problem_type == "regression":
@@ -411,8 +441,7 @@ def train_and_test(
Args:
outdir (str): Output directory for log files.
datamodule (ArbitraryDataModule): Lightning-style datamodule.
model (LightningModule): fastprop model architecture itself.
lightning_module (LightningModule): fastprop model architecture itself.
patience (int, optional): Maximum number of epochs to wait before stopping early. Defaults to 5.
verbose (bool, optional): Set to false for less output. Defaults to True.
no_logs (bool, optional): Set to true to disable logs. Defaults to False.
@@ -502,21 +531,29 @@ def _training_loop(
Args:
number_repeats (int): Number of repetitions.
number_features (int): Number of features in the input layer.
target_scaler (sklearn scaler): Scaler used on target variables, used for reporting metrics in human-scale.
number_epochs (int): Maximum allowed number of training epochs.
input_size (int): Number of input neurons.
hidden_size (int): Number of neurons in the hidden layers.
learning_rate (float): Learning rate.
readout_size (int): Number of targets to readout.
learning_rate (float): Learning rate for Adam.
fnn_layers (int): Number of layers in the FNN.
output_directory (str): Destination directory for file output.
patience (int, optional): Maximum number of epochs to wait before stopping early. Defaults to 5.
problem_type (str): Problem type, one of regression, multiclass, multilabel, or binary.
num_classes (int, optional): Number of classes for multiclass classification. Defaults to None.
output_directory (str): Output directory for log files.
datamodule (pl.DataModule): Basic data module.
patience (int): Maximum number of epochs to wait before stopping training early.
hopt (bool, optional): Set to true when running hyperparameter optimization to turn off logs, logging, etc. Defaults to False.
train_size (float): Fraction of data for training, nonzero.
val_size (float): Fraction of data for validation, nonzero.
test_size (float): Fraction of data for test, nonzero.
sampler (str): Sampling approach passed to astartes, e.g. random, scaffold.
smiles (sequence): Sequence of SMILES strings corresponding to the features.
cleaned_data (numpy.ndarray): Descriptors with no missing values.
targets (numpy.ndarray): Scaled targets in the same order as the descriptors.
target_names (sequence): Sequence of names for the targets.
batch_size (int): Number of molecules per training batch in SGD.
random_seed (int): Seed for splitting.
hopt (bool, optional): Disables checkpointing and printing when True. Defaults to False.
Returns:
list[dict{metric: score}]: Output of lightning model.test and model.validate, one pair per repetition.
"""
if not hopt:
logger.info(f"Run 'tensorboard --logdir {os.path.join(output_directory, 'tensorboard_logs')}' to track training progress.")
5 changes: 3 additions & 2 deletions fastprop/utils/calculate_descriptors.py
@@ -27,13 +27,13 @@ def _f(in_tuple):
return mordred_descs


def calculate_mordred_desciptors(descriptors, rdkit_mols, n_procs, strategy: Literal["fast", "low-memory"] = "fast", ignore_3d=True):
def calculate_mordred_desciptors(descriptors, rdkit_mols, n_procs: int = 2, strategy: Literal["fast", "low-memory"] = "fast", ignore_3d=True):
"""Wraps the mordred descriptor calculator.
Args:
descriptors (Mordred descriptor instances): Descriptors to calculate
rdkit_mols (list[rdkit mols]): List of RDKit molecules.
n_procs (int): Number of parallel processes.
n_procs (int): Number of parallel processes. Defaults to 2.
strategy (Literal["fast", "low-memory"], optional): Parallelization strategy. Defaults to "fast".
ignore_3d (bool, optional): Skip 3D descriptors, even if present in the given list. Defaults to True.
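Hypothetical usage of the wrapper (the descriptor set and SMILES are illustrative; mordred's `Calculator.descriptors` is assumed to supply the descriptor instances the first argument expects):

```python
from mordred import Calculator, descriptors
from rdkit import Chem

mols = [Chem.MolFromSmiles(s) for s in ("CCO", "c1ccccc1")]
mordred_descs = calculate_mordred_desciptors(
    Calculator(descriptors, ignore_3D=True).descriptors,
    mols,
    n_procs=2,
    strategy="fast",
)
```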
@@ -80,6 +80,7 @@ def _get_descs(precomputed, input_file, output_directory, descriptors, enable_ca
descriptors (list): fastprop set of descriptors to calculate.
enable_cache (bool): Allow/disallow caching mechanism.
mols (list): RDKit molecules.
as_df (bool): Set to true to return the result as a pandas DataFrame. Defaults to False.
"""
descs = None
if precomputed:
1 change: 1 addition & 0 deletions fastprop/utils/load_data.py
@@ -52,6 +52,7 @@ def load_from_csv(fpath, smiles_column, target_columns):


def load_saved_desc(fpath):
# loads descriptors previously saved by fastprop, coercing any non-numeric values (missing, strings, etc.) to NaN
d = pd.read_csv(fpath, low_memory=False)
d = d.apply(pd.to_numeric, errors="coerce")
descs = d[d.columns[1:]].to_numpy(dtype=float)
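Assuming the truncated function goes on to return `descs`, reloading a cached descriptor file is then a one-liner (filename illustrative):

```python
descs = load_saved_desc("cached_descriptors.csv")  # ndarray of float, NaN where non-numeric
```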
3 changes: 2 additions & 1 deletion paper/paper.md
@@ -11,7 +11,7 @@ affiliations:
name: Massachusetts Institute of Technology, Cambridge, MA
- id: "*"
name: "Corresponding: whgreen@mit.edu"
date: January 31, 2024
date: 2 April, 2024
geometry: margin=1in
bibliography: paper.bib
citation-style: journal-of-cheminformatics
@@ -246,6 +246,7 @@ The authors prefer more readily interpretable metrics such as (Weighted) Mean Ab

All metrics are defined according to their typical formulae, which are readily available online and are implemented in common software packages.
Those presented here are summarized below, first for regression:

- Mean Absolute Error (MAE): Absolute difference between predictions and ground truth averaged across dataset; scale-dependent.
- Root Mean Squared Error (RMSE): Absolute differences _squared_ and then averaged; scale-dependent.
- Mean Absolute Percentage Error (MAPE): MAE except that differences are relative (i.e. divided by the ground truth); scale-independent, ranging from 0 (best) upward (all three are written out symbolically below).
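In the standard notation ($y_i$ the ground truth, $\hat{y}_i$ the prediction, $n$ the dataset size), the three regression metrics above are:

$$
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|y_i-\hat{y}_i\right| \qquad
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^{2}} \qquad
\mathrm{MAPE} = \frac{1}{n}\sum_{i=1}^{n}\left|\frac{y_i-\hat{y}_i}{y_i}\right|
$$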
Binary file modified paper/paper.pdf
Binary file not shown.
