diff --git a/examples/oom_training.py b/examples/oom_training.py
index 15ca4d7..978275b 100644
--- a/examples/oom_training.py
+++ b/examples/oom_training.py
@@ -225,7 +225,8 @@ def __init__(
         # mock the target scaler used for reporting some human-readable metrics
         self.target_scaler = SimpleNamespace(n_features_in_=1, inverse_transform=lambda i: np.array(i))

-    def setup(self, stage=None): ...  # skip feature scaling and dataset splitting
+    def setup(self, stage=None):
+        ...  # skip feature scaling and dataset splitting

     def _init_dataloader(self, shuffle, idxs):
         return TorchDataloader(
diff --git a/fastprop/fastprop_core.py b/fastprop/fastprop_core.py
index 5b3d022..ebd1d36 100644
--- a/fastprop/fastprop_core.py
+++ b/fastprop/fastprop_core.py
@@ -65,6 +65,12 @@ class ArbitraryDataset(TorchDataset):
     """

     def __init__(self, data, targets):
+        """Initialize a basic torch-compatible dataset.
+
+        Args:
+            data (torch.tensor): features
+            targets (torch.tensor): targets
+        """
         self.data = data
         self.length = len(targets)
         self.targets = targets
@@ -107,24 +113,24 @@ def __init__(
         """Core fastprop model.

         Args:
-            feature_scaler (sklearn scaler): Scaler used on feature variables, used for reporting metrics in human-scale.
-            target_scaler (sklearn scaler): Scaler used on target variables, used for reporting metrics in human-scale.
             num_epochs (int): Maximum allowed number of training epochs.
+            input_size (int): Number of input neurons.
             hidden_size (int): Number of neurons in the hidden layers.
-            learning_rate (float): Learning rate.
+            readout_size (int): Number of targets to read out.
+            learning_rate (float): Learning rate for Adam.
             fnn_layers (int): Number of layers in the FNN.
             problem_type (str): Problem type, i.e. regression, multiclass, multilabel, or binary.
-            verbose (bool, optional): Reduces some logging if true. Defaults to False.
-            num_classes (int, optional): Number of classes for multiclass classification. Defaults to None.
-            cleaned_data (numpy.ndarray or torch.Tensor): Descriptors, already subjected to preprocessing (dropping operations)
-            targets (numpy.ndarray or torch.Tensor): Scaled targets in the same order as the descriptors.
-            batch_size (int): Samples/per batch - for small feature sets like in fastprop, set as high as possible.
-            random_seed (int): Seed for RNG.
-            train_size (float): Fraction of data for training.
-            val_size (float): Fraction of data for validation.
-            test_size (float): Fraction of data for test.
-            sampler (str): Type of sampler to use, see astartes for a list of implemented samplers.
-            smiles (list[str], optional): SMILES strings corresponding to the molecules for use in some samplers. Defaults to None.
+            cleaned_data (numpy.ndarray): Descriptors with no missing values.
+            targets (numpy.ndarray): Scaled targets in the same order as the descriptors.
+            target_names (sequence): Sequence of names for the targets.
+            batch_size (int): Number of molecules per training batch in SGD.
+            random_seed (int): Seed for splitting.
+            train_size (float): Fraction of data for training; must be nonzero.
+            val_size (float): Fraction of data for validation; must be nonzero.
+            test_size (float): Fraction of data for test; must be nonzero.
+            sampler (str): Sampling approach passed to astartes, e.g. "random" or "scaffold".
+            smiles (sequence): Sequence of SMILES strings corresponding to the features.
+            verbose (bool, optional): Print extra information. Defaults to True.
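+
+        Example:
+            An illustrative sketch only - it assumes the enclosing class is
+            importable as ``fastprop`` and that descriptors ``X``, scaled
+            targets ``y``, and a list of ``smiles`` are already prepared;
+            every value below is a hypothetical choice, not a recommended
+            default::
+
+                model = fastprop(
+                    num_epochs=100,
+                    input_size=X.shape[1],
+                    hidden_size=1800,
+                    readout_size=y.shape[1],
+                    learning_rate=0.0001,
+                    fnn_layers=2,
+                    problem_type="regression",
+                    cleaned_data=X,
+                    targets=y,
+                    target_names=["target"],
+                    batch_size=64,
+                    random_seed=42,
+                    train_size=0.8,
+                    val_size=0.1,
+                    test_size=0.1,
+                    sampler="random",
+                    smiles=smiles,
+                )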
""" super().__init__() # used for data preparation and training @@ -167,6 +173,7 @@ def __init__( self.save_hyperparameters(ignore=("cleaned_data", "targets", "smiles")) def _split(self): + """Sets self.*_idxs for training, validation, and test""" logger.info(f"Sampling dataset with {self.sampler} sampler.") split_kwargs = dict( train_size=self.train_size, @@ -197,6 +204,7 @@ def _split(self): ) = train_val_test_split_molecules(self.smiles, **split_kwargs) def setup(self, stage=None): + """Split and rescale the data.""" if stage == "fit": self._split() logger.info("Imputing and rescaling input features.") @@ -228,6 +236,15 @@ def setup(self, stage=None): self.targets = torch.tensor(self.targets, dtype=torch.float32) def _init_dataloader(self, shuffle, idxs): + """Helper method to initialize dataloaders. + + Args: + shuffle (bool): Passed to torch's DataLoader + idxs (np.array): Indexes of overall dataset to include in dataloader. + + Returns: + torch.utils.data.Dataloader: Dataloader instance. + """ return torch.utils.data.DataLoader( ArbitraryDataset( [self.data[i] for i in idxs], @@ -249,6 +266,17 @@ def test_dataloader(self): return self._init_dataloader(False, self.test_idxs) def get_metrics(problem_type): + """Get the metrics for training and early stopping based on the problem type. + + Args: + problem_type (str): Regression, multilabel, multiclass, or binary. + + Raises: + RuntimeError: Unsupported problem types + + Returns: + str: names for the two metrics + """ if problem_type == "regression": return "mse", "rmse" elif problem_type == "multilabel": @@ -266,7 +294,7 @@ def forward(self, x): return x def log(self, name, value, **kwargs): - if (in_distributed := distributed.is_initialized()): + if in_distributed := distributed.is_initialized(): if not isinstance(value, torch.Tensor): value = torch.tensor(value) value = value.to(self.device) @@ -294,6 +322,7 @@ def test_step(self, batch, batch_idx): return loss def predict_step(self, X): + # calls forward, but applies the appropriate transforms and activations X = self.mean_imputer.transform(X) X = self.feature_scaler.transform(X) X = torch.tensor(X, dtype=torch.float32, device=self.device) @@ -307,6 +336,7 @@ def predict_step(self, X): return torch.nn.functional.softmax(logits, dim=1).detach().cpu() def _machine_loss(self, batch, reduction="mean", return_all=False): + # reports the rescaled loss directly on the logits for computational efficiency x, y = batch y_hat = self.forward(x) if self.problem_type == "regression": @@ -411,8 +441,7 @@ def train_and_test( Args: outdir (str): Output directory for log files. - datamodule (ArbitraryDataModule): Lightning-style datamodule. - model (LightingModule): fastprop model architecture itself. + lightning_module (LightingModule): fastprop model architecture itself. patience (int, optional): Maximum number of epochs to wait before stopping early. Defaults to 5. verbose (bool, optional): Set to false for less output. Defaults to True. no_logs (bool, optional): Set to true to disable logs. Defaults to False. @@ -502,21 +531,29 @@ def _training_loop( Args: number_repeats (int): Number of repetitions. - number_features (int): Number of features in the input layer. - target_scaler (sklearn scaler): Scaler used on target variables, used for reporting metrics in human-scale. number_epochs (int): Maximum allowed number of training epochs. + input_size (int): Number of input neurons. hidden_size (int): Number of neurons in the hidden layers. - learning_rate (float): Learning rate. 
+    """
     if problem_type == "regression":
         return "mse", "rmse"
     elif problem_type == "multilabel":
@@ -266,7 +294,7 @@ def forward(self, x):
         return x

     def log(self, name, value, **kwargs):
-        if (in_distributed := distributed.is_initialized()):
+        if in_distributed := distributed.is_initialized():
             if not isinstance(value, torch.Tensor):
                 value = torch.tensor(value)
             value = value.to(self.device)
@@ -294,6 +322,7 @@ def test_step(self, batch, batch_idx):
         return loss

     def predict_step(self, X):
+        # calls forward, but applies the appropriate transforms and activations
         X = self.mean_imputer.transform(X)
         X = self.feature_scaler.transform(X)
         X = torch.tensor(X, dtype=torch.float32, device=self.device)
         return torch.nn.functional.softmax(logits, dim=1).detach().cpu()
@@ -307,6 +336,7 @@ def _machine_loss(self, batch, reduction="mean", return_all=False):
+        # reports the rescaled loss directly on the logits for computational efficiency
         x, y = batch
         y_hat = self.forward(x)
         if self.problem_type == "regression":
@@ -411,8 +441,7 @@ def train_and_test(

     Args:
         outdir (str): Output directory for log files.
-        datamodule (ArbitraryDataModule): Lightning-style datamodule.
-        model (LightingModule): fastprop model architecture itself.
+        lightning_module (LightningModule): The fastprop model architecture itself.
         patience (int, optional): Maximum number of epochs to wait before stopping early. Defaults to 5.
         verbose (bool, optional): Set to false for less output. Defaults to True.
         no_logs (bool, optional): Set to true to disable logs. Defaults to False.
@@ -502,21 +531,29 @@ def _training_loop(

     Args:
         number_repeats (int): Number of repetitions.
-        number_features (int): Number of features in the input layer.
-        target_scaler (sklearn scaler): Scaler used on target variables, used for reporting metrics in human-scale.
         number_epochs (int): Maximum allowed number of training epochs.
+        input_size (int): Number of input neurons.
         hidden_size (int): Number of neurons in the hidden layers.
-        learning_rate (float): Learning rate.
+        readout_size (int): Number of targets to read out.
+        learning_rate (float): Learning rate for Adam.
         fnn_layers (int): Number of layers in the FNN.
+        output_directory (str): Destination directory for file output.
+        patience (int, optional): Maximum number of epochs to wait before stopping early. Defaults to 5.
         problem_type (str): Problem type, i.e. regression, multiclass, multilabel, or binary.
-        num_classes (int, optional): Number of classes for multiclass classification. Defaults to None.
-        output_directory (str): Output directory for log files.
-        datamodule (pl.DataModule): Basic data module.
-        patience (int): Maximum number of epochs to wait before stopping training early.
-        hopt (bool, optional): Set to true when running hyperparameter optimization to turn off logs, logging, etc. Defaults to False.
+        train_size (float): Fraction of data for training; must be nonzero.
+        val_size (float): Fraction of data for validation; must be nonzero.
+        test_size (float): Fraction of data for test; must be nonzero.
+        sampler (str): Sampling approach passed to astartes, e.g. "random" or "scaffold".
+        smiles (sequence): Sequence of SMILES strings corresponding to the features.
+        cleaned_data (numpy.ndarray): Descriptors with no missing values.
+        targets (numpy.ndarray): Scaled targets in the same order as the descriptors.
+        target_names (sequence): Sequence of names for the targets.
+        batch_size (int): Number of molecules per training batch in SGD.
+        random_seed (int): Seed for splitting.
+        hopt (bool, optional): Disables checkpointing and printing when True. Defaults to False.

     Returns:
-        list[dict{metric: score}]: Output of lightning model.test and model.validate, one pair per repetition.
+        list[dict]: Output of lightning model.test and model.validate, one pair per repetition.
     """
     if not hopt:
         logger.info(f"Run 'tensorboard --logdir {os.path.join(output_directory, 'tensorboard_logs')}' to track training progress.")
diff --git a/fastprop/utils/calculate_descriptors.py b/fastprop/utils/calculate_descriptors.py
index 50089e9..28882f0 100644
--- a/fastprop/utils/calculate_descriptors.py
+++ b/fastprop/utils/calculate_descriptors.py
@@ -27,13 +27,13 @@ def _f(in_tuple):
     return mordred_descs


-def calculate_mordred_desciptors(descriptors, rdkit_mols, n_procs, strategy: Literal["fast", "low-memory"] = "fast", ignore_3d=True):
+def calculate_mordred_desciptors(descriptors, rdkit_mols, n_procs: int = 2, strategy: Literal["fast", "low-memory"] = "fast", ignore_3d=True):
     """Wraps the mordred descriptor calculator.

     Args:
         descriptors (Mordred descriptor instances): Descriptors to calculate
         rdkit_mols (list[rdkit mols]): List of RDKit molecules.
-        n_procs (int): Number of parallel processes.
+        n_procs (int, optional): Number of parallel processes. Defaults to 2.
         strategy (Literal["fast", "low-memory"], optional): Parallelization strategy. Defaults to "fast".
         ignore_3d (bool, optional): Include 3D descriptors, if in given list. Defaults to True.
@@ -80,6 +80,7 @@ def _get_descs(precomputed, input_file, output_directory, descriptors, enable_ca
         descriptors (list): fastprop set of descriptors to calculate.
         enable_cache (bool): Allow/disallow caching mechanism.
         mols (list): RDKit molecules.
+        as_df (bool): Set to true to return the result as a pandas DataFrame. Defaults to False.
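+
+    Example:
+        A hypothetical call, for illustration only - the file name, directory,
+        and argument values here are assumptions, not defaults::
+
+            descs = _get_descs(
+                precomputed=None,
+                input_file="molecules.csv",
+                output_directory="output",
+                descriptors=descriptors,
+                enable_cache=True,
+                mols=rdkit_mols,
+                as_df=True,
+            )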
""" descs = None if precomputed: diff --git a/fastprop/utils/load_data.py b/fastprop/utils/load_data.py index 10ad267..3a6ebfc 100644 --- a/fastprop/utils/load_data.py +++ b/fastprop/utils/load_data.py @@ -52,6 +52,7 @@ def load_from_csv(fpath, smiles_column, target_columns): def load_saved_desc(fpath): + # loads descriptors previously saved by fastprop, forces any non-numeric values (missing, strings, etc) to be nan. d = pd.read_csv(fpath, low_memory=False) d = d.apply(pd.to_numeric, errors="coerce") descs = d[d.columns[1:]].to_numpy(dtype=float) diff --git a/paper/paper.md b/paper/paper.md index c2c5b4d..2d7ec30 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -11,7 +11,7 @@ affiliations: name: Massachusetts Institute of Technology, Cambridge, MA - id: "*" name: "Corresponding: whgreen@mit.edu" -date: January 31, 2024 +date: 2 April, 2024 geometry: margin=1in bibliography: paper.bib citation-style: journal-of-cheminformatics @@ -246,6 +246,7 @@ The authors prefer more readily interpretable metrics such as (Weighted) Mean Ab All metrics are defined according to their typical formulae which are readily available online and are implemented in common software packages. Those presented here are summarized below, first for regression: + - Mean Absolute Error (MAE): Absolute difference between predictions and ground truth averaged across dataset; scale-dependent. - Root Mean Squared Error (RMSE): Absolute differences _squared_ and then averaged; scale-dependent. - Mean Absolute Percentage Error (MAPE): MAE except that differences are relative (i.e. divided by the ground truth); scale-independent, range 0 (best) and up. diff --git a/paper/paper.pdf b/paper/paper.pdf index 80dd3a9..beeeb2b 100644 Binary files a/paper/paper.pdf and b/paper/paper.pdf differ