Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework validation for measurements and pending_experiments #456

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
46 changes: 10 additions & 36 deletions baybe/campaign.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from baybe.utils.boolean import eq_dataframe
from baybe.utils.dataframe import filter_df, fuzzy_row_match
from baybe.utils.plotting import to_string
from baybe.utils.validation import validate_parameter_input, validate_target_input

if TYPE_CHECKING:
from botorch.posteriors import Posterior
Expand Down Expand Up @@ -201,48 +202,24 @@ def add_measurements(
Each addition of data is considered a new batch. Added results are checked for
validity. Categorical values need to have an exact match. For numerical values,
a campaign flag determines if values that lie outside a specified tolerance
are accepted.
Note that this modifies the provided data in-place.
are accepted. Possible validation exceptions are documented in
:func:`baybe.utils.validation.validate_target_input` and
:func:`baybe.utils.validation.validate_parameter_input`.

Args:
data: The data to be added (with filled values for targets). Preferably
created via :func:`baybe.campaign.Campaign.recommend`.
numerical_measurements_must_be_within_tolerance: Flag indicating if
numerical parameters need to be within their tolerances.

Raises:
ValueError: If one of the targets has missing values or NaNs in the provided
dataframe.
TypeError: If the target has non-numeric entries in the provided dataframe.
"""
# Invalidate recommendation cache first (in case of uncaught exceptions below)
self._cached_recommendation = pd.DataFrame()

# Check if all targets have valid values
for target in self.targets:
if data[target.name].isna().any():
raise ValueError(
f"The target '{target.name}' has missing values or NaNs in the "
f"provided dataframe. Missing target values are not supported."
)
if data[target.name].dtype.kind not in "iufb":
raise TypeError(
f"The target '{target.name}' has non-numeric entries in the "
f"provided dataframe. Non-numeric target values are not supported."
)

# Check if all targets have valid values
for param in self.parameters:
if data[param.name].isna().any():
raise ValueError(
f"The parameter '{param.name}' has missing values or NaNs in the "
f"provided dataframe. Missing parameter values are not supported."
)
if param.is_numerical and (data[param.name].dtype.kind not in "iufb"):
raise TypeError(
f"The numerical parameter '{param.name}' has non-numeric entries in"
f" the provided dataframe."
)
# Validate target and parameter input values
validate_target_input(data, self.targets)
validate_parameter_input(
data, self.parameters, numerical_measurements_must_be_within_tolerance
)

# Read in measurements and add them to the database
self.n_batches_done += 1
Expand All @@ -257,10 +234,7 @@ def add_measurements(
# Update metadata
if self.searchspace.type in (SearchSpaceType.DISCRETE, SearchSpaceType.HYBRID):
idxs_matched = fuzzy_row_match(
self.searchspace.discrete.exp_rep,
data,
self.parameters,
numerical_measurements_must_be_within_tolerance,
self.searchspace.discrete.exp_rep, data, self.parameters
)
self._searchspace_metadata.loc[idxs_matched, _MEASURED] = True

Expand Down
4 changes: 4 additions & 0 deletions baybe/recommenders/pure/bayesian/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from baybe.searchspace import SearchSpace
from baybe.surrogates import CustomONNXSurrogate, GaussianProcessSurrogate
from baybe.surrogates.base import IndependentGaussianSurrogate, SurrogateProtocol
from baybe.utils.validation import validate_parameter_input


@define
Expand Down Expand Up @@ -123,6 +124,9 @@ def recommend(
if isinstance(self._surrogate_model, CustomONNXSurrogate):
CustomONNXSurrogate.validate_compatibility(searchspace)

if pending_experiments is not None:
validate_parameter_input(pending_experiments, searchspace.parameters)

self._setup_botorch_acqf(
searchspace, objective, measurements, pending_experiments
)
Expand Down
2 changes: 1 addition & 1 deletion baybe/simulation/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def simulate_experiment(
campaign = deepcopy(campaign)

# Add the initial data
if initial_data is not None:
if (initial_data is not None) and (len(initial_data) > 0):
campaign.add_measurements(initial_data)

# For impute_mode 'ignore', do not recommend space entries that are not
Expand Down
9 changes: 1 addition & 8 deletions baybe/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,7 @@ def telemetry_record_recommended_measurement_percentage(
if is_enabled():
if len(cached_recommendation) > 0:
recommended_measurements_percentage = (
len(
fuzzy_row_match(
cached_recommendation,
measurements,
parameters,
numerical_measurements_must_be_within_tolerance,
)
)
len(fuzzy_row_match(cached_recommendation, measurements, parameters))
AVHopp marked this conversation as resolved.
Show resolved Hide resolved
/ len(cached_recommendation)
* 100.0
)
Expand Down
96 changes: 61 additions & 35 deletions baybe/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import functools
import logging
from collections.abc import Callable, Collection, Iterable, Sequence
from collections.abc import Callable, Iterable, Sequence
from typing import TYPE_CHECKING, Literal, TypeVar, overload

import numpy as np
Expand Down Expand Up @@ -70,7 +70,7 @@ def to_tensor(*x: np.ndarray | pd.DataFrame) -> Tensor | tuple[Tensor, ...]:

def add_fake_measurements(
data: pd.DataFrame,
targets: Collection[Target],
targets: Iterable[Target],
good_reference_values: dict[str, list] | None = None,
good_intervals: dict[str, tuple[float, float]] | None = None,
bad_intervals: dict[str, tuple[float, float]] | None = None,
Expand Down Expand Up @@ -278,6 +278,55 @@ def add_parameter_noise(
return data


def create_fake_input(
    parameters: Iterable[Parameter],
    targets: Iterable[Target],
    n_rows: int = 1,
    **kwargs: dict,
) -> pd.DataFrame:
    """Generate random but valid measurement input for a set of parameters/targets.

    The result is intended as input for
    :meth:`baybe.campaign.Campaign.add_measurements`. Parameter values are drawn
    without noise; to perturb them, apply
    :func:`baybe.utils.dataframe.add_parameter_noise` to the returned dataframe.

    Args:
        parameters: The parameters, each contributing one column named after it.
        targets: The targets for which fake measurement values are added.
        n_rows: Number of rows to generate.
        **kwargs: Forwarded unchanged to
            :func:`baybe.utils.dataframe.add_fake_measurements`.

    Returns:
        A dataframe with ``n_rows`` rows of fake parameter and target values.

    Raises:
        ValueError: If less than one row was requested.
    """
    # Guard clause: generating zero (or a negative number of) rows is meaningless
    if n_rows < 1:
        raise ValueError(
            f"'{create_fake_input.__name__}' must at least create one row, but the "
            f"requested number was: {n_rows}."
        )

    def _draw(p: Parameter) -> np.ndarray:
        """Sample ``n_rows`` values for one parameter from its own definition."""
        if p.is_discrete:
            # Discrete parameters: sample with replacement from the listed values
            return np.random.choice(p.values, n_rows, replace=True)
        # Otherwise: sample uniformly between the parameter's bounds
        return np.random.uniform(p.bounds.lower, p.bounds.upper, n_rows)

    # One column per parameter, drawn in the order the parameters are provided
    data = pd.DataFrame.from_dict({p.name: _draw(p) for p in parameters})

    # The return value is discarded here, i.e. the helper is relied on to attach
    # the fake target columns to ``data`` in place
    add_fake_measurements(data, targets, **kwargs)

    return data


def df_drop_single_value_columns(
df: pd.DataFrame, lst_exclude: list = None
) -> pd.DataFrame:
Expand Down Expand Up @@ -416,33 +465,31 @@ def fuzzy_row_match(
left_df: pd.DataFrame,
right_df: pd.DataFrame,
parameters: Sequence[Parameter],
numerical_measurements_must_be_within_tolerance: bool,
) -> pd.Index:
"""Match row of the right dataframe to the rows of the left dataframe.

This is useful for validity checks and to automatically match measurements to
entries in the search space, e.g. to detect which ones have been measured.
For categorical parameters, there needs to be an exact match with any of the
allowed values. For numerical parameters, the user can decide via a flag
whether values outside the tolerance should be accepted.
This is useful for matching measurements to entries in the search space, e.g. to
Scienfitz marked this conversation as resolved.
Show resolved Hide resolved
detect which ones have been measured. For categorical parameters, there needs to be
an exact match with any of the allowed values. For numerical parameters, the user
can decide via a flag whether values outside the tolerance should be accepted.

Args:
left_df: The data that serves as lookup reference.
right_df: The data that should be checked for matching rows in the left
dataframe.
parameters: List of baybe parameter objects that are needed to identify
potential tolerances.
numerical_measurements_must_be_within_tolerance: If ``True``, numerical
parameters are matched with the search space elements only if there is a
match within the parameter tolerance. If ``False``, the closest match is
considered, irrespective of the distance.

Returns:
The index of the matching rows in ``left_df``.

Raises:
ValueError: If some rows are present in the right but not in the left dataframe.
ValueError: If the input data has invalid values.

Note:
This function assumes that the dataframes contain only allowed values as
specified in the parameter objects. No further validation to assert this is
done.
"""
# Assert that all parameters appear in the given dataframe
if not all(col in right_df.columns for col in left_df.columns):
Expand All @@ -451,30 +498,9 @@ def fuzzy_row_match(
" in the left dataframe."
)

inds_matched = []

# Iterate over all input rows
inds_matched = []
for ind, row in right_df.iterrows():
# Check if the row represents a valid input
valid = True
for param in parameters:
if param.is_numerical:
if numerical_measurements_must_be_within_tolerance:
valid &= param.is_in_range(row[param.name])
else:
valid &= param.is_in_range(row[param.name])
if not valid:
raise ValueError(
f"Input data on row with the index {row.name} has invalid "
f"values in parameter '{param.name}'. "
f"For categorical parameters, values need to exactly match a "
f"valid choice defined in your config. "
f"For numerical parameters, a match is accepted only if "
f"the input value is within the specified tolerance/range. Set "
f"the flag 'numerical_measurements_must_be_within_tolerance' "
f"to 'False' to disable this behavior."
)

# Differentiate category-like and discrete numerical parameters
cat_cols = [p.name for p in parameters if not p.is_numerical]
num_cols = [p.name for p in parameters if (p.is_numerical and p.is_discrete)]
Expand Down
104 changes: 102 additions & 2 deletions baybe/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@
from __future__ import annotations

import math
from collections.abc import Callable
from typing import Any
from collections.abc import Callable, Iterable
from typing import TYPE_CHECKING, Any

import pandas as pd
from attrs import Attribute

if TYPE_CHECKING:
from baybe.parameters.base import Parameter
from baybe.targets.base import Target


def validate_not_nan(self: Any, attribute: Attribute, value: Any) -> None:
"""Attrs-compatible validator to forbid 'nan' values."""
Expand Down Expand Up @@ -68,3 +73,98 @@ def validator(self: Any, attribute: Attribute, value: Any) -> None:

non_inf_float = _make_restricted_float_validator(allow_nan=True, allow_inf=False)
"""Validator for non-infinite floats."""


def validate_target_input(data: pd.DataFrame, targets: Iterable[Target]) -> None:
    """Check the target columns of an input dataframe for admissible content.

    Args:
        data: The input dataframe whose target columns are inspected.
        targets: The targets whose columns are validated.

    Raises:
        ValueError: If the input dataframe has no rows.
        ValueError: If the column of any target contains missing values.
        TypeError: If the column of a numerical target has a non-numeric dtype.
        ValueError: If the column of a binary target contains entries other than
            the target's failure and success values.
    """
    # Imported locally rather than at module level
    from baybe.targets import BinaryTarget, NumericalTarget

    if len(data) == 0:
        raise ValueError("The provided input dataframe cannot be empty.")

    for t in targets:
        column = data[t.name]

        # Missing values are rejected for every kind of target
        if column.isna().any():
            raise ValueError(
                f"The target '{t.name}' has missing values in the provided dataframe."
            )

        # Numerical targets: only the dtype is inspected (int/uint/float/bool)
        if isinstance(t, NumericalTarget):
            if column.dtype.kind not in "iufb":
                raise TypeError(
                    f"The numerical target '{t.name}' has non-numeric entries in the "
                    f"provided dataframe."
                )
            continue

        # Any other non-binary target type needs no further checks
        if not isinstance(t, BinaryTarget):
            continue

        # Binary targets: every entry must be one of the two configured values
        allowed = {t.failure_value, t.success_value}
        invalid = set(column.unique()) - allowed
        if invalid:
            raise ValueError(
                f"The binary target '{t.name}' has invalid entries {invalid} "
                f"in the provided dataframe. Allowed values are: {allowed}."
            )


def validate_parameter_input(
    data: pd.DataFrame,
    parameters: Iterable[Parameter],
    numerical_measurements_must_be_within_tolerance: bool = False,
) -> None:
    """Validate input dataframe columns corresponding to parameters.

    Args:
        data: The input dataframe to be validated.
        parameters: The allowed parameters.
        numerical_measurements_must_be_within_tolerance: If ``True``, numerical
            parameter values must match to parameter values within the
            parameter-specific tolerance.

    Raises:
        ValueError: If the input dataframe is empty.
        ValueError: If a parameter contains NaN.
        TypeError: If a numerical parameter contains non-numeric values.
        ValueError: If a value is rejected by the ``is_in_range`` check of its
            parameter. Non-numerical parameters are always checked, numerical
            ones only if ``numerical_measurements_must_be_within_tolerance`` is
            ``True``.
    """
    if len(data) < 1:
        raise ValueError("The provided input dataframe cannot be empty.")

    for p in parameters:
        column = data[p.name]

        if column.isna().any():
            raise ValueError(
                f"The parameter '{p.name}' has missing values in the provided "
                f"dataframe."
            )
        if p.is_numerical and (column.dtype.kind not in "iufb"):
            raise TypeError(
                f"The numerical parameter '{p.name}' has non-numeric entries in the "
                f"provided dataframe."
            )

        # Numerical parameters are only range-checked on request, so there is no
        # need to walk over the rows at all when the flag is off
        if p.is_numerical and not numerical_measurements_must_be_within_tolerance:
            continue

        # Check if all rows have valid inputs matching allowed parameter values.
        # Only the column of this parameter is iterated: this avoids building a
        # full row object per parameter and keeps the column's own dtype.
        for ind, value in column.items():
            if not p.is_in_range(value):
                raise ValueError(
                    f"Input data on row with the index {ind} has invalid "
                    f"values in parameter '{p.name}'. "
                    f"For categorical parameters, values need to exactly match a "
                    f"valid choice defined in your config. "
                    f"For numerical parameters, a match is accepted only if "
                    f"the input value is within the specified tolerance/range. Set "
                    f"the flag 'numerical_measurements_must_be_within_tolerance' "
                    f"to 'False' to disable this behavior."
                )
Loading
Loading