emdgroup · Scienfitz · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025
@@ -43,6 +43,7 @@
 from baybe.utils.boolean import eq_dataframe
 from baybe.utils.dataframe import filter_df, fuzzy_row_match
 from baybe.utils.plotting import to_string
+from baybe.utils.validation import validate_parameter_input, validate_target_input
 
 if TYPE_CHECKING:
     from botorch.posteriors import Posterior
@@ -201,48 +202,24 @@ def add_measurements(
         Each addition of data is considered a new batch. Added results are checked for
         validity. Categorical values need to have an exact match. For numerical values,
         a campaign flag determines if values that lie outside a specified tolerance
-        are accepted.
-        Note that this modifies the provided data in-place.
+        are accepted. Possible validation exceptions are documented in
+        :func:`baybe.utils.validation.validate_target_input` and
+        :func:`baybe.utils.validation.validate_parameter_input`.
 
         Args:
             data: The data to be added (with filled values for targets). Preferably
                 created via :func:`baybe.campaign.Campaign.recommend`.
             numerical_measurements_must_be_within_tolerance: Flag indicating if
                 numerical parameters need to be within their tolerances.
-
-        Raises:
-            ValueError: If one of the targets has missing values or NaNs in the provided
-                dataframe.
-            TypeError: If the target has non-numeric entries in the provided dataframe.
         """
         # Invalidate recommendation cache first (in case of uncaught exceptions below)
         self._cached_recommendation = pd.DataFrame()
 
-        # Check if all targets have valid values
-        for target in self.targets:
-            if data[target.name].isna().any():
-                raise ValueError(
-                    f"The target '{target.name}' has missing values or NaNs in the "
-                    f"provided dataframe. Missing target values are not supported."
-                )
-            if data[target.name].dtype.kind not in "iufb":
-                raise TypeError(
-                    f"The target '{target.name}' has non-numeric entries in the "
-                    f"provided dataframe. Non-numeric target values are not supported."
-                )
-
-        # Check if all targets have valid values
-        for param in self.parameters:
-            if data[param.name].isna().any():
-                raise ValueError(
-                    f"The parameter '{param.name}' has missing values or NaNs in the "
-                    f"provided dataframe. Missing parameter values are not supported."
-                )
-            if param.is_numerical and (data[param.name].dtype.kind not in "iufb"):
-                raise TypeError(
-                    f"The numerical parameter '{param.name}' has non-numeric entries in"
-                    f" the provided dataframe."
-                )
+        # Validate target and parameter input values
+        validate_target_input(data, self.targets)
+        validate_parameter_input(
+            data, self.parameters, numerical_measurements_must_be_within_tolerance
+        )
 
         # Read in measurements and add them to the database
         self.n_batches_done += 1
@@ -257,10 +234,7 @@ def add_measurements(
         # Update metadata
         if self.searchspace.type in (SearchSpaceType.DISCRETE, SearchSpaceType.HYBRID):
             idxs_matched = fuzzy_row_match(
-                self.searchspace.discrete.exp_rep,
-                data,
-                self.parameters,
-                numerical_measurements_must_be_within_tolerance,
+                self.searchspace.discrete.exp_rep, data, self.parameters
             )
             self._searchspace_metadata.loc[idxs_matched, _MEASURED] = True
 

@@ -17,6 +17,7 @@
 from baybe.searchspace import SearchSpace
 from baybe.surrogates import CustomONNXSurrogate, GaussianProcessSurrogate
 from baybe.surrogates.base import IndependentGaussianSurrogate, SurrogateProtocol
+from baybe.utils.validation import validate_parameter_input
 
 
 @define
@@ -123,6 +124,9 @@ def recommend(
         if isinstance(self._surrogate_model, CustomONNXSurrogate):
             CustomONNXSurrogate.validate_compatibility(searchspace)
 
+        if pending_experiments is not None:
+            validate_parameter_input(pending_experiments, searchspace.parameters)
+
         self._setup_botorch_acqf(
             searchspace, objective, measurements, pending_experiments
         )

@@ -118,7 +118,7 @@ def simulate_experiment(
         campaign = deepcopy(campaign)
 
         # Add the initial data
-        if initial_data is not None:
+        if (initial_data is not None) and (len(initial_data) > 0):
             campaign.add_measurements(initial_data)
 
         # For impute_mode 'ignore', do not recommend space entries that are not

@@ -240,14 +240,7 @@ def telemetry_record_recommended_measurement_percentage(
     if is_enabled():
         if len(cached_recommendation) > 0:
             recommended_measurements_percentage = (
-                len(
-                    fuzzy_row_match(
-                        cached_recommendation,
-                        measurements,
-                        parameters,
-                        numerical_measurements_must_be_within_tolerance,
-                    )
-                )
+                len(fuzzy_row_match(cached_recommendation, measurements, parameters))
                 / len(cached_recommendation)
                 * 100.0
             )

@@ -4,7 +4,7 @@
 
 import functools
 import logging
-from collections.abc import Callable, Collection, Iterable, Sequence
+from collections.abc import Callable, Iterable, Sequence
 from typing import TYPE_CHECKING, Literal, TypeVar, overload
 
 import numpy as np
@@ -70,7 +70,7 @@ def to_tensor(*x: np.ndarray | pd.DataFrame) -> Tensor | tuple[Tensor, ...]:
 
 def add_fake_measurements(
     data: pd.DataFrame,
-    targets: Collection[Target],
+    targets: Iterable[Target],
     good_reference_values: dict[str, list] | None = None,
     good_intervals: dict[str, tuple[float, float]] | None = None,
     bad_intervals: dict[str, tuple[float, float]] | None = None,
@@ -278,6 +278,55 @@ def add_parameter_noise(
     return data
 
 
+def create_fake_input(
+    parameters: Iterable[Parameter],
+    targets: Iterable[Target],
+    n_rows: int = 1,
+    **kwargs: dict,
+) -> pd.DataFrame:
+    """Generate random input usable for :meth:`baybe.campaign.Campaign.add_measurements`.
+
+    One column is created per parameter: values of discrete parameters are drawn
+    (with replacement) from the parameter's ``values``, all other parameters are
+    sampled uniformly between their lower and upper bound. Target columns are then
+    added via :func:`baybe.utils.dataframe.add_fake_measurements`.
+
+    If noisy parameter values are desired, it is recommended to apply
+    :func:`baybe.utils.dataframe.add_parameter_noise` to the output of this function.
+
+    Args:
+        parameters: The parameters.
+        targets: The targets.
+        n_rows: Number of desired rows.
+        **kwargs: Additional arguments to be passed to
+            :func:`baybe.utils.dataframe.add_fake_measurements`.
+
+    Returns:
+        Dataframe corresponding to fake measurement input.
+
+    Raises:
+        ValueError: If less than one row was requested.
+    """
+    # Guard clause: refuse to build a dataframe without rows
+    if n_rows < 1:
+        raise ValueError(
+            f"'{create_fake_input.__name__}' must at least create one row, but the "
+            f"requested number was: {n_rows}."
+        )
+
+    def _draw(param: Parameter) -> np.ndarray:
+        """Sample ``n_rows`` random values for a single parameter."""
+        if param.is_discrete:
+            return np.random.choice(param.values, n_rows, replace=True)
+        return np.random.uniform(param.bounds.lower, param.bounds.upper, n_rows)
+
+    # One column of random values per parameter, in the given parameter order
+    columns = {param.name: _draw(param) for param in parameters}
+    fake = pd.DataFrame.from_dict(columns)
+
+    # The return value is not used, so the target columns are presumably added to
+    # ``fake`` in place by the helper
+    add_fake_measurements(fake, targets, **kwargs)
+
+    return fake
+
+
 def df_drop_single_value_columns(
     df: pd.DataFrame, lst_exclude: list = None
 ) -> pd.DataFrame:
@@ -416,33 +465,31 @@ def fuzzy_row_match(
     left_df: pd.DataFrame,
     right_df: pd.DataFrame,
     parameters: Sequence[Parameter],
-    numerical_measurements_must_be_within_tolerance: bool,
 ) -> pd.Index:
     """Match row of the right dataframe to the rows of the left dataframe.
 
-    This is useful for validity checks and to automatically match measurements to
-    entries in the search space, e.g. to detect which ones have been measured.
-    For categorical parameters, there needs to be an exact match with any of the
-    allowed values. For numerical parameters, the user can decide via a flag
-    whether values outside the tolerance should be accepted.
+    This is useful for matching measurements to entries in the search space, e.g. to
+    detect which ones have been measured. For categorical parameters, there needs to be
+    an exact match with any of the allowed values. Numerical tolerance checks are not
+    performed here; see :func:`baybe.utils.validation.validate_parameter_input`.
 
     Args:
         left_df: The data that serves as lookup reference.
         right_df: The data that should be checked for matching rows in the left
             dataframe.
         parameters: List of baybe parameter objects that are needed to identify
             potential tolerances.
-        numerical_measurements_must_be_within_tolerance: If ``True``, numerical
-            parameters are matched with the search space elements only if there is a
-            match within the parameter tolerance. If ``False``, the closest match is
-            considered, irrespective of the distance.
 
     Returns:
         The index of the matching rows in ``left_df``.
 
     Raises:
         ValueError: If some rows are present in the right but not in the left dataframe.
-        ValueError: If the input data has invalid values.
+
+    Note:
+        This function assumes that the dataframes contain only allowed values as
+        specified in the parameter objects. No further validation to assert this is
+        done.
     """
     # Assert that all parameters appear in the given dataframe
     if not all(col in right_df.columns for col in left_df.columns):
@@ -451,30 +498,9 @@ def fuzzy_row_match(
             " in the left dataframe."
         )
 
-    inds_matched = []
-
     # Iterate over all input rows
+    inds_matched = []
     for ind, row in right_df.iterrows():
-        # Check if the row represents a valid input
-        valid = True
-        for param in parameters:
-            if param.is_numerical:
-                if numerical_measurements_must_be_within_tolerance:
-                    valid &= param.is_in_range(row[param.name])
-            else:
-                valid &= param.is_in_range(row[param.name])
-            if not valid:
-                raise ValueError(
-                    f"Input data on row with the index {row.name} has invalid "
-                    f"values in parameter '{param.name}'. "
-                    f"For categorical parameters, values need to exactly match a "
-                    f"valid choice defined in your config. "
-                    f"For numerical parameters, a match is accepted only if "
-                    f"the input value is within the specified tolerance/range. Set "
-                    f"the flag 'numerical_measurements_must_be_within_tolerance' "
-                    f"to 'False' to disable this behavior."
-                )
-
         # Differentiate category-like and discrete numerical parameters
         cat_cols = [p.name for p in parameters if not p.is_numerical]
         num_cols = [p.name for p in parameters if (p.is_numerical and p.is_discrete)]

@@ -3,11 +3,16 @@
 from __future__ import annotations
 
 import math
-from collections.abc import Callable
-from typing import Any
+from collections.abc import Callable, Iterable
+from typing import TYPE_CHECKING, Any
 
+import pandas as pd
 from attrs import Attribute
 
+if TYPE_CHECKING:
+    from baybe.parameters.base import Parameter
+    from baybe.targets.base import Target
+
 
 def validate_not_nan(self: Any, attribute: Attribute, value: Any) -> None:
     """Attrs-compatible validator to forbid 'nan' values."""
@@ -68,3 +73,98 @@ def validator(self: Any, attribute: Attribute, value: Any) -> None:
 
 non_inf_float = _make_restricted_float_validator(allow_nan=True, allow_inf=False)
 """Validator for non-infinite floats."""
+
+
+def validate_target_input(data: pd.DataFrame, targets: Iterable[Target]) -> None:
+    """Check the target columns of an input dataframe for validity.
+
+    Every target column is checked for missing values. Columns of numerical targets
+    must additionally have a numeric dtype, and columns of binary targets may only
+    contain the target's failure and success values.
+
+    Args:
+        data: The input dataframe to be validated.
+        targets: The allowed targets.
+
+    Raises:
+        ValueError: If the input dataframe is empty.
+        ValueError: If any target data contain NaN.
+        TypeError: If any numerical target data contain non-numeric values.
+        ValueError: If any binary target data contain values not part of the targets'
+            allowed values.
+    """
+    from baybe.targets import BinaryTarget, NumericalTarget
+
+    if len(data) == 0:
+        raise ValueError("The provided input dataframe cannot be empty.")
+
+    for target in targets:
+        column = data[target.name]
+
+        # Missing entries are rejected regardless of the target type
+        if column.isna().any():
+            raise ValueError(
+                f"The target '{target.name}' has missing values in the provided "
+                f"dataframe."
+            )
+
+        if isinstance(target, NumericalTarget):
+            # Accepted dtype kinds: signed/unsigned integer, float, boolean
+            if column.dtype.kind not in "iufb":
+                raise TypeError(
+                    f"The numerical target '{target.name}' has non-numeric entries "
+                    f"in the provided dataframe."
+                )
+            continue
+
+        if isinstance(target, BinaryTarget):
+            permitted = {target.failure_value, target.success_value}
+            unexpected = set(column.unique()) - permitted
+            if unexpected:
+                raise ValueError(
+                    f"The binary target '{target.name}' has invalid entries "
+                    f"{unexpected} in the provided dataframe. Allowed values are: "
+                    f"{permitted}."
+                )
+
+
+def validate_parameter_input(
+    data: pd.DataFrame,
+    parameters: Iterable[Parameter],
+    numerical_measurements_must_be_within_tolerance: bool = False,
+) -> None:
+    """Validate input dataframe columns corresponding to parameters.
+
+    Args:
+        data: The input dataframe to be validated.
+        parameters: The allowed parameters.
+        numerical_measurements_must_be_within_tolerance: If ``True``, numerical
+            parameter values must match to parameter values within the
+            parameter-specific tolerance.
+
+    Raises:
+        ValueError: If the input dataframe is empty.
+        ValueError: If a parameter contains NaN.
+        TypeError: If a numerical parameter contains non-numeric values.
+        ValueError: If a value of a non-numerical parameter fails the parameter's
+            ``is_in_range`` check, or if a value of a numerical parameter fails it
+            while ``numerical_measurements_must_be_within_tolerance`` is ``True``.
+    """
+    if len(data) < 1:
+        raise ValueError("The provided input dataframe cannot be empty.")
+
+    for p in parameters:
+        column = data[p.name]
+
+        if column.isna().any():
+            raise ValueError(
+                f"The parameter '{p.name}' has missing values in the provided "
+                f"dataframe."
+            )
+        if p.is_numerical and (column.dtype.kind not in "iufb"):
+            raise TypeError(
+                f"The numerical parameter '{p.name}' has non-numeric entries in the "
+                f"provided dataframe."
+            )
+
+        # Numerical values are only range-checked on request, so there is nothing
+        # left to verify for this parameter
+        if p.is_numerical and not numerical_measurements_must_be_within_tolerance:
+            continue
+
+        # Walk over the parameter's own column instead of over full rows: building
+        # a row object coerces all of its entries to a common dtype, which could
+        # alter the value handed to the range check (and is needlessly slow)
+        for ind, value in column.items():
+            if not p.is_in_range(value):
+                raise ValueError(
+                    f"Input data on row with the index {ind} has invalid "
+                    f"values in parameter '{p.name}'. "
+                    f"For categorical parameters, values need to exactly match a "
+                    f"valid choice defined in your config. "
+                    f"For numerical parameters, a match is accepted only if "
+                    f"the input value is within the specified tolerance/range. Set "
+                    f"the flag 'numerical_measurements_must_be_within_tolerance' "
+                    f"to 'False' to disable this behavior."
+                )