From 2db906927e7a9be711d2d91f53b08c45352e7e0b Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 Jan 2025 17:44:29 +0100 Subject: [PATCH] feat: transform multiple columns of `Table` at once (#982) ### Summary of Changes * Rename `transform_column` to `transform_columns` * Rename parameter `name` to `selector` * `selector` can now be a list of column names * `transformer` can now optionally have a second parameter to receive the entire row --- benchmarks/table/row_operations.py | 6 +- docs/tutorials/data_processing.ipynb | 4 +- .../_check_column_has_no_missing_values.py | 2 + .../_check_columns_exist_module.py | 16 +-- .../_validation/_check_indices_module.py | 2 + src/safeds/data/tabular/containers/_table.py | 75 +++++++++--- .../_table/test_transform_column.py | 69 ----------- .../_table/test_transform_columns.py | 111 ++++++++++++++++++ 8 files changed, 186 insertions(+), 99 deletions(-) delete mode 100644 tests/safeds/data/tabular/containers/_table/test_transform_column.py create mode 100644 tests/safeds/data/tabular/containers/_table/test_transform_columns.py diff --git a/benchmarks/table/row_operations.py b/benchmarks/table/row_operations.py index 376d909f4..c040e1dcb 100644 --- a/benchmarks/table/row_operations.py +++ b/benchmarks/table/row_operations.py @@ -50,8 +50,8 @@ def _run_split_rows() -> None: table_2._lazy_frame.collect() -def _run_transform_column() -> None: - table.transform_column("column_0", lambda value: value * 2)._lazy_frame.collect() +def _run_transform_columns() -> None: + table.transform_columns("column_0", lambda value: value * 2)._lazy_frame.collect() if __name__ == "__main__": @@ -101,7 +101,7 @@ def _run_transform_column() -> None: number=REPETITIONS, ), "transform_column": timeit( - _run_transform_column, + _run_transform_columns, number=REPETITIONS, ), } diff --git a/docs/tutorials/data_processing.ipynb b/docs/tutorials/data_processing.ipynb index e52b5c721..a49e73663 100644 --- a/docs/tutorials/data_processing.ipynb +++ b/docs/tutorials/data_processing.ipynb @@ -886,9 +886,7 @@ "output_type": "execute_result" } ], - "source": [ - "titanic_slice.transform_column(\"parents_children\", lambda cell: cell > 0)" - ] + "source": "titanic_slice.transform_columns(\"parents_children\", lambda cell: cell > 0)" } ], "metadata": { diff --git a/src/safeds/_validation/_check_column_has_no_missing_values.py b/src/safeds/_validation/_check_column_has_no_missing_values.py index c78b3bf42..90b3989db 100644 --- a/src/safeds/_validation/_check_column_has_no_missing_values.py +++ b/src/safeds/_validation/_check_column_has_no_missing_values.py @@ -1,3 +1,5 @@ +"""The module name must differ from the function name, so it can be re-exported properly with apipkg.""" + from __future__ import annotations from typing import TYPE_CHECKING diff --git a/src/safeds/_validation/_check_columns_exist_module.py b/src/safeds/_validation/_check_columns_exist_module.py index fe765f1b3..7e1724921 100644 --- a/src/safeds/_validation/_check_columns_exist_module.py +++ b/src/safeds/_validation/_check_columns_exist_module.py @@ -13,16 +13,16 @@ from safeds.data.tabular.typing import Schema -def _check_columns_exist(table_or_schema: Table | Schema, requested_names: str | list[str]) -> None: +def _check_columns_exist(table_or_schema: Table | Schema, selector: str | list[str]) -> None: """ - Check whether the specified column names exist, and raise an error if they do not. + Check whether the specified columns exist, and raise an error if they do not. Parameters ---------- table_or_schema: The table or schema to check. - requested_names: - The column names to check. + selector: + The columns to check. Raises ------ @@ -33,16 +33,16 @@ def _check_columns_exist(table_or_schema: Table | Schema, requested_names: str | if isinstance(table_or_schema, Table): table_or_schema = table_or_schema.schema - if isinstance(requested_names, str): - requested_names = [requested_names] + if isinstance(selector, str): + selector = [selector] - if len(requested_names) > 1: + if len(selector) > 1: # Create a set for faster containment checks known_names: Container = set(table_or_schema.column_names) else: known_names = table_or_schema.column_names - unknown_names = [name for name in requested_names if name not in known_names] + unknown_names = [name for name in selector if name not in known_names] if unknown_names: message = _build_error_message(table_or_schema, unknown_names) raise ColumnNotFoundError(message) from None diff --git a/src/safeds/_validation/_check_indices_module.py b/src/safeds/_validation/_check_indices_module.py index d3efcedf1..0dd27141b 100644 --- a/src/safeds/_validation/_check_indices_module.py +++ b/src/safeds/_validation/_check_indices_module.py @@ -1,3 +1,5 @@ +"""The module name must differ from the function name, so it can be re-exported properly with apipkg.""" + from __future__ import annotations from typing import TYPE_CHECKING diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 32afbecea..17ff76d2d 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -596,8 +596,8 @@ def add_computed_column( - [add_columns][safeds.data.tabular.containers._table.Table.add_columns]: Add column objects to the table. - [add_index_column][safeds.data.tabular.containers._table.Table.add_index_column] - - [transform_column][safeds.data.tabular.containers._table.Table.transform_column]: - Transform an existing column with a custom function. + - [transform_columns][safeds.data.tabular.containers._table.Table.transform_columns]: + Transform existing columns with a custom function. """ _check_columns_dont_exist(self, name) @@ -1179,22 +1179,23 @@ def select_columns( self._lazy_frame.select(selector), ) - def transform_column( + def transform_columns( self, - name: str, - transformer: Callable[[Cell], Cell], + selector: str | list[str], + transformer: Callable[[Cell], Cell] | Callable[[Cell, Row], Cell], ) -> Table: """ - Transform a column with a custom function and return the result as a new table. + Transform columns with a custom function and return the result as a new table. **Note:** The original table is not modified. Parameters ---------- - name: - The name of the column to transform. + selector: + The names of the columns to transform. transformer: - The function that computes the new values of the column. + The function that computes the new values. It may take either a single cell or a cell and the entire row as + arguments (see examples). Returns ------- @@ -1210,7 +1211,7 @@ def transform_column( -------- >>> from safeds.data.tabular.containers import Table >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.transform_column("a", lambda cell: cell + 1) + >>> table.transform_columns("a", lambda cell: cell + 1) +-----+-----+ | a | b | | --- | --- | @@ -1221,6 +1222,28 @@ def transform_column( | 4 | 6 | +-----+-----+ + >>> table.transform_columns(["a", "b"], lambda cell: cell + 1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + | 3 | 6 | + | 4 | 7 | + +-----+-----+ + + >>> table.transform_columns("a", lambda cell, row: cell + row["b"]) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 5 | 4 | + | 7 | 5 | + | 9 | 6 | + +-----+-----+ + Related ------- - [add_computed_column][safeds.data.tabular.containers._table.Table.add_computed_column]: @@ -1228,14 +1251,34 @@ def transform_column( - [transform_table][safeds.data.tabular.containers._table.Table.transform_table]: Transform the entire table with a fitted transformer. """ - _check_columns_exist(self, name) - import polars as pl - expression = transformer(_LazyCell(pl.col(name))) + _check_columns_exist(self, selector) + + if isinstance(selector, str): + selector = [selector] + + parameter_count = transformer.__code__.co_argcount + if parameter_count == 1: + # Transformer only takes a cell + expressions = [ + transformer( # type: ignore[call-arg] + _LazyCell(pl.col(name)), + )._polars_expression.alias(name) + for name in selector + ] + else: + # Transformer takes a cell and the entire row + expressions = [ + transformer( # type: ignore[call-arg] + _LazyCell(pl.col(name)), + _LazyVectorizedRow(self), + )._polars_expression.alias(name) + for name in selector + ] return Table._from_polars_lazy_frame( - self._lazy_frame.with_columns(expression._polars_expression.alias(name)), + self._lazy_frame.with_columns(*expressions), ) # ------------------------------------------------------------------------------------------------------------------ @@ -2384,8 +2427,8 @@ def transform_table(self, fitted_transformer: TableTransformer) -> Table: ------- - [inverse_transform_table][safeds.data.tabular.containers._table.Table.inverse_transform_table]: Inverse-transform the table with a fitted, invertible transformer. - - [transform_column][safeds.data.tabular.containers._table.Table.transform_column]: - Transform a single column with a custom function. + - [transform_columns][safeds.data.tabular.containers._table.Table.transform_columns]: + Transform columns with a custom function. """ return fitted_transformer.transform(self) diff --git a/tests/safeds/data/tabular/containers/_table/test_transform_column.py b/tests/safeds/data/tabular/containers/_table/test_transform_column.py deleted file mode 100644 index 3391a44c0..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_transform_column.py +++ /dev/null @@ -1,69 +0,0 @@ -from collections.abc import Callable - -import pytest - -from safeds.data.tabular.containers import Cell, Table -from safeds.exceptions import ColumnNotFoundError - - -@pytest.mark.parametrize( - ("table_factory", "name", "transformer", "expected"), - [ - ( - lambda: Table({"col1": []}), - "col1", - lambda _: Cell.from_literal(None), - Table({"col1": []}), - ), - ( - lambda: Table({"col1": []}), - "col1", - lambda cell: 2 * cell, - Table({"col1": []}), - ), - ( - lambda: Table({"col1": [1, 2, 3]}), - "col1", - lambda _: Cell.from_literal(None), - Table({"col1": [None, None, None]}), - ), - ( - lambda: Table({"col1": [1, 2, 3]}), - "col1", - lambda cell: 2 * cell, - Table({"col1": [2, 4, 6]}), - ), - ], - ids=[ - "no rows (constant value)", - "no rows (computed value)", - "non-empty (constant value)", - "non-empty (computed value)", - ], -) -class TestHappyPath: - def test_should_transform_column( - self, - table_factory: Callable[[], Table], - name: str, - transformer: Callable[[Cell], Cell], - expected: Table, - ) -> None: - actual = table_factory().transform_column(name, transformer) - assert actual == expected - - def test_should_not_mutate_receiver( - self, - table_factory: Callable[[], Table], - name: str, - transformer: Callable[[Cell], Cell], - expected: Table, # noqa: ARG002 - ) -> None: - original = table_factory() - original.transform_column(name, transformer) - assert original == table_factory() - - -def test_should_raise_if_column_not_found() -> None: - with pytest.raises(ColumnNotFoundError): - Table({}).transform_column("col1", lambda cell: cell * 2) diff --git a/tests/safeds/data/tabular/containers/_table/test_transform_columns.py b/tests/safeds/data/tabular/containers/_table/test_transform_columns.py new file mode 100644 index 000000000..ecd079750 --- /dev/null +++ b/tests/safeds/data/tabular/containers/_table/test_transform_columns.py @@ -0,0 +1,111 @@ +from collections.abc import Callable + +import pytest + +from safeds.data.tabular.containers import Cell, Row, Table +from safeds.exceptions import ColumnNotFoundError + + +@pytest.mark.parametrize( + ("table_factory", "selector", "transformer", "expected"), + [ + # no rows (constant value) + ( + lambda: Table({"col1": []}), + "col1", + lambda _: Cell.from_literal(None), + Table({"col1": []}), + ), + # no rows (computed value) + ( + lambda: Table({"col1": []}), + "col1", + lambda cell: 2 * cell, + Table({"col1": []}), + ), + # non-empty (constant value) + ( + lambda: Table({"col1": [1, 2]}), + "col1", + lambda _: Cell.from_literal(None), + Table({"col1": [None, None]}), + ), + # non-empty (computed value) + ( + lambda: Table({"col1": [1, 2]}), + "col1", + lambda cell: 2 * cell, + Table({"col1": [2, 4]}), + ), + # multiple columns transformed (constant value) + ( + lambda: Table({"col1": [1, 2], "col2": [3, 4]}), + ["col1", "col2"], + lambda _: Cell.from_literal(None), + Table({"col1": [None, None], "col2": [None, None]}), + ), + # multiple columns transformed (computed value) + ( + lambda: Table({"col1": [1, 2], "col2": [3, 4]}), + ["col1", "col2"], + lambda cell: 2 * cell, + Table({"col1": [2, 4], "col2": [6, 8]}), + ), + # lambda takes row parameter + ( + lambda: Table({"col1": [1, 2], "col2": [3, 4]}), + "col1", + lambda cell, row: 2 * cell + row["col2"], + Table({"col1": [5, 8], "col2": [3, 4]}), + ), + ], + ids=[ + "no rows (constant value)", + "no rows (computed value)", + "non-empty (constant value)", + "non-empty (computed value)", + "multiple columns transformed (constant value)", + "multiple columns transformed (computed value)", + "lambda takes row parameter", + ], +) +class TestHappyPath: + def test_should_transform_columns( + self, + table_factory: Callable[[], Table], + selector: str, + transformer: Callable[[Cell], Cell] | Callable[[Cell, Row], Cell], + expected: Table, + ) -> None: + actual = table_factory().transform_columns(selector, transformer) + assert actual == expected + + def test_should_not_mutate_receiver( + self, + table_factory: Callable[[], Table], + selector: str, + transformer: Callable[[Cell], Cell] | Callable[[Cell, Row], Cell], + expected: Table, # noqa: ARG002 + ) -> None: + original = table_factory() + original.transform_columns(selector, transformer) + assert original == table_factory() + + +@pytest.mark.parametrize( + ("table", "selector"), + [ + (Table({"col1": [1, 2]}), "col2"), + (Table({"col1": [1, 2]}), ["col1", "col2"]), + ], + ids=[ + "one column name", + "multiple column names", + ], +) +def test_should_raise_if_column_not_found( + table: Table, + selector: str, +) -> None: + with pytest.raises(ColumnNotFoundError): + table.transform_columns(selector, lambda cell: cell * 2)