Skip to content

Commit

Permalink
feat: polars implementation of table (#744)
Browse files Browse the repository at this point in the history
Closes #638
Closes #641
Closes #649
Closes #712

### Summary of Changes

Implement our table using polars as the backend.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
  • Loading branch information
lars-reimann and megalinter-bot authored May 9, 2024
1 parent 0564b52 commit fc49895
Show file tree
Hide file tree
Showing 63 changed files with 5,011 additions and 465 deletions.
50 changes: 50 additions & 0 deletions benchmarks/table/column_operations_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from timeit import timeit

from safeds.data.tabular.containers import ExperimentalTable

from benchmarks.table.utils import create_synthetic_table_polars

REPETITIONS = 10


def _run_remove_columns_with_missing_values() -> None:
    """Call `remove_columns_with_missing_values` on the global table and collect the resulting lazy frame."""
    result = table.remove_columns_with_missing_values()
    # Collect the lazy frame so the timed call includes evaluation of the query.
    result._lazy_frame.collect()


def _run_remove_non_numeric_columns() -> None:
    """Call `remove_non_numeric_columns` on the global table and collect the resulting lazy frame."""
    result = table.remove_non_numeric_columns()
    # Collect the lazy frame so the timed call includes evaluation of the query.
    result._lazy_frame.collect()


def _run_summarize_statistics() -> None:
    """Call `summarize_statistics` on the global table and collect the resulting lazy frame."""
    summary = table.summarize_statistics()
    # Collect the lazy frame so the timed call includes evaluation of the query.
    summary._lazy_frame.collect()


if __name__ == "__main__":
    # The benchmark functions above read this table as a module-level global.
    table = create_synthetic_table_polars(100, 5000)

    # Map each reported method name to the function that exercises it.
    # Insertion order determines both execution order and output order.
    benchmarks = {
        "remove_columns_with_missing_values": _run_remove_columns_with_missing_values,
        "remove_non_numeric_columns": _run_remove_non_numeric_columns,
        "summarize_statistics": _run_summarize_statistics,
    }

    # Time every benchmark, running each one REPETITIONS times.
    timings: dict[str, float] = {
        name: timeit(benchmark, number=REPETITIONS) for name, benchmark in benchmarks.items()
    }

    # Print the timings as a two-column table.
    report = ExperimentalTable(
        {
            "method": list(timings.keys()),
            "timing": list(timings.values()),
        },
    )
    print(report)
41 changes: 27 additions & 14 deletions benchmarks/table/row_operations_polars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from timeit import timeit

from safeds.data.tabular.containers import Table
import polars as pl

from safeds.data.tabular.containers import ExperimentalTable

from benchmarks.table.utils import create_synthetic_table_polars

Expand All @@ -15,14 +17,18 @@ def _run_remove_rows_with_missing_values() -> None:
table.remove_rows_with_missing_values()._lazy_frame.collect()


# def _run_remove_rows_with_outliers() -> None:
# table.remove_rows_with_outliers()
def _run_remove_rows_with_outliers() -> None:
    """
    Call `remove_rows_with_outliers` on the global table and collect the resulting lazy frame.

    The sibling benchmarks in this file all collect `_lazy_frame` so that deferred work is part of the measured
    time. This one did not, which made its timing incomparable with the others.
    """
    # NOTE(review): assumes the returned table exposes `_lazy_frame` like the results of the other operations.
    table.remove_rows_with_outliers()._lazy_frame.collect()


def _run_remove_rows() -> None:
    """Call `remove_rows` with a row predicate testing whether "column_0" is even, then collect the lazy frame."""
    filtered = table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)
    # Collect the lazy frame so the timed call includes evaluation of the query.
    filtered._lazy_frame.collect()


def _run_remove_rows_by_column() -> None:
    """Call `remove_rows_by_column` on "column_0" with a cell predicate testing for even values, then collect."""
    filtered = table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)
    # Collect the lazy frame so the timed call includes evaluation of the query.
    filtered._lazy_frame.collect()


def _run_shuffle_rows() -> None:
    """Call `shuffle_rows` on the global table and collect the resulting lazy frame."""
    shuffled = table.shuffle_rows()
    # Collect the lazy frame so the timed call includes evaluation of the query.
    shuffled._lazy_frame.collect()

Expand Down Expand Up @@ -63,14 +69,18 @@ def _run_transform_column() -> None:
_run_remove_rows_with_missing_values,
number=REPETITIONS,
),
# "remove_rows_with_outliers": timeit(
# _run_remove_rows_with_outliers,
# number=REPETITIONS,
# ),
"remove_rows_with_outliers": timeit(
_run_remove_rows_with_outliers,
number=REPETITIONS,
),
"remove_rows": timeit(
_run_remove_rows,
number=REPETITIONS,
),
"remove_rows_by_column": timeit(
_run_remove_rows_by_column,
number=REPETITIONS,
),
"shuffle_rows": timeit(
_run_shuffle_rows,
number=REPETITIONS,
Expand Down Expand Up @@ -98,11 +108,14 @@ def _run_transform_column() -> None:
}

# Print the timings
print(
Table(
{
"method": list(timings.keys()),
"timing": list(timings.values()),
}
with pl.Config(
tbl_rows=-1,
):
print(
ExperimentalTable(
{
"method": list(timings.keys()),
"timing": list(timings.values()),
}
)
)
)
3 changes: 2 additions & 1 deletion benchmarks/table/utils/create_synthetic_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ def create_synthetic_table(
min_value: int = 0,
max_value: int = 1000,
) -> Table:
"""Create a synthetic Table with random numerical data.
"""
Create a synthetic Table with random numerical data.
Parameters
----------
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/table/utils/create_synthetic_table_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ def create_synthetic_table_polars(
min_value: int = 0,
max_value: int = 1000,
) -> ExperimentalTable:
"""Create a synthetic Table with random numerical data.
"""
Create a synthetic Table with random numerical data.
Parameters
----------
Expand Down
16 changes: 8 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ matplotlib = "^3.6.3"
openpyxl = "^3.1.2"
pandas = "^2.0.0"
pillow = ">=9.5,<11.0"
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.24"}
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.25"}
scikit-learn = "^1.2.0"
seaborn = "^0.13.0"
statsmodels = "^0.14.1"
Expand Down
6 changes: 6 additions & 0 deletions src/resources/from_json_file_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
Binary file added src/resources/from_parquet_file.parquet
Binary file not shown.
Binary file modified src/resources/to_excel_file.xlsx
Binary file not shown.
6 changes: 6 additions & 0 deletions src/resources/to_json_file_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
Binary file added src/resources/to_parquet_file.parquet
Binary file not shown.
8 changes: 5 additions & 3 deletions src/safeds/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
import apipkg

if TYPE_CHECKING:
from ._device import _get_device, _init_default_device
from ._torch import _get_device, _init_default_device, _set_default_device

apipkg.initpkg(
__name__,
{
"_get_device": "._device:_get_device",
"_init_default_device": "._device:_init_default_device",
"_get_device": "._torch:_get_device",
"_init_default_device": "._torch:_init_default_device",
"_set_default_device": "._torch:_set_default_device",
},
)

__all__ = [
"_get_device",
"_init_default_device",
"_set_default_device",
]
17 changes: 17 additions & 0 deletions src/safeds/_config/_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
import polars as pl


def _get_polars_config() -> pl.Config:
    """
    Build a `polars.Config` with this package's display options.

    The options set are: a float precision of 5, right-aligned numeric cells, the "ASCII_FULL_CONDENSED" table
    formatting, and a hidden dataframe shape.

    Returns
    -------
    config:
        The configured `polars.Config` object.
    """
    # Imported here rather than at module level; the module-level import is only for type checking.
    import polars as pl

    config = pl.Config(
        float_precision=5,
        tbl_cell_numeric_alignment="RIGHT",
        tbl_formatting="ASCII_FULL_CONDENSED",
        tbl_hide_dataframe_shape=True,
    )
    return config
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def _get_device() -> Device:
def _init_default_device() -> None:
import torch

global _default_device
global _default_device # noqa: PLW0603

if _default_device is None:
_default_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
Expand All @@ -28,7 +28,7 @@ def _init_default_device() -> None:

def _set_default_device(device: Device) -> None:
# This changes all future tensors, but not any tensor that already exists
global _default_device
global _default_device # noqa: PLW0603

_default_device = device
_init_default_device()
6 changes: 6 additions & 0 deletions src/safeds/_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@
if TYPE_CHECKING:
from ._file_io import _check_and_normalize_file_path
from ._hashing import _structural_hash
from ._plotting import _figure_to_image
from ._random import _get_random_seed

apipkg.initpkg(
__name__,
{
"_check_and_normalize_file_path": "._file_io:_check_and_normalize_file_path",
"_structural_hash": "._hashing:_structural_hash",
"_figure_to_image": "._plotting:_figure_to_image",
"_get_random_seed": "._random:_get_random_seed",
},
)

__all__ = [
"_check_and_normalize_file_path",
"_structural_hash",
"_figure_to_image",
"_get_random_seed",
]
32 changes: 32 additions & 0 deletions src/safeds/_utils/_plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import annotations

import io
from typing import TYPE_CHECKING

from safeds.data.image.containers import Image

if TYPE_CHECKING:
import matplotlib.pyplot as plt


def _figure_to_image(figure: plt.Figure) -> Image:
    """
    Store the figure as an image and close it.

    The figure is closed even if rendering it fails, so a failed call does not leave the figure open.

    Parameters
    ----------
    figure:
        The figure to store.

    Returns
    -------
    image:
        The figure as an image.
    """
    import matplotlib.pyplot as plt

    buffer = io.BytesIO()
    try:
        figure.savefig(buffer, format="png")
    finally:
        # Always close the figure: this prevents it from being displayed directly and
        # releases it even when `savefig` raises.
        plt.close(figure)

    # `getvalue` returns the whole buffer regardless of the current stream position.
    return Image.from_bytes(buffer.getvalue())
3 changes: 3 additions & 0 deletions src/safeds/data/labeled/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,23 @@
import apipkg

if TYPE_CHECKING:
from ._experimental_tabular_dataset import ExperimentalTabularDataset
from ._image_dataset import ImageDataset
from ._tabular_dataset import TabularDataset
from ._time_series_dataset import TimeSeriesDataset

apipkg.initpkg(
__name__,
{
"ExperimentalTabularDataset": "._experimental_tabular_dataset:ExperimentalTabularDataset",
"ImageDataset": "._image_dataset:ImageDataset",
"TabularDataset": "._tabular_dataset:TabularDataset",
"TimeSeriesDataset": "._time_series_dataset:TimeSeriesDataset",
},
)

__all__ = [
"ExperimentalTabularDataset",
"ImageDataset",
"TabularDataset",
"TimeSeriesDataset",
Expand Down
Loading

0 comments on commit fc49895

Please sign in to comment.